In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/steam-reviews-2021/steam_reviews.csv


In [2]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [3]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time
import csv

vader_analyzer = SentimentIntensityAnalyzer()

chunksize = 1000000
sentiment_file_path = 'english_sentiment.csv'
file_path = '/kaggle/input/steam-reviews-2021/steam_reviews.csv'

# sentiment analysis using VADER
def vader_sentiment_analysis(text):
    if pd.isna(text):
        return 'neutral'
    score = vader_analyzer.polarity_scores(text)['compound']
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Start the timer
start_time = time.time()

# Process the file in chunks
for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8', 
                                      quoting=csv.QUOTE_MINIMAL, escapechar='\\', 
                                      on_bad_lines='skip', low_memory=False, dtype=str)):

    chunk = chunk[chunk['language'] == 'english'].copy()
    chunk['sentiment'] = chunk['review'].apply(lambda x: vader_sentiment_analysis(x))
    sentiment_chunk = chunk[['review_id', 'review', 'sentiment']]

    # Log 
    elapsed_time = time.time() - start_time
    print(f"\n--- Chunk {i + 1} processed ---")
    print(f"Time taken for this chunk: {elapsed_time:.2f} seconds")
    print(f"Sample of sentiment data:\n{sentiment_chunk.head()}\n")

    # Overwrite original file
    mode = 'w' if i == 0 else 'a' # w: "write" for the first chunk, a "append" for the next
    header = i == 0
    sentiment_chunk[['review_id','review', 'sentiment']].to_csv(sentiment_file_path, mode=mode, header=header, index=False)

# Calculate total time taken
total_time = time.time() - start_time
print(f"\n=== Total time taken for processing all chunks: {total_time:.2f} seconds ===")


--- Chunk 1 processed ---
Time taken for this chunk: 286.05 seconds
Sample of sentiment data:
   review_id                                             review sentiment
3   85184605  One of the best RPG's of all time, worthy of a...  positive
5   85184171             good story, good graphics. lots to do.  positive
6   85184064                                           dis gud,   neutral
18  85180436  favorite game of all time cant wait for the Ne...  positive
20  85179753                          Why wouldn't you get this   neutral


--- Chunk 2 processed ---
Time taken for this chunk: 551.36 seconds
Sample of sentiment data:
        review_id                          review sentiment
1000000  71818717                             fun  positive
1000002  71818514                         plese\n   neutral
1000005  71818242                       very good  positive
1000006  71818234                       good game  positive
1000007  71818218  Hm.. It's a good game I think,  positive


---