# Reddit Comment Sentiment Preprocessing
## Quick Overview
This notebook preprocesses a dataset of Reddit comments for sentiment analysis. The focus is on cleaning and preparing the data for sentiment scoring.

## Steps
- Data Loading: Efficiently load Reddit comments in chunks.
- Text Cleaning: Remove irrelevant text components.
- Sentiment Analysis: Apply NLTK's SentimentIntensityAnalyzer to every comment.
- Batch Processing: Handle data in manageable batches for system efficiency.
- Weighted Sentiment: Calculate a weighted sentiment score by multiplying each sentiment score by the comment's number of upvotes (its `score`). If the number of upvotes is 0, a weight of 0.01 is used instead.

In [None]:
import pandas as pd
import numpy as np

In [2]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Shared analyzer instance; calculate_sentiment_scores below calls its polarity_scores method.
sia = SentimentIntensityAnalyzer()

def calculate_sentiment_scores(text):
    """Return the ``(neg, neu, pos, compound)`` polarity scores for ``text``.

    The values are read from the dict produced by the module-level ``sia``
    analyzer's ``polarity_scores`` call and returned as a 4-tuple in that
    fixed order.
    """
    polarity = sia.polarity_scores(text)
    return tuple(polarity[key] for key in ('neg', 'neu', 'pos', 'compound'))

def calculate_weighted_sentiment(score, sentiment_score):
    """Weight a sentiment value by a comment's score.

    Args:
        score: The comment's score (upvotes), used as the weight.
        sentiment_score: The sentiment value to scale.

    Returns:
        ``score * sentiment_score``, except that a score equal to 0 is
        replaced by a weight of 0.01 so the sentiment is not zeroed out.
    """
    if score == 0:
        # Zero-score comments still contribute, with a small fixed weight.
        return 0.01 * sentiment_score
    return score * sentiment_score


In [3]:
chunk_count = 0

chunk_size = 100000
processed_data = []

# Column names for the four values returned by calculate_sentiment_scores, in order
sentiment_columns = ['negative', 'neutral', 'positive', 'compound']

# First batch: stream the first 10,000,000 rows of the file, chunk_size rows at a time
for chunk in pd.read_csv(
    'text_comments.csv',
    chunksize=chunk_size,
    on_bad_lines='skip',
    lineterminator='\n',
    nrows=10000000,
):
    # Drop rows where 'body' is null
    chunk = chunk.dropna(subset=['body'])

    # Convert 'created_utc' to standard datetime
    chunk['created_utc'] = pd.to_datetime(chunk['created_utc'], unit='s')

    # Calculate sentiment scores. The score tuples are gathered into one
    # DataFrame in a single step; expanding them with .apply(pd.Series)
    # would construct a separate Series object for every comment.
    scores = pd.DataFrame(
        [calculate_sentiment_scores(body) for body in chunk['body']],
        index=chunk.index,
        columns=sentiment_columns,
        dtype=float,
    )
    chunk[sentiment_columns] = scores

    # Calculate weighted sentiment scores. Pairing the two columns with zip
    # applies the same helper to each row without the per-row Series that
    # DataFrame.apply(axis=1) creates.
    for sentiment in sentiment_columns:
        chunk[f'weighted_{sentiment}'] = [
            calculate_weighted_sentiment(score, value)
            for score, value in zip(chunk['score'], chunk[sentiment])
        ]

    # Keep only necessary columns
    chunk = chunk[['id', 'score', 'link_id', 'subreddit', 'created_utc', 'negative', 'neutral', 'positive', 'compound', 'weighted_negative', 'weighted_neutral', 'weighted_positive', 'weighted_compound']]

    processed_data.append(chunk)
    chunk_count += 1
    print(f"Chunk {chunk_count} processed")

# Concatenate all processed chunks
final_df = pd.concat(processed_data)

# Save the final DataFrame to a new CSV file
final_df.to_csv('processed_comments1.csv', index=False)



Chunk 1 processed
Chunk 2 processed
Chunk 3 processed
Chunk 4 processed
Chunk 5 processed
Chunk 6 processed
Chunk 7 processed
Chunk 8 processed
Chunk 9 processed
Chunk 10 processed
Chunk 11 processed
Chunk 12 processed
Chunk 13 processed
Chunk 14 processed
Chunk 15 processed
Chunk 16 processed
Chunk 17 processed
Chunk 18 processed
Chunk 19 processed
Chunk 20 processed
Chunk 21 processed
Chunk 22 processed
Chunk 23 processed
Chunk 24 processed
Chunk 25 processed
Chunk 26 processed
Chunk 27 processed
Chunk 28 processed
Chunk 29 processed
Chunk 30 processed
Chunk 31 processed
Chunk 32 processed
Chunk 33 processed
Chunk 34 processed
Chunk 35 processed
Chunk 36 processed
Chunk 37 processed
Chunk 38 processed
Chunk 39 processed
Chunk 40 processed
Chunk 41 processed
Chunk 42 processed
Chunk 43 processed
Chunk 44 processed
Chunk 45 processed
Chunk 46 processed
Chunk 47 processed
Chunk 48 processed
Chunk 49 processed
Chunk 50 processed
Chunk 51 processed
Chunk 52 processed
Chunk 53 processed
Ch

In [4]:

# Number of rows to skip (10,000,000 already processed) and number of rows to read
rows_to_skip = 10000000
rows_to_read = 10000000

# Column names for the four values returned by calculate_sentiment_scores, in order
sentiment_columns = ['negative', 'neutral', 'positive', 'compound']

# NOTE(review): processed_data and chunk_count are not reset in this cell, so
# they still hold whatever earlier cells left in them and the CSV written below
# includes those rows as well - confirm this cumulative output is intended.

# Read the CSV file in chunks, skipping the first 10,000,000 data rows.
# Line 0 is the header, so (assuming one record per line) data rows
# 1..rows_to_skip sit on lines 1..rows_to_skip. range() excludes its stop
# value, hence the + 1; without it the last already-processed row is read again.
for chunk in pd.read_csv(
    'text_comments.csv',
    chunksize=chunk_size,
    skiprows=range(1, rows_to_skip + 1),
    nrows=rows_to_read,
    on_bad_lines='skip',
    lineterminator='\n',
):
    # Drop rows where 'body' is null
    chunk = chunk.dropna(subset=['body'])

    # Convert 'created_utc' to standard datetime
    chunk['created_utc'] = pd.to_datetime(chunk['created_utc'], unit='s')

    # Calculate sentiment scores. The score tuples are gathered into one
    # DataFrame in a single step; expanding them with .apply(pd.Series)
    # would construct a separate Series object for every comment.
    scores = pd.DataFrame(
        [calculate_sentiment_scores(body) for body in chunk['body']],
        index=chunk.index,
        columns=sentiment_columns,
        dtype=float,
    )
    chunk[sentiment_columns] = scores

    # Calculate weighted sentiment scores. Pairing the two columns with zip
    # applies the same helper to each row without the per-row Series that
    # DataFrame.apply(axis=1) creates.
    for sentiment in sentiment_columns:
        chunk[f'weighted_{sentiment}'] = [
            calculate_weighted_sentiment(score, value)
            for score, value in zip(chunk['score'], chunk[sentiment])
        ]

    # Keep only necessary columns
    chunk = chunk[['id', 'score', 'link_id', 'subreddit', 'created_utc', 'negative', 'neutral', 'positive', 'compound', 'weighted_negative', 'weighted_neutral', 'weighted_positive', 'weighted_compound']]

    processed_data.append(chunk)
    chunk_count += 1
    print(f"Chunk {chunk_count} processed")

# Concatenate all processed chunks
final_df = pd.concat(processed_data)

# Save the final DataFrame to a new CSV file
final_df.to_csv('processed_comments2.csv', index=False)


Chunk 101 processed
Chunk 102 processed
Chunk 103 processed
Chunk 104 processed
Chunk 105 processed
Chunk 106 processed
Chunk 107 processed
Chunk 108 processed
Chunk 109 processed
Chunk 110 processed
Chunk 111 processed
Chunk 112 processed
Chunk 113 processed
Chunk 114 processed
Chunk 115 processed
Chunk 116 processed
Chunk 117 processed
Chunk 118 processed
Chunk 119 processed
Chunk 120 processed
Chunk 121 processed
Chunk 122 processed
Chunk 123 processed
Chunk 124 processed
Chunk 125 processed
Chunk 126 processed
Chunk 127 processed
Chunk 128 processed
Chunk 129 processed
Chunk 130 processed
Chunk 131 processed
Chunk 132 processed
Chunk 133 processed
Chunk 134 processed
Chunk 135 processed
Chunk 136 processed
Chunk 137 processed
Chunk 138 processed
Chunk 139 processed
Chunk 140 processed
Chunk 141 processed
Chunk 142 processed
Chunk 143 processed
Chunk 144 processed
Chunk 145 processed
Chunk 146 processed
Chunk 147 processed
Chunk 148 processed
Chunk 149 processed
Chunk 150 processed


In [5]:

# Number of rows to skip (20,000,000 already processed) and number of rows to read
rows_to_skip = 20000000
rows_to_read = 10000000

# Column names for the four values returned by calculate_sentiment_scores, in order
sentiment_columns = ['negative', 'neutral', 'positive', 'compound']

# NOTE(review): processed_data and chunk_count are not reset in this cell, so
# they still hold whatever earlier cells left in them and the CSV written below
# includes those rows as well - confirm this cumulative output is intended.

# Read the CSV file in chunks, skipping the first 20,000,000 data rows.
# Line 0 is the header, so (assuming one record per line) data rows
# 1..rows_to_skip sit on lines 1..rows_to_skip. range() excludes its stop
# value, hence the + 1; without it the last already-processed row is read again.
for chunk in pd.read_csv(
    'text_comments.csv',
    chunksize=chunk_size,
    skiprows=range(1, rows_to_skip + 1),
    nrows=rows_to_read,
    on_bad_lines='skip',
    lineterminator='\n',
):
    # Drop rows where 'body' is null
    chunk = chunk.dropna(subset=['body'])

    # Convert 'created_utc' to standard datetime
    chunk['created_utc'] = pd.to_datetime(chunk['created_utc'], unit='s')

    # Calculate sentiment scores. The score tuples are gathered into one
    # DataFrame in a single step; expanding them with .apply(pd.Series)
    # would construct a separate Series object for every comment.
    scores = pd.DataFrame(
        [calculate_sentiment_scores(body) for body in chunk['body']],
        index=chunk.index,
        columns=sentiment_columns,
        dtype=float,
    )
    chunk[sentiment_columns] = scores

    # Calculate weighted sentiment scores. Pairing the two columns with zip
    # applies the same helper to each row without the per-row Series that
    # DataFrame.apply(axis=1) creates.
    for sentiment in sentiment_columns:
        chunk[f'weighted_{sentiment}'] = [
            calculate_weighted_sentiment(score, value)
            for score, value in zip(chunk['score'], chunk[sentiment])
        ]

    # Keep only necessary columns
    chunk = chunk[['id', 'score', 'link_id', 'subreddit', 'created_utc', 'negative', 'neutral', 'positive', 'compound', 'weighted_negative', 'weighted_neutral', 'weighted_positive', 'weighted_compound']]

    processed_data.append(chunk)
    chunk_count += 1
    print(f"Chunk {chunk_count} processed")

# Concatenate all processed chunks
final_df = pd.concat(processed_data)

# Save the final DataFrame to a new CSV file
final_df.to_csv('processed_comments3.csv', index=False)


Chunk 201 processed
Chunk 202 processed
Chunk 203 processed
Chunk 204 processed
Chunk 205 processed
Chunk 206 processed
Chunk 207 processed
Chunk 208 processed
Chunk 209 processed
Chunk 210 processed
Chunk 211 processed
Chunk 212 processed
Chunk 213 processed
Chunk 214 processed
Chunk 215 processed
Chunk 216 processed
Chunk 217 processed
Chunk 218 processed
Chunk 219 processed
Chunk 220 processed
Chunk 221 processed
Chunk 222 processed
Chunk 223 processed
Chunk 224 processed
Chunk 225 processed
Chunk 226 processed
Chunk 227 processed
Chunk 228 processed
Chunk 229 processed
Chunk 230 processed
Chunk 231 processed
Chunk 232 processed
Chunk 233 processed
Chunk 234 processed
Chunk 235 processed
Chunk 236 processed
Chunk 237 processed
Chunk 238 processed
Chunk 239 processed
Chunk 240 processed
Chunk 241 processed
Chunk 242 processed
Chunk 243 processed
Chunk 244 processed
Chunk 245 processed
Chunk 246 processed
Chunk 247 processed
Chunk 248 processed
Chunk 249 processed
Chunk 250 processed


In [7]:

# Number of rows to skip (30,000,000 already processed) and number of rows to read
rows_to_skip = 30000000
rows_to_read = 10000000

# Column names for the four values returned by calculate_sentiment_scores, in order
sentiment_columns = ['negative', 'neutral', 'positive', 'compound']

# NOTE(review): processed_data and chunk_count are not reset in this cell, so
# they still hold whatever earlier cells left in them and the CSV written below
# includes those rows as well - confirm this cumulative output is intended.

# Read the CSV file in chunks, skipping the first 30,000,000 data rows.
# Line 0 is the header, so (assuming one record per line) data rows
# 1..rows_to_skip sit on lines 1..rows_to_skip. range() excludes its stop
# value, hence the + 1; without it the last already-processed row is read again.
for chunk in pd.read_csv(
    'text_comments.csv',
    chunksize=chunk_size,
    skiprows=range(1, rows_to_skip + 1),
    nrows=rows_to_read,
    on_bad_lines='skip',
    lineterminator='\n',
):
    # Drop rows where 'body' is null
    chunk = chunk.dropna(subset=['body'])

    # Convert 'created_utc' to standard datetime
    chunk['created_utc'] = pd.to_datetime(chunk['created_utc'], unit='s')

    # Calculate sentiment scores. The score tuples are gathered into one
    # DataFrame in a single step; expanding them with .apply(pd.Series)
    # would construct a separate Series object for every comment.
    scores = pd.DataFrame(
        [calculate_sentiment_scores(body) for body in chunk['body']],
        index=chunk.index,
        columns=sentiment_columns,
        dtype=float,
    )
    chunk[sentiment_columns] = scores

    # Calculate weighted sentiment scores. Pairing the two columns with zip
    # applies the same helper to each row without the per-row Series that
    # DataFrame.apply(axis=1) creates.
    for sentiment in sentiment_columns:
        chunk[f'weighted_{sentiment}'] = [
            calculate_weighted_sentiment(score, value)
            for score, value in zip(chunk['score'], chunk[sentiment])
        ]

    # Keep only necessary columns
    chunk = chunk[['id', 'score', 'link_id', 'subreddit', 'created_utc', 'negative', 'neutral', 'positive', 'compound', 'weighted_negative', 'weighted_neutral', 'weighted_positive', 'weighted_compound']]

    processed_data.append(chunk)
    chunk_count += 1
    print(f"Chunk {chunk_count} processed")

# Concatenate all processed chunks
final_df = pd.concat(processed_data)

# Save the final DataFrame to a new CSV file. The fourth batch gets its own
# file so the third batch's processed_comments3.csv is not overwritten.
final_df.to_csv('processed_comments4.csv', index=False)


Chunk 301 processed
Chunk 302 processed
Chunk 303 processed
Chunk 304 processed
Chunk 305 processed
Chunk 306 processed
Chunk 307 processed
Chunk 308 processed
Chunk 309 processed
Chunk 310 processed
Chunk 311 processed
Chunk 312 processed
Chunk 313 processed
Chunk 314 processed
Chunk 315 processed
Chunk 316 processed
Chunk 317 processed
Chunk 318 processed
Chunk 319 processed
Chunk 320 processed
Chunk 321 processed
Chunk 322 processed
Chunk 323 processed
Chunk 324 processed
Chunk 325 processed
Chunk 326 processed
Chunk 327 processed
Chunk 328 processed
Chunk 329 processed
Chunk 330 processed
Chunk 331 processed
Chunk 332 processed
Chunk 333 processed
Chunk 334 processed
Chunk 335 processed
Chunk 336 processed
Chunk 337 processed
Chunk 338 processed
Chunk 339 processed
Chunk 340 processed
Chunk 341 processed
Chunk 342 processed
Chunk 343 processed
Chunk 344 processed
Chunk 345 processed
Chunk 346 processed
Chunk 347 processed
Chunk 348 processed
Chunk 349 processed
Chunk 350 processed


In [8]:

# Number of rows to skip (40,000,000 already processed) and number of rows to read
rows_to_skip = 40000000
rows_to_read = 10000000

# Column names for the four values returned by calculate_sentiment_scores, in order
sentiment_columns = ['negative', 'neutral', 'positive', 'compound']

# NOTE(review): processed_data and chunk_count are not reset in this cell, so
# they still hold whatever earlier cells left in them and the CSV written below
# includes those rows as well - confirm this cumulative output is intended.

# Read the CSV file in chunks, skipping the first 40,000,000 data rows.
# Line 0 is the header, so (assuming one record per line) data rows
# 1..rows_to_skip sit on lines 1..rows_to_skip. range() excludes its stop
# value, hence the + 1; without it the last already-processed row is read again.
for chunk in pd.read_csv(
    'text_comments.csv',
    chunksize=chunk_size,
    skiprows=range(1, rows_to_skip + 1),
    nrows=rows_to_read,
    on_bad_lines='skip',
    lineterminator='\n',
):
    # Drop rows where 'body' is null
    chunk = chunk.dropna(subset=['body'])

    # Convert 'created_utc' to standard datetime
    chunk['created_utc'] = pd.to_datetime(chunk['created_utc'], unit='s')

    # Calculate sentiment scores. The score tuples are gathered into one
    # DataFrame in a single step; expanding them with .apply(pd.Series)
    # would construct a separate Series object for every comment.
    scores = pd.DataFrame(
        [calculate_sentiment_scores(body) for body in chunk['body']],
        index=chunk.index,
        columns=sentiment_columns,
        dtype=float,
    )
    chunk[sentiment_columns] = scores

    # Calculate weighted sentiment scores. Pairing the two columns with zip
    # applies the same helper to each row without the per-row Series that
    # DataFrame.apply(axis=1) creates.
    for sentiment in sentiment_columns:
        chunk[f'weighted_{sentiment}'] = [
            calculate_weighted_sentiment(score, value)
            for score, value in zip(chunk['score'], chunk[sentiment])
        ]

    # Keep only necessary columns
    chunk = chunk[['id', 'score', 'link_id', 'subreddit', 'created_utc', 'negative', 'neutral', 'positive', 'compound', 'weighted_negative', 'weighted_neutral', 'weighted_positive', 'weighted_compound']]

    processed_data.append(chunk)
    chunk_count += 1
    print(f"Chunk {chunk_count} processed")

# Concatenate all processed chunks
final_df = pd.concat(processed_data)

# Save the final DataFrame to a new CSV file. This is the fifth 10,000,000-row
# batch, so it is numbered 5 to keep one distinct output file per batch.
final_df.to_csv('processed_comments5.csv', index=False)


Chunk 401 processed
Chunk 402 processed
Chunk 403 processed
Chunk 404 processed
Chunk 405 processed
Chunk 406 processed
Chunk 407 processed
Chunk 408 processed
