### Data processing

In [23]:
import pandas as pd
import time
import re
# !pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ThorNørgaardEriksen\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


### Load and preprocess data

In [26]:
# Load and preprocess the scraped Reddit comments.
#
# Pipeline (same order as before):
#   1. read the eight per-subreddit / per-query CSV files and stack them
#   2. keep the raw text in `comment_original`
#   3. drop duplicate comments and comments with 1 or fewer upvotes
#   4. clean the text (strip URLs, lowercase, strip special characters)
#   5. drop comments whose cleaned text has fewer than MIN_WORDS words
#   6. add a stopword-free variant of the cleaned text
#
# The result is bound to `df_combined`, which the cells below rely on.

# Tunable preprocessing parameters
MIN_UPVOTES_EXCLUSIVE = 1   # keep comments with strictly more upvotes than this
MIN_WORDS = 4               # keep comments with at least this many words after cleaning
# The dot after "www" is escaped so that only literal "www." prefixes match;
# unescaped, "www" followed by ANY character (e.g. "www is") would be deleted.
URL_PATTERN = r'http\S+|www\.\S+'
# Everything except letters, digits and ".", ",", "!", "?" is replaced by a space
DISALLOWED_CHARS_PATTERN = r'[^a-zA-Z0-9.,!?]'

# Load dataframes
df_politics_Israel = pd.read_csv('data/politics_Israel_comments.csv')
df_politics_Palestine = pd.read_csv('data/politics_Palestine_comments.csv')
df_worldnews_Israel = pd.read_csv('data/worldnews_Israel_comments.csv')
df_worldnews_Palestine = pd.read_csv('data/worldnews_Palestine_comments.csv')
df_Israel_Israel = pd.read_csv('data/Israel_Israel_comments.csv')
df_Israel_Palestine = pd.read_csv('data/Israel_Palestine_comments.csv')
df_Palestine_Israel = pd.read_csv('data/Palestine_Israel_comments.csv')
df_Palestine_Palestine = pd.read_csv('data/Palestine_Palestine_comments.csv')

# Concatenate the DataFrames; ignore_index gives every row a unique label so
# the column assignments further down align unambiguously.
df_combined = pd.concat(
    [
        df_politics_Israel, df_politics_Palestine,
        df_worldnews_Israel, df_worldnews_Palestine,
        df_Israel_Israel, df_Israel_Palestine,
        df_Palestine_Israel, df_Palestine_Palestine,
    ],
    ignore_index=True,
)

# Print initial shape
print(f"Combined DataFrame shape before processing: {df_combined.shape}")

# Row-level filtering. Each step returns a new frame, so no values are ever
# written into a filtered slice (avoids SettingWithCopyWarning) and the cell
# can be re-run safely.
df_filtered = (
    df_combined
    # Rows without any comment text cannot be cleaned (the string operations
    # below would raise TypeError on NaN), so they are dropped up front.
    .dropna(subset=['comment_text'])
    # Store original text in a new column for reference
    .assign(comment_original=lambda frame: frame['comment_text'])
    # Remove duplicate comments based on 'comment_text'
    .drop_duplicates(subset='comment_text')
    # Remove comments with 1 or fewer upvotes
    .loc[lambda frame: frame['upvotes'] > MIN_UPVOTES_EXCLUSIVE]
)

# Text preprocessing: remove URLs, lowercase, then replace special characters
# (everything except ".", "!", "?" and ",") with a space.
comment_cleaned = (
    df_filtered['comment_text']
    .astype(str)
    .str.replace(URL_PATTERN, '', regex=True)
    .str.lower()
    .str.replace(DISALLOWED_CHARS_PATTERN, ' ', regex=True)
)

# Swap the cleaned text in and rename the column to 'comment_processed'
df_combined = (
    df_filtered
    .assign(comment_text=comment_cleaned)
    .rename(columns={'comment_text': 'comment_processed'})
)

# Remove row if processed text is below MIN_WORDS words
word_counts = df_combined['comment_processed'].str.split().str.len()
df_combined = df_combined.loc[word_counts >= MIN_WORDS]

# New column without stopwords. Tokens are split on whitespace only, so a
# stopword with punctuation attached (e.g. "is,") is kept as-is.
stop_words = set(stopwords.words('english'))
df_combined = df_combined.assign(
    comment_no_stopwords=df_combined['comment_processed'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words)
    )
)

# Reorder columns
df_combined = df_combined[['created_date', 'subreddit_id', 'search_query', 'post_id', 'comment_id',
                           'post_title', 'comment_original', 'comment_processed', 'comment_no_stopwords', 'upvotes',
                           'post_url', 'comment_url']]

# Reset index
df_combined = df_combined.reset_index(drop=True)

# Print shape after processing
print(f"Combined DataFrame shape after processing: {df_combined.shape}")

# Display final DataFrame
df_combined.head(2)


Combined DataFrame shape before processing: (14585, 10)
Combined DataFrame shape after processing: (11600, 12)


Unnamed: 0,created_date,subreddit_id,search_query,post_id,comment_id,post_title,comment_original,comment_processed,comment_no_stopwords,upvotes,post_url,comment_url
0,2024-10-27 15:32:33,politics,Israel,1gdcy25,lu0s0st,Bernie Sanders to voters skipping presidential...,"News flash, Trump is worse on every single vot...","news flash, trump is worse on every single vot...","news flash, trump worse every single voting is...",8981,https://www.reddit.com/r/politics/comments/1gd...,https://www.reddit.com/r/politics/comments/1gd...
1,2024-10-27 15:08:10,politics,Israel,1gdcy25,lu0nkvc,Bernie Sanders to voters skipping presidential...,Protest non-voters are some of the worst type ...,protest non voters are some of the worst type ...,protest non voters worst type privileged peopl...,4186,https://www.reddit.com/r/politics/comments/1gd...,https://www.reddit.com/r/politics/comments/1gd...


Example comparing an original comment with its processed version

In [27]:
# Show the first row's comment URL, then its original text and its processed
# text, one value per line, so the effect of the preprocessing can be compared.
first_comment = df_combined.iloc[0]
for column_name in ('comment_url', 'comment_original', 'comment_processed'):
    print(first_comment[column_name])

https://www.reddit.com/r/politics/comments/1gdcy25/bernie_sanders_to_voters_skipping_presidential/lu0s0st
News flash, Trump is worse on every single voting issue. Every. Single. One.
news flash, trump is worse on every single voting issue. every. single. one.


### Apply sentiment and Lix

In [61]:
%run sentiment_analysis.ipynb
%run lix_calc_function.ipynb # Use this once we have made a Lix_Calc that should only contain the function

#Apply function "lix_calc3"
df_combined['lix_score'] = df_combined['comment_processed'].apply(lix_calc_used)

# Apply function "calculate_sentiment"
df_combined['sentiment_score'] = df_combined['comment_processed'].apply(calculate_sentiment)


In [62]:
# Preview the first two rows of df_combined (rich display as the cell's last expression)
df_combined.head(2)

Unnamed: 0,created_date,subreddit_id,search_query,post_id,comment_id,post_title,comment_original,comment_processed,comment_no_stopwords,upvotes,post_url,comment_url,lix_score,sentiment_score
0,2024-10-27 15:32:33,politics,Israel,1gdcy25,lu0s0st,Bernie Sanders to voters skipping presidential...,"News flash, Trump is worse on every single vot...","news flash, trump is worse on every single vot...","news flash, trump worse every single voting is...",8981,https://www.reddit.com/r/politics/comments/1gd...,https://www.reddit.com/r/politics/comments/1gd...,3.25,0.0
1,2024-10-27 15:08:10,politics,Israel,1gdcy25,lu0nkvc,Bernie Sanders to voters skipping presidential...,Protest non-voters are some of the worst type ...,protest non voters are some of the worst type ...,protest non voters worst type privileged peopl...,4186,https://www.reddit.com/r/politics/comments/1gd...,https://www.reddit.com/r/politics/comments/1gd...,40.104651,0.0


### Export new dataframes

In [44]:
# Export one CSV file per distinct 'subreddit_id' value. Each file holds only
# the rows of that subreddit and is written without the DataFrame index.
subreddit_of_row = df_combined['subreddit_id']
subreddits = subreddit_of_row.unique()
for subreddit in subreddits:
    output_path = f'data/{subreddit}_w_lix_sentiment.csv'
    rows_for_subreddit = df_combined.loc[subreddit_of_row == subreddit]
    rows_for_subreddit.to_csv(output_path, index=False)