<a href="https://colab.research.google.com/github/gemammercado/IsraelPalestineResearch/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import re
import zipfile
import nltk
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from IPython import get_ipython
from IPython.display import display
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Environment switches must be set before the Trainer is created.
# Presumably this turns off the Weights & Biases integration — confirm against the transformers docs.
os.environ["WANDB_DISABLED"] = "true"

# NOTE(review): this is a Homebrew (macOS) location; confirm it is needed, or even valid, on Colab.
os.environ['SHELL'] = '/opt/homebrew/bin/bash'
nltk.download('vader_lexicon')


drive.mount('/content/drive')


# NOTE(review): the prediction-export cell further down writes to this same path,
# so after a first full run this cell reads the notebook's own output.
csv_path = '/content/drive/MyDrive/bert_predictions.csv'
df = pd.read_csv(csv_path)

# Troubleshooting recipe for a broken NLTK install. Kept as comments: the previous
# triple-quoted string was a no-op expression that got echoed as the cell's output.
# !pip uninstall nltk -y # Uninstall NLTK
# !pip install --upgrade --force-reinstall nltk # Reinstall NLTK
#
# import nltk
# nltk.download('vader_lexicon') # Download vader_lexicon


Now we have a `filtered_df` and a CSV of the final data that we are going to use.

In [None]:
sia = SentimentIntensityAnalyzer()

vader_sample_size = 10000
bert_sample_size = 250000

# Draw two disjoint random subsets: one to be labelled with VADER, one to be labelled by BERT.
# Requested sizes are clamped to the rows actually available, because
# DataFrame.sample raises ValueError when n exceeds the population (replace=False).
vader_df = df.sample(n=min(vader_sample_size, len(df)), random_state=42)

# Remove the VADER rows (by index label) so the BERT subset cannot overlap them.
remaining_df = df.drop(vader_df.index)

bert_df = remaining_df.sample(n=min(bert_sample_size, len(remaining_df)), random_state=42)

print(f"VADER subset size: {len(vader_df)}")
print(f"BERT subset size: {len(bert_df)}")

In [None]:
def clean_text(self_text):
    """Normalise a post body for sentiment scoring.

    Removes URLs (``http...`` up to the next whitespace), ``@mentions`` and every
    character that is not an ASCII letter, digit or whitespace, then lower-cases
    and strips the result.

    Args:
        self_text: The raw text as a ``str``.

    Returns:
        The cleaned, lower-cased string. Interior whitespace is not collapsed.
    """
    text = re.sub(r"http\S+", "", self_text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    # Continue from `text`: restarting from `self_text` here would discard
    # all three substitutions above.
    text = text.lower().strip()
    return text

# Clean every post body in the VADER subset; str() guards against non-string cells.
vader_df['clean_text'] = vader_df['self_text'].map(lambda raw: clean_text(str(raw)))

vader_preview = vader_df.head()
print(vader_preview)

In [None]:
# Same cleaning pass for the BERT subset; str() guards against non-string cells.
bert_df['clean_text'] = bert_df['self_text'].map(lambda raw: clean_text(str(raw)))

bert_preview = bert_df.head()
print(bert_preview)

In [None]:
def _label_from_compound(score):
    """Bucket a compound score into one of the four sentiment labels used in this notebook."""
    if score < -0.5:
        return 'extreme negative'
    if score < -.05:
        # The guard above already excluded scores below -0.5.
        return 'negative'
    if score >= 0.05:
        return 'positive'
    return 'neutral'


# Score each cleaned text with the analyzer, keeping only the 'compound' value.
vader_df['vader_sentiment'] = vader_df['clean_text'].map(lambda text: sia.polarity_scores(text)['compound'])

vader_df['sentiment_label'] = vader_df['vader_sentiment'].map(_label_from_compound)

In [None]:
# Quick look at the cleaned text next to its score and label.
preview_columns = ['clean_text', 'vader_sentiment', 'sentiment_label']
print(vader_df[preview_columns].head())

In [None]:
# 80/20 split of the VADER-labelled rows with a fixed seed.
train_df, test_df = train_test_split(vader_df, test_size=0.2, random_state=42)

# Integer class ids for the four sentiment labels.
label_map = {'extreme negative': 0, 'negative':1, 'neutral': 2, 'positive':3}
train_labels, test_labels = (
    part['sentiment_label'].map(label_map).tolist()
    for part in (train_df, test_df)
)

In [None]:
# Show which columns the training split carries.
train_columns = train_df.columns
print(train_columns)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    The call requests padding, truncation and PyTorch tensors, and returns
    whatever the tokenizer produces for those options.
    """
    encoded = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    return encoded

# Encode the cleaned text of both splits.
train_encodings, test_encodings = (
    tokenize_data(split['clean_text'].tolist())
    for split in (train_df, test_df)
)

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping with an ``"input_ids"`` entry and an ``items()`` method;
            every value must be indexable by example position.
        labels: Optional sequence of per-example labels. When ``None`` (inference),
            items carry no ``'labels'`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # Key access works for plain dicts as well as tokenizer outputs.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus ``'labels'`` if available)."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self._as_tensor(self.labels[idx])
        return item

    @staticmethod
    def _as_tensor(value):
        """Copy ``value`` into a fresh tensor without the copy-construct warning."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) warns on every call; this is the
            # equivalent PyTorch recommends.
            return value.clone().detach()
        return torch.tensor(value)


In [None]:
# Wrap each split's encodings and labels in a SentimentDataset.
train_dataset, test_dataset = (
    SentimentDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

Define and train BERT

In [None]:
# One output class per entry in label_map.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = len(label_map))

# Checkpoints go to Drive: the resume cell below expects
# <this directory>/checkpoint-2000, which never exists if checkpoints are
# written to the session-local './results' directory.
checkpoint_dir = '/content/drive/MyDrive/bert_training/bert_classifier'

training_args = TrainingArguments(
    output_dir = checkpoint_dir,
    save_steps = 500,          # write a checkpoint every 500 optimizer steps
    save_total_limit = 2,      # keep only the two most recent checkpoints
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs=3,
    weight_decay = 0.01
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)


In [None]:
# Write the trainer's current model to Drive and save the trainer state.
# NOTE(review): in cell order this runs before trainer.train(), so it stores the
# model as constructed above, not a fine-tuned one; nothing here repeats per epoch.
trainer.save_model('/content/drive/MyDrive/bert_training/bert_classifier')
trainer.save_state()

In [None]:
import torch

# Persist the tokenised inputs and integer labels so they can be reloaded
# without re-tokenising.
artifacts = {
    '/content/drive/MyDrive/train_encodings.pt': train_encodings,
    '/content/drive/MyDrive/test_encodings.pt': test_encodings,
    '/content/drive/MyDrive/train_labels.pt': train_labels,
    '/content/drive/MyDrive/test_labels.pt': test_labels,
}
for destination, payload in artifacts.items():
    torch.save(payload, destination)


In [None]:
# Fine-tune, then persist the result. The loader cell below reads its weights
# from this directory, and the only other save in the notebook runs before
# training starts.
trainer.train()
trainer.save_model('/content/drive/MyDrive/bert_training/bert_classifier')

In [None]:
# Optional: continue an interrupted run from a saved checkpoint.
resume_checkpoint = '/content/drive/MyDrive/bert_training/bert_classifier/checkpoint-2000'
if os.path.isdir(resume_checkpoint):
    trainer.train(resume_from_checkpoint=resume_checkpoint)
    # Persist the resumed weights; the loader cell reads from this directory.
    trainer.save_model('/content/drive/MyDrive/bert_training/bert_classifier')
else:
    # Keeps "Run all" working on a fresh session where no such checkpoint exists.
    print(f"No checkpoint found at {resume_checkpoint}; skipping resume.")

In [None]:
# Load the saved classifier from Drive.
model_path = '/content/drive/MyDrive/bert_training/bert_classifier'
model = BertForSequenceClassification.from_pretrained(model_path)

# The Trainer in this notebook is built without a tokenizer, so save_model()
# does not write vocabulary files into model_path. Fall back to the base
# checkpoint the tokenizer was originally created from when no vocab is present.
tokenizer_source = model_path if os.path.isfile(os.path.join(model_path, 'vocab.txt')) else 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_source)

In [None]:
# Encode the BERT subset; no labels are attached because these rows are to be predicted.
bert_texts = bert_df['clean_text'].tolist()
remaining_encodings = tokenize_data(bert_texts)
remaining_dataset = SentimentDataset(remaining_encodings)

In [None]:
# Rebuild the Trainer around the loaded model with a larger evaluation batch size.
training_args = TrainingArguments(output_dir="./results", per_device_eval_batch_size=128)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

predictions = trainer.predict(remaining_dataset)

# Take the highest-scoring class index along axis 1 of the prediction array.
class_scores = predictions.predictions
predicted_labels = class_scores.argmax(axis=1)

print("Predictions complete!")


In [None]:
# Invert label_map so class ids can be turned back into label names.
reverse_label_map = dict((class_id, name) for name, class_id in label_map.items())

predicted_indices = predicted_labels

# From here on predicted_labels holds label names instead of class ids.
predicted_labels = list(map(reverse_label_map.__getitem__, predicted_indices))

bert_df['predicted_label'] = predicted_labels

# Save to CSV
output_csv_path = '/content/drive/MyDrive/bert_predictions.csv'
bert_df.to_csv(output_csv_path, index=False)
print(f"Predictions saved to: {output_csv_path}")

In [None]:
import matplotlib.pyplot as plt

# How many rows fall under each predicted label.
label_counts = df['predicted_label'].value_counts()
print(label_counts)

# Same counts as a bar chart.
ax = label_counts.plot(kind='bar')
ax.set_xlabel('Sentiment Distribution')
ax.set_ylabel('Frequency')
ax.set_title('BERT Sentiment Analysis')
plt.show()

In [None]:
# Map sentiment labels to numeric values
label_map = {0: 'extreme negative', 1: 'negative', 2: 'neutral', 3:'positive'}
df['predicted_label'] = df['predicted_label'].map(label_map)


In [None]:
# Column index followed by the first rows of the working frame.
for overview in (df.columns, df.head()):
    print(overview)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
# seaborn is needed here: this cell calls sns.barplot, but the first seaborn
# import in the notebook sits in a later cell, so "Run all" hit a NameError.
import seaborn as sns

# Share of each predicted label within every subreddit (rows sum to 1).
subreddit_extremism = df.groupby('subreddit')['predicted_label'].value_counts(normalize=True).unstack().fillna(0)

subreddit_extremism = subreddit_extremism.reset_index()
plt.figure(figsize=(10, 6))
# Long format: one row per (subreddit, sentiment) pair for seaborn's hue grouping.
subreddit_extremism_melted = subreddit_extremism.melt(id_vars='subreddit', var_name='Sentiment', value_name='Proportion')
custom_colors = ['#A92222', '#EC1414', '#B2B2B2', '#0000']  # Replace with your desired colors

sns.barplot(
    data=subreddit_extremism_melted,
    x='subreddit',
    y='Proportion',
    hue='Sentiment',
    palette=custom_colors
)

# Render the 0-1 proportions on the y axis as whole percentages.
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda y, _: f'{y * 100:.0f}%'))

plt.title('Proportion of Sentiments by Subreddit')
plt.xlabel('Subreddit')
plt.ylabel('Proportion')
plt.xticks(rotation=45)
plt.legend(title='Sentiment')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One palette colour per distinct predicted label.
label_count = df['predicted_label'].nunique()
palette = sns.color_palette("Set2", n_colors=label_count)

plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df, x='subreddit', y='score', hue='predicted_label', palette=palette)

ax.set_yscale('log')

ax.set_title('Sentiment Score Distribution by Subreddit', fontsize=16)
ax.set_xlabel('Subreddit', fontsize=12)
ax.set_ylabel('Score (Log Scale)', fontsize=12)
plt.xticks(rotation=45)
ax.legend(title='Sentiment', loc='upper left')

plt.tight_layout()

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Derive a 'YYYY-MM' month string from the creation timestamp.
df['created_time'] = pd.to_datetime(df['created_time'])
df['month'] = df['created_time'].dt.to_period('M').astype(str)

# Rows: (month, subreddit); columns: predicted label; values: row counts.
sentiment_counts = (
    df.groupby(['month', 'subreddit', 'predicted_label'])
      .size()
      .unstack(fill_value=0)
)

# Normalise each (month, subreddit) row so its labels sum to 1.
row_totals = sentiment_counts.sum(axis=1)
sentiment_proportions = sentiment_counts.div(row_totals, axis=0).reset_index()

df_melted = sentiment_proportions.melt(
    id_vars=['month', 'subreddit'],
    value_vars=['extreme negative', 'negative', 'neutral', 'positive'],
    var_name='Sentiment',
    value_name='Proportion',
)

plt.figure(figsize=(12, 6))

custom_colors = {
    'extreme negative': '#A92222',
    'negative': '#EC1414',
    'neutral': '#B2B2B2',
    'positive': '#020405'
}

# The two negative classes get a heavier line.
for sentiment, color in custom_colors.items():
    if sentiment in ['extreme negative', 'negative']:
        linewidth = 2.5
    else:
        linewidth = 1.5
    sentiment_rows = df_melted[df_melted['Sentiment'] == sentiment]
    sns.lineplot(
        data=sentiment_rows,
        x='month',
        y='Proportion',
        label=sentiment,
        color=color,
        linewidth=linewidth,
        marker='o',
        ci=None,
    )

ax = plt.gca()
# Show the 0-1 proportions as whole percentages.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda y, _: f'{y * 100:.0f}%'))

ax.set_title('Sentiment Over Time', fontsize=16)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Proportion of Sentiment', fontsize=12)
plt.xticks(rotation=45)
ax.legend(title='Sentiment', loc='upper left')
ax.set_ylim(0, 0.5)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df['created_time'] = pd.to_datetime(df['created_time'])
df['month'] = df['created_time'].dt.to_period('M').astype(str)

# Rows: month; columns: predicted label; values: row counts.
sentiment_counts = (
    df.groupby(['month', 'predicted_label'])
      .size()
      .unstack(fill_value=0)
)

# .get falls back to 0 when a label column is absent.
positive_count = sentiment_counts.get('positive', 0)
negative_count = sentiment_counts.get('negative', 0) + sentiment_counts.get('extreme negative', 0)

# Positive-to-negative ratio per month; months without negatives are set to infinity.
polarity = positive_count / negative_count
polarity[negative_count == 0] = float('inf')

plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.plot(polarity.index, polarity.values, marker='o', linestyle='-', color='b', label='Polarity')
ax.set_title('Polarity Over Time (Month to Month)', fontsize=16)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Polarity (Positive/Negative Ratio)', fontsize=12)
plt.xticks(rotation=45)
ax.legend()
ax.set_ylim(0.0, 1)

plt.tight_layout()
plt.show()


In [None]:
def _monthly_polarity(frame):
    """Return (label counts, positives, negatives, positive/negative ratio), all indexed by month."""
    counts = frame.groupby(['month', 'predicted_label']).size().unstack(fill_value=0)
    positives = counts.get('positive', 0)
    negatives = counts.get('negative', 0) + counts.get('extreme negative', 0)
    ratio = positives / negatives
    # Months without any negative rows are set to infinity.
    ratio[negatives == 0] = float('inf')
    return counts, positives, negatives, ratio


df['created_time'] = pd.to_datetime(df['created_time'])
df['month'] = df['created_time'].dt.to_period('M').astype(str)

# Split rows on the controversiality flag.
df_controversial = df[df['controversiality'] == 1]
df_non_controversial = df[df['controversiality'] == 0]

(sentiment_controversial,
 positive_controversial,
 negative_controversial,
 polarity_controversial) = _monthly_polarity(df_controversial)

(sentiment_non_controversial,
 positive_non_controversial,
 negative_non_controversial,
 polarity_non_controversial) = _monthly_polarity(df_non_controversial)

plt.figure(figsize=(12, 6))
ax = plt.gca()

for series, line_color, legend_text in (
    (polarity_controversial, 'red', 'Controversial (1)'),
    (polarity_non_controversial, 'blue', 'Non-Controversial (0)'),
):
    ax.plot(series.index, series.values, marker='o', linestyle='-', color=line_color, label=legend_text)

ax.set_title('Polarity Over Time: Controversial vs Non-Controversial Posts', fontsize=16)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Polarity (Positive/Negative Ratio)', fontsize=12)
plt.xticks(rotation=45)
ax.legend()
ax.set_ylim(0.0, 1)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

extreme_negative_data = df[df['predicted_label'] == 'extreme negative']

# Row counts per (month, subreddit): all rows vs. extreme-negative rows only.
group_keys = ['month', 'subreddit']
total_sentiments = df.groupby(group_keys).size()
extreme_negative_counts = extreme_negative_data.groupby(group_keys).size()

# Percentage of extreme-negative rows; groups with none become 0.
extreme_negative_proportions = (
    extreme_negative_counts.div(total_sentiments)
    .fillna(0)
    .mul(100)
    .reset_index()
)
extreme_negative_proportions.columns = ['Month', 'Subreddit', 'Proportion']

plt.figure(figsize=(12, 6))
ax = sns.lineplot(
    data=extreme_negative_proportions,
    x='Month',
    y='Proportion',
    hue='Subreddit',
    marker='o',
)

# Values are already percentages, so only a '%' suffix is appended.
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda y, _: f'{y}%'))

plt.title('Percentage of Extreme Negative Sentiment Over Time by Subreddit', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Percentage of Extreme Negative Sentiment', fontsize=12)
plt.xticks(rotation=45)
plt.legend(title='Subreddit', loc='upper right')
plt.tight_layout()
plt.show()


For my hypothesis:

In [None]:
# Boolean flag per row: was the prediction 'extreme negative'?
df['extreme_negative'] = df['predicted_label'].eq('extreme negative')

# Mean of the flag per (month, subreddit), i.e. the share of extreme-negative rows.
polarization_by_subreddit = (
    df.groupby(['month', 'subreddit'])['extreme_negative']
      .mean()
      .reset_index()
)

print(polarization_by_subreddit.head())



In [None]:
from scipy.stats import f_oneway

# One array of monthly extreme-negative shares per subreddit.
subreddit_groups = []
for name, group in polarization_by_subreddit.groupby('subreddit'):
    subreddit_groups.append(group['extreme_negative'].values)

# One-way ANOVA across the subreddit groups.
anova_result = f_oneway(*subreddit_groups)

print(f'ANOVA F-statistic: {anova_result.statistic}, p-value: {anova_result.pvalue}')


In [None]:
import statsmodels.api as sm
import pandas as pd

# Month start as seconds since the epoch. An explicit 64-bit cast avoids
# depending on the platform's default integer width.
df['month_numeric'] = pd.to_datetime(df['month']).astype('int64') / 10**9

X = sm.add_constant(df['month_numeric'])
# Regress on 0/1 rather than on a boolean column.
y = df['extreme_negative'].astype(int)

# Bound to its own name so the BERT classifier held in `model` is not
# overwritten by a regression result (re-running the prediction cell would
# otherwise hand the Trainer an OLS object).
trend_model = sm.OLS(y, X).fit()

print(trend_model.summary())


In [None]:
# 0/1 version of the extreme-negative flag for the formula interface.
df['extreme_negative_numeric'] = df['extreme_negative'].astype(int)

# Bound to its own name so the BERT classifier held in `model` is not overwritten.
subreddit_model = ols('extreme_negative_numeric ~ C(subreddit)', data=df).fit()

# ANOVA table (typ=2) for the subreddit factor.
anova_result = sm.stats.anova_lm(subreddit_model, typ=2)
print(anova_result)

# Pairwise subreddit comparisons at alpha = 0.05.
tukey_result = pairwise_tukeyhsd(df['extreme_negative_numeric'], df['subreddit'], alpha=0.05)
print(tukey_result)



In [None]:
# Columns now present on the working frame.
current_columns = df.columns
print(current_columns)

In [None]:
from scipy import stats


def _negative_share(month):
    """Sum of the 'extreme negative' and 'negative' proportions for one month's rows."""
    month_rows = sentiment_proportions[sentiment_proportions['month'] == month]
    return month_rows['extreme negative'] + month_rows['negative']


# NOTE(review): the variable names say October, but the filters select 2024-09 and 2023-11 — confirm the intended months.
oct24 = _negative_share('2024-09')
oct23 = _negative_share('2023-11')

# equal_var=False: the two groups are not assumed to share a variance.
t_stat, p_value = stats.ttest_ind(oct24, oct23, equal_var=False)

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")


In [None]:
from scipy import stats

# Positive-sentiment proportions for the two months being compared.
# NOTE(review): the variable names mention October, but the filters select 2024-09 and 2023-11 — confirm the intended months.
before_oct24 = sentiment_proportions[sentiment_proportions['month'] == '2024-09']['positive']
after_oct23 = sentiment_proportions[sentiment_proportions['month'] =='2023-11']['positive']

# equal_var=False matches the other two-sample tests in this notebook, which
# do not assume equal variances; the default here was the pooled-variance test.
t_stat, p_value = stats.ttest_ind(before_oct24, after_oct23, equal_var=False)

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

In [None]:
import matplotlib.pyplot as plt

# Controversial rows only, copied so the new columns do not touch df.
df_filtered = (
    df.loc[df['controversiality'] == 1]
      .copy()
      .assign(created_time=lambda frame: pd.to_datetime(frame['created_time']))
      .assign(month=lambda frame: frame['created_time'].dt.to_period('M').astype(str))
)

monthly_controversial_count = df_filtered.groupby('month').size()

plt.figure(figsize=(12, 6))
ax = monthly_controversial_count.plot(kind='line', marker='o', color='red')
ax.set_title('Count of Controversial Posts by Month', fontsize=16)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Count of Controversial Posts', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



In [None]:
from scipy import stats

df['created_time'] = pd.to_datetime(df['created_time'])

# NOTE(review): the two cutoffs are a full year apart (2023-12-01 vs 2024-12-01),
# so rows in between belong to neither group — confirm this is intended.
before_oct = df[df['created_time'] < '2023-12-01']
after_oct = df[df['created_time'] >= '2024-12-01']

# Report group sizes instead of dumping the whole frame into the cell output.
print(f"Rows before cutoff: {len(before_oct)}, rows after cutoff: {len(after_oct)}")

if len(before_oct) < 2 or len(after_oct) < 2:
    # The test is undefined for an empty or single-row group; make that explicit.
    t_stat = p_value = float('nan')
    print("Not enough rows in one of the groups to run the t-test.")
else:
    # equal_var=False: the two groups are not assumed to share a variance.
    t_stat, p_value = stats.ttest_ind(before_oct['controversiality'], after_oct['controversiality'], equal_var=False)

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")
