In [22]:
# Reads the files with the results of the sentiment analysis and performs a basic data investigation

In [23]:
import numpy as np
import pandas as pd

import matplotlib as mpl
mpl.rcParams['svg.fonttype'] = 'none'

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter


### Data preparation

In [24]:
# A Colab error saved some of the reply ids in scientific notation, which prevents
# merging the classified replies with the original tweets. The dataframe of replies
# written before the language-model step can be joined with the classified one
# instead, giving back the correct ids needed to reach the original tweets.
buggy_df = pd.read_csv("../../output/mvp/13.sentiment_analysis_dfs/protests.csv")

# The ids of this file are parsed as strings so they can be joined back later.
nice_df = pd.read_csv(
    "../../output/mvp/8.dfs_for_sentiment_analysis/protest_tweets/protest-tweets.csv",
    sep="|",
    dtype=dict.fromkeys(['tweet_id', 'conversation_id', 'in_reply_to'], str),
)

In [25]:
# Inner join on 'url'. 'nice_df' loses some entries here because non-English tweets
# were filtered out; we end up with the 53073 tweets for which there is
# classification data.
df = pd.merge(buggy_df, nice_df, on='url')

In [26]:
# Get rid of the redundant columns that came from the buggy df. From that df we only
# want the classification, which carries no suffix.
# The merge suffix is matched at the END of the name only: a substring test would
# also drop any column that merely contains '_x' somewhere in its name.
df = df.drop(columns=[column for column in df.columns if column.endswith('_x')])

In [27]:
# Rename the remaining columns to take away the '_y' merge suffix.
# Only names that END in '_y' are touched: a substring test would also chop the last
# two characters off any column that merely contains '_y' in the middle of its name.
df = df.rename(columns={
    label: label[:-len("_y")] for label in df.columns if label.endswith("_y")
})

In [28]:
# Discard the leftover bookkeeping columns: every column whose name contains
# 'Unnamed', 'level_0' or 'index'.
df = df.drop(columns=[
    column for column in df.columns
    if any(marker in column for marker in ('Unnamed', 'level_0', 'index'))
])

In [29]:
# Original tweet data, to be merged back with the classified replies.
# The id columns are parsed as strings.
tweets = pd.read_csv(
    "../../output/mvp/5.merged_dataframes/keywords.csv",
    sep="|",
    dtype=dict.fromkeys(["conversation_id", "tweet_id"], str),
)

# Keep only the top 1% (rows whose percentile_for_total_engagement is at most 1)
tweets = tweets.loc[tweets["percentile_for_total_engagement"].le(1)]

In [30]:
# The entirety of the classified data for climate protests: every classified reply,
# left-joined to the original tweet through in_reply_to -> conversation_id.
protest_tweets = pd.merge(
    df, tweets, how='left', left_on='in_reply_to', right_on='conversation_id'
)

In [31]:
# Drop the rows classified as false positives ('fp')
protest_tweets = protest_tweets.loc[protest_tweets['classification'].ne('fp')]

In [32]:
# Twitter, in general, is also very negative, so we want to have a control group.
# We perform the same analysis on it. This time there is no need to link back to the
# original reply files, since this was made with the replies from a random sample of
# all tweets.
# NOTE(review): unlike the protest data, the ids here are read with the default dtype;
# confirm that 'in_reply_to' and 'tweet_id' keep full precision in both files.
control = pd.read_csv("../../output/mvp/13.sentiment_analysis_dfs/control.csv")
no_keywords = pd.read_csv("../../output/mvp/5.merged_dataframes/no-keywords.csv", sep="|")
control = (
    control
    .merge(no_keywords, left_on='in_reply_to', right_on='tweet_id')
    .assign(datetime=lambda merged: pd.to_datetime(merged.datetime))
    .set_index('datetime')
)

#### Analysis

Is there a significant difference between the total negativity for disruptive and non-disruptive protests?

In [33]:
# Share of each sentiment label over all replies to climate tweets
protest_tweets['sentiment_label'].value_counts(normalize=True)

NEGATIVE    0.704697
POSITIVE    0.295303
Name: sentiment_label, dtype: float64

In [34]:
# Sentiment shares per classification group. The groups look alike (no statistical
# test is run here): it's all really negative.
protest_tweets.groupby("classification")['sentiment_label'].value_counts(normalize=True)

classification  sentiment_label
d               NEGATIVE           0.726428
                POSITIVE           0.273572
nd              NEGATIVE           0.698069
                POSITIVE           0.301931
Name: sentiment_label, dtype: float64

In [35]:
# Same shares for the control group, which also looks similar.
control['sentiment_label'].value_counts(normalize=True)

NEGATIVE    0.72431
POSITIVE    0.27569
Name: sentiment_label, dtype: float64

Looking at specific subgroups also doesn't change the picture a lot.

In [36]:
# Replies to tweets that mention greta.
# na=False: the left merge can leave raw_content_y missing for replies without a
# matching original tweet; those rows count as "no match" instead of putting NaN
# into the boolean mask (which pandas refuses to index with).
(
    protest_tweets
    .loc[protest_tweets.raw_content_y.str.lower().str.contains("greta|thunberg", na=False)]
    .sentiment_label
    .value_counts(normalize=True)
)



NEGATIVE    0.696345
POSITIVE    0.303655
Name: sentiment_label, dtype: float64

In [37]:
# Replies to tweets that don't mention greta.
# na=False: a missing raw_content_y (possible after the left merge) is treated as
# "no mention", so the negated mask stays purely boolean; with NaN in it, `~` and the
# row selection would fail.
(
    protest_tweets
    .loc[~protest_tweets.raw_content_y.str.lower().str.contains("greta|thunberg", na=False)]
    .sentiment_label
    .value_counts(normalize=True)
)


NEGATIVE    0.723638
POSITIVE    0.276362
Name: sentiment_label, dtype: float64

In [38]:
# Replies to tweets that mention van gogh – seems higher!
# na=False keeps the mask boolean when raw_content_y is missing for a reply.
(
    protest_tweets
    .loc[protest_tweets.raw_content_y.str.lower().str.contains("van gogh", na=False)]
    .sentiment_label
    .value_counts(normalize=True)
)

NEGATIVE    0.780622
POSITIVE    0.219378
Name: sentiment_label, dtype: float64

In [39]:
# Which original tweets are behind those replies? They are replies to the three
# tweets about the Sunflowers painting.
# na=False keeps the mask boolean when raw_content_y is missing for a reply.
(
    protest_tweets
    .loc[protest_tweets.raw_content_y.str.lower().str.contains("van gogh", na=False)]
    .url_y
    .value_counts()
)

https://twitter.com/AJEnglish/status/1580906271452151809    284
https://twitter.com/Reuters/status/1580954080729522179      225
https://twitter.com/Telegraph/status/1621786133716926465     38
Name: url_y, dtype: int64

In [40]:
# Same check with the pattern widened to also match "museum", to see whether any
# other original tweet shows up.
# na=False keeps the mask boolean when raw_content_y is missing for a reply.
(
    protest_tweets
    .loc[protest_tweets.raw_content_y.str.lower().str.contains("van gogh|museum", na=False)]
    .url_y
    .value_counts()
)

https://twitter.com/AJEnglish/status/1580906271452151809    284
https://twitter.com/Reuters/status/1580954080729522179      225
https://twitter.com/Telegraph/status/1621786133716926465     38
Name: url_y, dtype: int64

In [41]:
# Replies to tweets that mention extinction rebellion – oh that's interesting, why is it that good?
# na=False keeps the mask boolean when raw_content_y is missing for a reply.
(
    protest_tweets
    .loc[protest_tweets.raw_content_y.str.lower().str.contains("extinction rebellion", na=False)]
    .sentiment_label
    .value_counts(normalize=True)
)

POSITIVE    0.508772
NEGATIVE    0.491228
Name: sentiment_label, dtype: float64

In [42]:
# Well, they are replies to one single tweet that talks about a journalist being arrested.
# People are happy with the arrest, it seems.
# na=False keeps the mask boolean when raw_content_y is missing for a reply.
(
    protest_tweets
    .loc[protest_tweets.raw_content_y.str.lower().str.contains("extinction rebellion", na=False)]
    .url_y
    .value_counts()
)

https://twitter.com/guardian/status/1184548304341872641    57
Name: url_y, dtype: int64

As a final try, let's see which original tweets drove the most positive and negative responses.

In [None]:
# Share of NEGATIVE replies per original tweet (url_y), joined with the tweet data.
#
# The share is computed as the mean of a boolean flag instead of slicing
# value_counts(normalize=True) by 'NEGATIVE', because:
#   * a tweet whose replies are all POSITIVE has no 'NEGATIVE' entry in value_counts
#     and would silently vanish instead of showing up with a share of 0;
#   * the name of the value_counts result differs between pandas versions, so renaming
#     'sentiment_label' afterwards does not reliably create 'negative_replies'.
# Replies without a sentiment label are left out of the denominator, as value_counts did.
counts = (
    protest_tweets
    .dropna(subset=['sentiment_label'])
    .assign(negative_replies=lambda replies: replies['sentiment_label'].eq('NEGATIVE'))
    .groupby('url_y')['negative_replies']
    .mean()
    .reset_index()
    .merge(tweets, left_on='url_y', right_on='url')
)

In [None]:
# Share of NEGATIVE replies per original tweet (url_y) in the control group.
#
# The share is computed as the mean of a boolean flag instead of slicing
# value_counts(normalize=True) by 'NEGATIVE', because:
#   * a tweet whose replies are all POSITIVE has no 'NEGATIVE' entry in value_counts
#     and would silently vanish instead of showing up with a share of 0;
#   * the name of the value_counts result differs between pandas versions, so renaming
#     'sentiment_label' afterwards does not reliably create 'negative_replies'.
# Replies without a sentiment label are left out of the denominator, as value_counts did.
control_counts = (
    control
    .dropna(subset=['sentiment_label'])
    .assign(negative_replies=lambda replies: replies['sentiment_label'].eq('NEGATIVE'))
    .groupby('url_y')['negative_replies']
    .mean()
    .reset_index()
)
# Label every row so the control group can sit next to the 'd' / 'nd' groups.
control_counts["classification"] = "control"

#### Dataviz

In [None]:
# NOTE(review): `data` is not defined in any cell visible here. It must provide the
# columns 'classification' (values 'control', 'd', 'nd') and 'negative_replies' —
# presumably the concatenation of `counts` and `control_counts`; confirm and define
# it in a cell above so the notebook survives Restart & Run All.

# Initialize the matplotlib figure (320 x 380 px at 80 dpi)
dpi = 80
fig_width = 320 / dpi
fig_height = 380 / dpi
fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=dpi)

# One colour per group, in the order given by `group_order`
palette = ["#96be00", "#eb6e14", "#00a6ff"]
group_order = ["control", "d", "nd"]

median_d = data[data.classification=='d'].negative_replies.median()
median_nd = data[data.classification=='nd'].negative_replies.median()
median_control = data[data.classification=='control'].negative_replies.median()

# `order` / `hue_order` pin which group gets which palette entry. Without them the
# colours follow the row order of `data` and may not match the median lines below,
# which hard-code palette[0] = control, palette[1] = d, palette[2] = nd.
sns.swarmplot(
    data=data,
    x='negative_replies',
    y='classification',
    hue='classification',
    order=group_order,
    hue_order=group_order,
    legend=False,
    palette=palette,
    alpha=.7,
    ax=ax,
)

# Dashed vertical line at each group's median, in the group's colour
ax.axvline(median_control, color=palette[0], linestyle='--', label='Control median')
ax.axvline(median_d, color=palette[1], linestyle='--', label='Disruptive median')
ax.axvline(median_nd, color=palette[2], linestyle='--', label='Non-disruptive median')

# Remove the spines for a cleaner look
sns.despine(left=True, bottom=True)

# Define a function to format the ticks as percentages
def to_percent(x, _):
    """Format a fraction (e.g. 0.2) as a whole-number percentage string ('20%')."""
    return f"{100* x:.0f}%"

# Set the x-axis ticks to display percentages
ax.xaxis.set_major_formatter(FuncFormatter(to_percent))

# Set tick locations every 20%, from 0% to 100%
ax.set_xticks([0, .2, .4, .6, .8, 1])

# Background color
fig.set_facecolor('#f1f3f5')
ax.set_facecolor('#f1f3f5')

# Saves
plt.savefig("../../ai/swarmplot-sentiment.svg", format='svg', dpi=300, bbox_inches='tight', facecolor='#f1f3f5')

# Show the plot
plt.show()

In [None]:
### What are the median values?
# The first three medians come from the plotting cell; the last one pools both kinds
# of climate protest ('d' and 'nd') together.
print("Not climate", median_control)
print("Disruptive", median_d)
print("Non-disruptive", median_nd)
print(
    "Both kinds of climate protest",
    data.loc[data["classification"].isin(["d", "nd"]), "negative_replies"].median(),
)

In [None]:
# Example tweets: non-disruptive ('nd') tweets whose share of negative replies rounds
# to 66% or 67%.
# The share is scaled to percent BEFORE rounding: round(2) * 100 can land on values
# such as 56.99999999999999, which makes an exact isin() comparison brittle, whereas
# round() after scaling always yields whole numbers.
in_percent_range = data.negative_replies.mul(100).round().isin(range(66, 68))
for url in data.loc[in_percent_range & (data.classification == 'nd'), 'url_y']:
    print(url)

#### Finding examples

In [None]:
# Climate protest tweets ordered from the highest share of negative replies down
counts.sort_values('negative_replies', ascending=False)

In [None]:
# Climate protest tweets ordered from the lowest share of negative replies up
counts.sort_values('negative_replies')