# Merge sentiment graphs
Not in script format yet

In [1]:
'''
    Outputs overall sentiment (with rounded polarity) and sentiment over time (frequency bins).
    Also computes overall average sentiment.
'''

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates # plot sentiment over time
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# progress bar
from tqdm import tqdm
tqdm.pandas()

In [2]:
# overall
NUM_SEGMENTS = 34 # decided on 34 segments for overall data
# Input/output files for overall data
DATA_IN = "../datain/sentiment/cleaned_tweets_for_sentiment.csv"
ROUNDED_POLARITY_OUT = "../dataout/sentiment/rounded_sentiment_overall.jpeg"
SENTIMENT_OVER_TIME_PER_SEGMENT_OUT = '../dataout/sentiment/sentiment_per_segment_overall.pdf'

In [3]:
def clean_sentiment_data(df):
    '''
        Load & clean data.

        Args:
            df: dataframe containing the sentiment data
        Returns:
            df: cleaned dataframe
    '''
    # remove all null created_at values from dataframe
    df = df.drop(df[df['created_at'].isnull()].index)
    df = df.drop(df[df['cleaned_tweet'].isnull()].index)
    # ensure that all values in created_at has 2021 (and not random strings)
    df = df[df['created_at'].str.contains("2021")]

    # split created_at into date and time columns
    # https://intellipaat.com/community/13909/python-how-can-i-split-a-column-with-both-date-and-time-e-g-2019-07-02-00-12-32-utc-into-two-separate-columns
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['date'] = df['created_at'].dt.date
    df['time'] = df['created_at'].dt.time

    return df


def sentiment_polarity_score(df, overall=True, selected_topic=0, filename=ROUNDED_POLARITY_OUT):
    '''
        Calculates the sentiment polarity score.

        Args:
            df: cleaned dataframe with tweet data
            overall: boolean (true if want to analyse overall data frequency, false if not)
            selected_topic: the topic number of the topic to be analysed.
            filename: path to the file to which this function will output to.
        Returns:
            df: dataframe with Vader sentiment polarity score columns added.
    '''
    analyzer = SentimentIntensityAnalyzer()

    # add polarity scores to df
    # https://github.com/sidneykung/twitter_hate_speech_detection/blob/master/preprocessing/VADER_sentiment.ipynb
    print(f"\t\tGetting sentiment polarity scores...")
    pol = lambda x: analyzer.polarity_scores(x)
    df['polarity'] = df["cleaned_tweet"].progress_apply(pol)

    # split polarity scores into separate columns
    print(f"\t\tSplitting polarity scores into columns...")
    df = pd.concat([df.drop(['polarity'], axis=1), df['polarity'].progress_apply(pd.Series)], axis=1)

    # get rounded polarity score
    round_pol = lambda x: calc_polarity(x, 0.05)
    # round polarity up/down
    df['rounded_polarity'] = df['compound'].apply(round_pol)

    # get amount of rounded negative, neutral, and positive polarity
    num_rounded_sentiments = df.groupby('rounded_polarity').count()
    plot_rounded_polarity(num_rounded_sentiments, overall, selected_topic, filename)

    return df


def calc_polarity(x, bound):
    '''
        Round polarity up/down based on bound.

        Args:
            x: 
            bound:
        Returns:
            int: -1 if x is less than -bound, 1 greater than bound, or 0
    '''
    if x < -bound:
        return -1
    elif x > bound:
        return 1
    else:
        return 0

def plot_rounded_polarity(num_rounded_sentiments, overall, selected_topic, filename):
    '''
        Plot rounded polariry.
        Called by sentiment_polarity_score().

        Args:
            num_rounded_sentiments: dataframe grouped by rounded polarity
            overall: boolean (true if want to analyse overall data frequency, false if not)
            selected_topic: the topic number of the topic to be analysed.
            filename: path to the file to which this function will output to.
    '''
    # plot rounded negative, neutral, and positive sentiment amounts
    plt.bar(num_rounded_sentiments.index, num_rounded_sentiments["compound"])
    if overall:
        plt.title('Overall Rounded Sentiment')
    else:
        plt.title(f'Topic {selected_topic} Rounded Sentiment')

    plt.xlabel('Polarity')
    plt.ylabel('Count')
    plt.savefig(filename)
    plt.close()


def split_data_segments(df, num_segments=NUM_SEGMENTS):
    '''
        Split data into segments according to date.

        Args:
            df: dataframe with Vader sentiment polarity score columns added.
            num_segments: number of equal segments that the data needs to be split into.
        Returns:
            df: sorted df by date
            sub_dfs: a list of subdataframes of df
            num_segments: number of equal segments that the data needs to be split into.
    '''
    # sort dataframe by date
    df = df.sort_values(by=['date', 'time'])
    # list of dfs
    sub_dfs = list(split(df, num_segments))
    return df, sub_dfs, num_segments


def split(df, n):
    '''
        Split df into n groups of equal length (returns list of sub dataframes).
        https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length

        Args:
            df: dataframe that should be split
            n: number of equal segments that the data needs to be split into.
        Retuns:
            sub dataframe according to df and n
    '''
    k, m = divmod(len(df), n)
    return (df[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))


def sentiment_per_segment(df, sub_dfs, num_segments, num_tweets_per_segment, overall=True, selected_topic=0, filename=SENTIMENT_OVER_TIME_PER_SEGMENT_OUT):
    '''
        Get average sentiment & plot sentiment over time.

        Args:
            df: sorted df by date
            sub_dfs: a list of subdataframes of df
            num_segments: number of equal segments that the data needs to be split into.
            num_tweets_per_segment: number of tweets per segment.
            overall: boolean (true if want to analyse overall data frequency, false if not)
            selected_topic: the topic number of the topic to be analysed.
            filename: path to the file to which this function will output to.
        Returns:
            avg_sentiment: the average sentiment over the entire timeperiod for the data.
    '''
    compounds = []
    mns, mxs = [], []
    dates = []
    for sub_df in sub_dfs:
        compounds.append(sub_df.compound.mean())
        mxs.append(sub_df.index.max())
        mns.append(sub_df.index.min())
        dates.append(sub_df.date.iloc[0])

    compound_df = pd.DataFrame(dict(
        mn=mns,
        mx=mxs,
        compouned=compounds,
        date=dates,
    ))
    
    return compound_df, num_segments, num_tweets_per_segment, overall, selected_topic, filename

#     plot_sentiment_over_time(compound_df, num_segments, num_tweets_per_segment, overall, selected_topic, filename)
    
#     # average overall sentiment
#     avg_sentiment = df['compound'].mean()
#     return avg_sentiment

# Get overall data

In [4]:
print("Applying overall sentiment analysis on segments over time...")
# load cleaned tweet corpus data
df = pd.read_csv(DATA_IN)
df = df.drop("Unnamed: 0", axis=1)

df = clean_sentiment_data(df)

df = sentiment_polarity_score(df)
# segments
df, sub_dfs, num_segments = split_data_segments(df)
num_tweets_per_segment = round(len(sub_dfs[0]) / 1000, 1)
overall_compound_df, overall_num_segments, overall_num_tweets_per_segment, overall, selected_topic, filename = sentiment_per_segment(df, sub_dfs, num_segments, num_tweets_per_segment)

Applying overall sentiment analysis on segments over time...
		Getting sentiment polarity scores...


100%|█████████████████████████████████| 406565/406565 [00:48<00:00, 8391.59it/s]


		Splitting polarity scores into columns...


100%|█████████████████████████████████| 406565/406565 [02:04<00:00, 3261.76it/s]


NameError: name 'avg_sentiment' is not defined

# Get largest topic data

In [16]:
SENTIMENT_DATA_IN_PREFIX = "../datain/sentiment/"
SENTIMENT_DATA_OUT_PREFIX = "../dataout/sentiment/"
BTM_SCORES_DATA_IN = "../BTM_topics/dataout/"
BTM_DATA_IN_PREFIX = "../datain/topic_modelling/"

In [17]:
def load_data(optimal_num_topics=11):
    '''
        Get data.

        Args:
            optimal_num_topics: optimal number of topics identified by the ElbowMethod (using the R BTM LogLik values)
        Returns
            df: loaded BTM scores dataframe
    '''
    filename = BTM_SCORES_DATA_IN + f"{optimal_num_topics}_model_scores.csv"
    df = pd.read_csv(filename)

    # change index to id
    df = df.rename({'Unnamed: 0': 'id'}, axis=1) # rename column
    df['id'] = df['id'].astype('int64')
    df.set_index("id", inplace = True)

    # rename column headers to integer representations
    for i in range(1, len(df.columns) + 1):
        colname = "V" + str(i)
        df = df.rename({colname: i}, axis=1)
    
    return df

def match_topic_with_tweet(df):
    '''
        Get the topic that a tweet is most likely part of based on the 
        probablity that they're in the topic.

        Args:
            df: loaded BTM scores dataframe
        Returns:
            df: df with a column indicating their most probable topic
    '''
    maxtopic = df
    # get the topic with the max probability value for each row
    maxtopic = maxtopic.idxmax(axis=1)
    # convert all topics from string ('15') to int (15). This prerpares it for grouping by topic
    maxtopic = maxtopic.astype(int)

    # add maxtopic as a new column
    df.insert(0, "maxtopic", maxtopic)

    # sort by maxtopic
    df = df.sort_values('maxtopic')

    return df

def sentiment_get_matching_topic_data(selected_topic):
    '''
        Get the subset of the topic modelling data from the cleaned sentiment data 
        (use the topic IDs to get the sentiment data matching those ids).

        Args:
            selected_topic: the topic number of the topic to be analysed.
        Returns:
            selected_topic_sentiment_df: subset of cleaned sentiment data that matches the selected topic's tweet ids.
    '''
    filename = SENTIMENT_DATA_IN_PREFIX + "cleaned_tweets_for_sentiment.csv"
    # load cleaned tweet corpus data
    cleaned_sentiment_df = pd.read_csv(filename)
    cleaned_sentiment_df = cleaned_sentiment_df.drop("Unnamed: 0", axis=1)

    # load topic ids
    filename = SENTIMENT_DATA_IN_PREFIX + f"ids_topic_{selected_topic}.csv"
    selected_topic_ids = pd.read_csv(filename)

    # subset sentiment data with topic ids
    selected_topic_sentiment_df = selected_topic_ids.merge(cleaned_sentiment_df, on='id', how='left')

    # export selected topic sentiment to csv
    filename = BTM_DATA_IN_PREFIX + f"tweet_sentiment_subdf_topic_{selected_topic}.csv"
    selected_topic_sentiment_df.to_csv(filename)

    return selected_topic_sentiment_df

In [33]:
NUM_SEGMENTS = 40
optimal_num_topics = 11
df = load_data(optimal_num_topics)
df = match_topic_with_tweet(df)

In [9]:
selected_topic = 11

In [35]:
print("\tGetting topic sentiment...")
# sentiment analysis
df = sentiment_get_matching_topic_data(selected_topic)
df = clean_sentiment_data(df)
filename = SENTIMENT_DATA_OUT_PREFIX + f"rounded_sentiment_topic_{selected_topic}.pdf"

df = sentiment_polarity_score(df, False, selected_topic, filename)
# segments
df, sub_dfs, num_segments = split_data_segments(df, NUM_SEGMENTS)
num_tweets_per_segment = round(len(sub_dfs[0]) / 1000, 1)
filename = SENTIMENT_DATA_OUT_PREFIX + f"sentiment_per_segment_topic_{selected_topic}.pdf"
lt_compound_df, lt_num_segments, lt_num_tweets_per_segment, overall, selected_topic, filename = sentiment_per_segment(df, sub_dfs, num_segments, num_tweets_per_segment, False, selected_topic, filename)

	Getting topic sentiment...
		Getting sentiment polarity scores...


100%|██████████████████████████████████████████████████████| 100879/100879 [00:14<00:00, 6995.00it/s]


		Splitting polarity scores into columns...


100%|██████████████████████████████████████████████████████| 100879/100879 [00:31<00:00, 3159.14it/s]


# Plot

In [68]:
def plot_sentiment_over_time(overall_compound_df, overall_num_segments, overall_num_tweets_per_segment, lt_compound_df, lt_num_segments, lt_num_tweets_per_segment, selected_topic, filename):
    '''
        Plot sentiment over time.
    '''
    fig, ax = plt.subplots()
    ax.plot(overall_compound_df.date, 'compouned', data=overall_compound_df, label="Overall")
    ax.plot(lt_compound_df.date, 'compouned', data=lt_compound_df, label=f"Topic {selected_topic}")

    # Major ticks every month.
    fmt_month = mdates.MonthLocator()
    ax.xaxis.set_major_locator(fmt_month)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))

    # plot
    plt.title(f'Overall and Topic {selected_topic} Sentiment per segment')
    
    plt.legend(loc="lower left")
    plt.xlabel('Date')
    plt.ylabel('Vader Sentiment score')
    
    plt.figtext(0.14,-0.05,f'Overall: {overall_num_segments} segments of ~{overall_num_tweets_per_segment}k, Topic {selected_topic}: {lt_num_segments} segments of ~{lt_num_tweets_per_segment}k')
    # save graph
    plt.savefig(filename)
    plt.close()

In [70]:
filename = SENTIMENT_DATA_OUT_PREFIX + f"sentiment_per_segment_topic_overall_{selected_topic}.pdf"


In [71]:
plot_sentiment_over_time(overall_compound_df, overall_num_segments, overall_num_tweets_per_segment, lt_compound_df, lt_num_segments, lt_num_tweets_per_segment, selected_topic, filename)