# This notebook takes a processed DataFrame and evaluates each author's change in emotion scores over time

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from tqdm import tqdm

In [17]:
def get_all_authors(grouped):
    """Return the distinct values of the 'author' column of `grouped` as a list."""
    author_column = grouped['author']
    distinct_authors = author_column.unique()
    return list(distinct_authors)

def get_slope(X, y):
    """
    Fit a linear model to time-series data consisting of time and emotion scores.

    A positive slope means the score rises as X increases; a negative slope
    means it falls.

    Parameters
    ----------
    X : array-like
        Time values, one per observation. They are cast to float and reshaped
        into a single-feature column.
    y : array-like
        Scores, one per entry of X. They are cast to float.

    Returns
    -------
    float
        The coefficient of the fitted single-feature linear regression.

    NOTE(review): the slope is expressed per unit of whatever the float cast
    of X produces. When X is a pandas datetime column this appears to be
    epoch nanoseconds, which would explain the ~1e-18 magnitudes seen in the
    notebook output -- confirm whether X should be rescaled (e.g. to days).
    """
    # scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
    X = np.array(X, dtype=float).reshape(-1, 1)
    y = np.array(y, dtype=float)
    lm = linear_model.LinearRegression()
    lm.fit(X, y)
    # coef_ is a one-element array here; .item() extracts the scalar without
    # relying on the deprecated implicit array-to-float conversion.
    return float(lm.coef_.item())

def get_aggregated_df(agg_cols, df=None):
    """
    Group a DataFrame by author and day of posting, aggregated by the mean of
    each requested column within every (author, day) group.

    Parameters
    ----------
    agg_cols : iterable of str
        Names of the columns to average. Names need not be valid Python
        identifiers.
    df : pandas.DataFrame, optional
        Frame to aggregate. It must have an 'author' column and an 'isodate'
        column that can be binned with a daily frequency. When omitted, the
        notebook-level ``df_subreddit`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per (author, day) group with the columns 'author', 'isodate'
        and one mean column per entry of `agg_cols`.

    Raises
    ------
    ValueError
        If `agg_cols` is empty.
    """
    agg_cols = list(agg_cols)
    if not agg_cols:
        raise ValueError("agg_cols must name at least one column to aggregate")
    if df is None:
        # Fall back to the frame loaded at notebook level.
        df = df_subreddit

    # Named aggregation passed as an unpacked dict: output column `col` is the
    # mean of input column `col`. Unlike building source text for eval(), this
    # works for any column name.
    named_means = {col: (col, "mean") for col in agg_cols}
    by_author_and_day = df.groupby(
        by=['author', pd.Grouper(key='isodate', freq='D')]
    )
    return by_author_and_day.agg(**named_means).reset_index()


def get_authors_change_data(columns, grouped):
    """
    For every author, get the change in emotion (slope) of each column.

    Parameters
    ----------
    columns : iterable of str
        Names of the score columns in `grouped` to fit a slope for.
    grouped : pandas.DataFrame
        Frame with an 'author' column, an 'isodate' column used as the time
        axis, and every column named in `columns`.

    Returns
    -------
    pandas.DataFrame
        One row per distinct author, with an 'author' column followed by one
        column per entry of `columns` holding the value returned by
        `get_slope` for that author's rows.
    """
    authors_list = []

    for author in tqdm(get_all_authors(grouped)):
        author_df = grouped[grouped['author'] == author]
        # The same time axis is reused for every score column of this author.
        X = author_df['isodate']

        authors_dict = {'author': author}
        for col in columns:
            authors_dict[col] = get_slope(X, author_df[col])

        authors_list.append(authors_dict)

    # A list of per-author records maps directly onto the DataFrame constructor.
    return pd.DataFrame(authors_list)

In [44]:
"""
Read the processed posts CSV, parse the timestamps, and keep only CPTSD posts
"""
subreddit_name = "Nietzsche"
file_name = f"{subreddit_name}/{subreddit_name}_mentalhealth_common_author_posts"

df_subreddit = pd.read_csv(f'{file_name}_processed.csv')
df_subreddit = df_subreddit.assign(isodate=pd.to_datetime(df_subreddit["isodate"]))
df_subreddit = df_subreddit.drop(columns=['index'])
# The shape is reported before the subreddit filter is applied.
print(df_subreddit.shape)
is_cptsd = df_subreddit['subreddit'] == 'CPTSD'
df_subreddit = df_subreddit[is_cptsd]

(25183, 213)


In [45]:
"""
Compute every author's slope for each score column
"""

# Every column from position 6 onward is treated as a score column
# (presumably the first six hold post metadata -- verify against the CSV).
columns = df_subreddit.columns[6:].tolist()
grouped = get_aggregated_df(columns)
df_change_data = get_authors_change_data(columns, grouped)
df_change_data.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:07<00:00,  3.55it/s]


Unnamed: 0,author,Polarity,Subjectivity,emp_help,emp_office,emp_dance,emp_money,emp_wedding,emp_domestic_work,emp_sleep,...,nrc_anger,nrc_anticip,nrc_trust,nrc_surprise,nrc_positive,nrc_negative,nrc_sadness,nrc_disgust,nrc_joy,nrc_anticipation
0,Apprehensive-Net6834,-4.7555950000000005e-18,-1.006315e-17,-8.084147999999999e-19,0.0,0.0,0.0,-1.61683e-18,5.0005039999999996e-20,8.584197999999999e-19,...,1.417948e-18,0.0,-4.072829e-18,1.749808e-18,-1.448117e-18,7.874135e-18,-1.870484e-18,-3.3186009999999995e-19,-1.870484e-18,7.542275e-19
1,Aristocrated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FairInvestigator,-9.380323e-18,-5.480358e-18,-1.0463810000000001e-18,7.956084999999999e-19,-3.16617e-18,7.956084999999999e-19,1.869016e-18,-1.0978200000000001e-18,-8.452127999999999e-19,...,5.303107e-18,0.0,-1.6341260000000002e-17,1.2594e-18,-1.773811e-17,1.2271320000000001e-17,9.229724e-18,4.926383e-18,-1.0122970000000002e-17,3.895289e-18
3,Funny-Frame-4238,-8.048736e-18,4.393959e-18,-4.0217419999999996e-19,-1.749439e-19,0.0,-1.1662929999999998e-19,-8.103766999999999e-19,-1.195122e-18,-1.9237269999999998e-19,...,3.18926e-18,0.0,-2.303291e-18,-2.364261e-20,-5.29447e-18,4.522114e-18,2.23432e-18,-7.500889999999999e-19,-1.084393e-18,-3.8158959999999996e-19
4,Jakesteroz,3.476241e-16,-7.911247e-16,1.286008e-16,0.0,-9.645062e-17,0.0,0.0,1.286008e-16,0.0,...,-4.902906e-16,0.0,4.902906e-16,-2.371078e-16,4.902906e-16,-2.491641e-16,7.032858000000001e-17,-2.371078e-16,2.411265e-16,3.215021e-16


In [46]:
"""
Summary statistics of the slopes; the mean row shows whether the average change is +ve or -ve
"""
summary_stats = df_change_data.describe()
print(summary_stats)

           Polarity  Subjectivity      emp_help    emp_office     emp_dance  \
count  2.800000e+01  2.800000e+01  2.800000e+01  2.800000e+01  2.800000e+01   
mean   1.180155e-16 -2.205625e-17  9.738577e-18  3.681924e-19 -3.592779e-18   
std    6.597535e-16  1.914350e-16  3.194905e-17  2.721385e-18  1.820892e-17   
min   -4.263083e-16 -7.911247e-16 -1.046381e-18 -2.583386e-18 -9.645062e-17   
25%   -8.436293e-19 -5.453752e-18  0.000000e+00  0.000000e+00  0.000000e+00   
50%    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
75%    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
max    3.439990e-15  5.273454e-16  1.286008e-16  1.386117e-17  0.000000e+00   

          emp_money   emp_wedding  emp_domestic_work     emp_sleep  \
count  2.800000e+01  2.800000e+01       2.800000e+01  2.800000e+01   
mean   4.839206e-19  5.507134e-19       5.413219e-18  1.143203e-18   
std    1.642556e-18  2.762549e-18       2.616212e-17  4.226192e-18   
min   -2

In [47]:
"""
Box plot of the slopes for the NRC emotion columns
"""
# The last 11 columns are plotted; the tick labels in the cell output show
# these are the nrc_* columns.
nrc_changes = df_change_data.iloc[:, -11:]
nrc_changes.boxplot(showmeans=True, showfliers=False)
plt.xticks(rotation=90)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  1,  2,  3,  4,  5,  6,
         7,  8,  9, 10, 11,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  1,
         2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  1,  2,  3,  4,  5,  6,  7,
         8,  9, 10, 11,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  1,  2,
         3,  4,  5,  6,  7,  8,  9, 10, 11]),
 [Text(1, 0, 'nrc_fear'),
  Text(2, 0, 'nrc_anger'),
  Text(3, 0, 'nrc_anticip'),
  Text(4, 0, 'nrc_trust'),
  Text(5, 0, 'nrc_surprise'),
  Text(6, 0, 'nrc_positive'),
  Text(7, 0, 'nrc_negative'),
  Text(8, 0, 'nrc_sadness'),
  Text(9, 0, 'nrc_disgust'),
  Text(10, 0, 'nrc_joy'),
  Text(11, 0, 'nrc_anticipation'),
  Text(1, 0, 'nrc_fear'),
  Text(2, 0, 'nrc_anger'),
  Text(3, 0, 'nrc_anticip'),
  Text(4, 0, 'nrc_trust'),
  Text(5, 0, 'nrc_surprise'),
  Text(6, 0, 'nrc_positive'),
  Text(7, 0, 'nrc_negative'),
  Text(8, 0, 'nrc_sadness'),
  Text(9, 0, 'nrc_disgust'),
  Text(10, 0, 'nrc_joy'),
  Text(11, 0, 'nrc_anticipation'),
  Text(1, 0, '

In [48]:
"""
Write the per-author slopes to a CSV file
"""

output_path = f'{file_name}_CPTSD_evaluated.csv'
df_change_data.to_csv(output_path, index=False)