# Compute general stats

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
sns.set_theme()

In [2]:
# Directory prefix for every input CSV; file names are concatenated onto it
# directly, so it must end with a path separator.
data_path = "./"

## YELP

In [3]:
# Read the three Yelp CSV inputs from data_path; they are joined on
# review_id in the following cell.
df = pd.read_csv(data_path + "yelp_doc_senti_true.csv")
df_sent_tmp = pd.read_csv(data_path + "yelp_sent_sentiment_tmp_0609.csv")
df_valid_ids = pd.read_csv(
    data_path + "yelp_id_text_label.csv",
    encoding='utf8',
)

In [4]:

# Join the sentence-sentiment table onto the document table, then left-join
# that onto df_valid_ids so every row of df_valid_ids is kept.
df = df.merge(df_sent_tmp, how='left', on='review_id')
df = df_valid_ids.merge(df, how='left', on='review_id')

# label + 1, stored before label is re-coded
# (presumably the 1-5 star rating -- TODO confirm against the data source).
df['star_s'] = df['label'] + 1

# Re-code label values 0..4 onto -1..1, then scale by 10.
mapping = {0: -1, 1: -0.5, 2: 0, 3: 0.5, 4: 1}
df['label'] = df['label'].replace(mapping) * 10

# Absolute gap between the scaled label and each aggregate score, followed by
# the class: 'C2' when the peak-end gap is strictly smaller, otherwise 'C1'
# (ties and comparisons involving NaN therefore land in 'C1').
df = df.assign(
    label_peak_end_avg_abs_diff=lambda d: (d['label'] - d['peak_end_avg']).abs(),
    label_all_sent_avg_abs_diff=lambda d: (d['label'] - d['all_sent_avg']).abs(),
    pred_class=lambda d: np.where(
        d['label_peak_end_avg_abs_diff'] < d['label_all_sent_avg_abs_diff'],
        'C2',
        'C1',
    ),
)


In [5]:
# Export review_id and pred_class for index labels up to and including 999
# (.loc slicing is label-based and inclusive at both ends).
subset_columns = ['review_id', 'pred_class']
df_subsets = df.loc[:999, subset_columns]
df_subsets.to_csv("test_1k_subsets.csv", index=False)

### Num datapoints

In [6]:
# Number of Yelp reviews assigned to each class.
df['pred_class'].value_counts()

pred_class
C1    19557
C2    15294
Name: count, dtype: int64

In [7]:
# Per-sentence table for Yelp; used here only to derive per-review length statistics.
sentences = pd.read_csv(data_path + "yelp_sent_senti_pred_0609.csv")

# Drop rows whose sentence_id is 0 and rows with missing text in a single pass.
# .copy() yields an independent frame, so adding a column below does not
# trigger SettingWithCopyWarning.
has_content = (sentences['sentence_id'] != 0) & sentences['sentence_text'].notna()
sentences = sentences.loc[has_content].copy()

# Whitespace-delimited token count of each sentence.
sentences['num_words'] = sentences['sentence_text'].str.split().str.len()

# Per review: number of sentences and mean words per sentence.
# The aggregator is given as the string 'mean' because passing np.mean to
# groupby.agg emits a FutureWarning in recent pandas releases.
num_sentences = (
    sentences.groupby('review_id')
    .agg({'sentence_id': 'count', 'num_words': 'mean'})
    .reset_index()
)

# Left join keeps every review in df; reviews without sentences get NaN.
df = df.merge(num_sentences, on='review_id', how='left')

In [8]:
def get_unique_words(text):
    """Return the set of distinct lower-cased word tokens found in ``text``.

    A token is a maximal run of regex word characters delimited by word
    boundaries, so punctuation and whitespace are discarded.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, or the float NaN that pandas
        uses for a missing cell) is treated as empty instead of raising.

    Returns
    -------
    set of str
        Unique tokens; an empty set for empty or non-string input.
    """
    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return set()
    return set(re.findall(r'\b\w+\b', text.lower()))

def unique_vocab(classes=('C1', 'C2'), data=None):
    """Count the distinct words used across the reviews of the given classes.

    Parameters
    ----------
    classes : list-like of str
        ``pred_class`` values whose rows are included.
    data : pandas.DataFrame, optional
        Frame providing ``pred_class`` and ``review_text`` columns. When
        omitted, the notebook-level Yelp frame ``df`` is used (looked up at
        call time), which matches the previous behaviour.

    Returns
    -------
    int
        Size of the union of the per-review word sets.
    """
    if data is None:
        data = df
    # Only the text column is needed, so iterate it directly rather than
    # materialising every row with iterrows().
    texts = data.loc[data['pred_class'].isin(classes), 'review_text']
    unique_words = set()
    for text in texts:
        unique_words.update(get_unique_words(text))
    return len(unique_words)

### unique vocab

In [9]:
# Vocabulary size for the whole dataset and for each class separately.
for title, selected in (
    ("All dataset: ", ['C1', 'C2']),
    ("C1 dataset: ", ['C1']),
    ("C2 dataset: ", ['C2']),
):
    print(title, unique_vocab(classes=selected))

All dataset:  64864
C1 dataset:  48889
C2 dataset:  44826


### General lambdas

In [10]:
# Column means over all Yelp reviews: both label gaps, star_s, sentences per
# review (count stored in sentence_id) and mean words per sentence.
summary_columns = [
    'label_peak_end_avg_abs_diff',
    'label_all_sent_avg_abs_diff',
    'star_s',
    'sentence_id',
    'num_words',
]
df[summary_columns].mean()

label_peak_end_avg_abs_diff     4.484211
label_all_sent_avg_abs_diff     3.784661
star_s                          2.932742
sentence_id                    11.108031
num_words                      15.525584
dtype: float64

### Lambdas by group

In [11]:
# Same summary columns, averaged within each pred_class and rounded to 2 decimals.
group_columns = [
    'label_peak_end_avg_abs_diff',
    'label_all_sent_avg_abs_diff',
    'star_s',
    'sentence_id',
    'num_words',
]
df.groupby('pred_class')[group_columns].mean().round(2)

Unnamed: 0_level_0,label_peak_end_avg_abs_diff,label_all_sent_avg_abs_diff,star_s,sentence_id,num_words
pred_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1,6.05,2.97,2.74,11.3,15.55
C2,2.48,4.83,3.18,10.87,15.49


### PLOT

In [None]:

# Scatter of the two label gaps for Yelp, one point per review.
fig, ax = plt.subplots(figsize=(7, 5), dpi=100)
ax.scatter(
    df.label_all_sent_avg_abs_diff,
    df.label_peak_end_avg_abs_diff,
    alpha=0.09,
    c=df.pred_class,  # the 'C1'/'C2' strings are themselves Matplotlib cycle-colour names
    edgecolors='none',
)
ax.set_xlabel("Value of λ1")
ax.set_ylabel("Value of λ2")
# Save before show() so the written file contains the drawn figure.
fig.savefig('emotion_arc_groups_v2.pdf', format='pdf', bbox_inches='tight', dpi=100)
# Optional fixed axis limits (disabled): ax.set_ylim(0, 50); ax.set_xlim(0, 50)
plt.show()

## AMAZON

In [12]:
# Read the three Amazon CSV inputs from data_path.
amazon = pd.read_csv(data_path + "amazon_doc_senti_true.csv")
am_sent = pd.read_csv(data_path + "amazon_sent_senti_pred.csv")
am_sent_tmp = pd.read_csv(data_path + "amazon_sent_sentiment_tmp.csv")

# The document table gets its review_id from the row index: expose the index
# as a column, keep the four columns used later, then rename 'index'.
amazon = amazon.reset_index()[['index', 'review_text', 'true_label', 'stars']]
amazon = amazon.rename(columns={'index': 'review_id'})

In [13]:
# Attach the sentence-sentiment table to each review (left join keeps all reviews).
amazon = amazon.merge(am_sent_tmp, how='left', on='review_id')

# Preserve the unshifted rating in stars_s, then shift stars down by one so
# its values line up with the mapping keys below.
amazon['stars_s'] = amazon['stars']
amazon['stars'] = amazon['stars'] - 1

# Re-code 0..4 onto -1..1, then scale by 10.
mapping = {0: -1, 1: -0.5, 2: 0, 3: 0.5, 4: 1}
amazon['label'] = amazon['stars'].replace(mapping) * 10

# Absolute gap between the scaled label and each aggregate score, followed by
# the class: 'C2' when the peak-end gap is strictly smaller, otherwise 'C1'
# (ties and comparisons involving NaN therefore land in 'C1').
amazon = amazon.assign(
    label_peak_end_avg_abs_diff=lambda d: (d['label'] - d['peak_end_avg']).abs(),
    label_all_sent_avg_abs_diff=lambda d: (d['label'] - d['all_sent_avg']).abs(),
    pred_class=lambda d: np.where(
        d['label_peak_end_avg_abs_diff'] < d['label_all_sent_avg_abs_diff'],
        'C2',
        'C1',
    ),
)

### datapoints

In [14]:
# Number of Amazon reviews assigned to each class.
amazon['pred_class'].value_counts()

pred_class
C1    1393
C2    1189
Name: count, dtype: int64

In [15]:
# (rows, columns) of the Amazon frame after the label/class columns were added.
amazon.shape

(2582, 14)

In [16]:
# Drop rows whose sentence_id is 0 and rows with missing text in a single pass.
# .copy() yields an independent frame, so adding a column below does not
# trigger SettingWithCopyWarning.
has_content = (am_sent['sentence_id'] != 0) & am_sent['sentence_text'].notna()
am_sent = am_sent.loc[has_content].copy()

# Whitespace-delimited token count of each sentence.
am_sent['num_words'] = am_sent['sentence_text'].str.split().str.len()

# Per review: number of sentences and mean words per sentence.
# The aggregator is given as the string 'mean' because passing np.mean to
# groupby.agg emits a FutureWarning in recent pandas releases.
am_num_sentences = (
    am_sent.groupby('review_id')
    .agg({'sentence_id': 'count', 'num_words': 'mean'})
    .reset_index()
)

In [17]:
def unique_vocab(classes=('C1', 'C2'), data=None):
    """Count the distinct words used across the reviews of the given classes.

    This definition replaces any earlier ``unique_vocab`` in the notebook; pass
    ``data`` explicitly to run it on a frame other than ``amazon``.

    Parameters
    ----------
    classes : list-like of str
        ``pred_class`` values whose rows are included.
    data : pandas.DataFrame, optional
        Frame providing ``pred_class`` and ``review_text`` columns. When
        omitted, the notebook-level ``amazon`` frame is used (looked up at
        call time), which matches the previous behaviour.

    Returns
    -------
    int
        Size of the union of the per-review word sets.
    """
    if data is None:
        data = amazon
    # Only the text column is needed, so iterate it directly rather than
    # materialising every row with iterrows().
    texts = data.loc[data['pred_class'].isin(classes), 'review_text']
    unique_words = set()
    for text in texts:
        unique_words.update(get_unique_words(text))
    return len(unique_words)

### unique vocab

In [18]:
# Vocabulary size for the whole dataset and for each class separately.
for title, selected in (
    ("All dataset: ", ['C1', 'C2']),
    ("C1 dataset: ", ['C1']),
    ("C2 dataset: ", ['C2']),
):
    print(title, unique_vocab(classes=selected))

All dataset:  10271
C1 dataset:  7609
C2 dataset:  7049


### General lambdas

In [19]:
# Add per-review sentence count and mean words per sentence; the left join
# keeps reviews that have no sentence rows (their stats become NaN).
amazon = amazon.merge(am_num_sentences, how='left', on='review_id')

In [20]:
# Column means over all Amazon reviews: both label gaps, stars_s, sentences
# per review (count stored in sentence_id) and mean words per sentence.
summary_columns = [
    'label_peak_end_avg_abs_diff',
    'label_all_sent_avg_abs_diff',
    'stars_s',
    'sentence_id',
    'num_words',
]
amazon[summary_columns].mean()

label_peak_end_avg_abs_diff     4.214259
label_all_sent_avg_abs_diff     3.765537
stars_s                         2.936871
sentence_id                     6.702556
num_words                      11.044613
dtype: float64

### Lambdas by group

In [21]:
# Same summary columns, averaged within each pred_class and rounded to 2 decimals.
group_columns = [
    'label_peak_end_avg_abs_diff',
    'label_all_sent_avg_abs_diff',
    'stars_s',
    'sentence_id',
    'num_words',
]
amazon.groupby('pred_class')[group_columns].mean().round(2)

Unnamed: 0_level_0,label_peak_end_avg_abs_diff,label_all_sent_avg_abs_diff,stars_s,sentence_id,num_words
pred_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1,5.72,3.1,2.82,6.62,11.28
C2,2.45,4.55,3.07,6.8,10.77


### PLOT

In [None]:
# Scatter of the two label gaps for Amazon, one point per review.
fig, ax = plt.subplots(figsize=(7, 5), dpi=100)
ax.scatter(
    amazon.label_all_sent_avg_abs_diff,
    amazon.label_peak_end_avg_abs_diff,
    alpha=0.09,
    c=amazon.pred_class,  # the 'C1'/'C2' strings are themselves Matplotlib cycle-colour names
    edgecolors='none',
)
ax.set_xlabel("Value of λ1")
ax.set_ylabel("Value of λ2")
# Save before show() so the written file contains the drawn figure.
fig.savefig('emotion_arc_groups.pdf', format='pdf', bbox_inches='tight', dpi=100)
# Optional fixed axis limits (disabled): ax.set_ylim(0, 50); ax.set_xlim(0, 50)
plt.show()