In [1]:
import pandas as pd
import numpy as np
from pipeline import Pipeline

sentiment_pairs =[
    (pd.read_csv('models/en/results/model_en_1.csv'), pd.read_csv('models/de/results/model_de_1.csv')),
    (pd.read_csv('models/en/results/model_en_2.csv'), pd.read_csv('models/de/results/model_de_2.csv')),
    #(pd.read_csv('models/en/results/model_en_3.csv'), pd.read_csv('models/en/results/model_en_3.csv')), # faked because no de_3
    #(pd.read_csv('models/en/results/model_en_4.csv'), pd.read_csv('models/en/results/model_en_4.csv')),  # faked because no de_4
    #(pd.read_csv('models/en/results/model_en_4.csv'), pd.read_csv('models/en/results/model_en_4.csv'))  # faked because no en_5 de_5
]
counted_df = Pipeline().count_simulated_participants_choices(sentiment_pairs)

Counting simulated participants choices...


In [2]:
counted_df

Unnamed: 0,sentence_id,pos,neut,neg
0,9227,0,2,0
1,9229,0,1,1
2,9230,0,2,0
3,9231,0,2,0
4,9232,1,1,0
...,...,...,...,...
147836,175135,0,2,0
147837,175136,0,2,0
147838,175137,2,0,0
147839,175138,0,2,0


In [3]:
def reduce_to_one_choice(df):
    df["max_num"] = df[["pos", "neut", "neg"]].max(axis=1)
    df.loc[df["pos"] < df["max_num"], "pos"] = 0
    df.loc[df["pos"] == df["max_num"], "pos"] = 1
    df.loc[df["neut"] < df["max_num"], "neut"] = 0
    df.loc[df["neut"] == df["max_num"], "neut"] = 1
    df.loc[df["neg"] < df["max_num"], "neg"] = 0
    df.loc[df["neg"] == df["max_num"], "neg"] = 1
    df = df.drop("max_num", axis=1)
    return df

## Load Dataframes

In [4]:
our_sentiment = counted_df
s_b_sentiment = pd.read_csv('dataset/sentiment.csv')

In [5]:
s_b_sentiment[["un", "unsure", "pos", "neut", "neg"]].sum(axis=1).eq(5).any()
# Out[1]: False

False

In [6]:
s_b_sentiment['new_neut'] = s_b_sentiment['un'] + s_b_sentiment['unsure'] + s_b_sentiment['neut']
s_b_sentiment = s_b_sentiment.drop(['un', 'unsure', 'neut'], axis=1)
s_b_sentiment = s_b_sentiment.rename(columns={'new_neut': 'neut'})
features = ['pos', 'neg', 'neut']

In [7]:
our_sentiment = reduce_to_one_choice(our_sentiment)
our_sentiment

Unnamed: 0,sentence_id,pos,neut,neg
0,9227,0,1,0
1,9229,0,1,1
2,9230,0,1,0
3,9231,0,1,0
4,9232,1,1,0
...,...,...,...,...
147836,175135,0,1,0
147837,175136,0,1,0
147838,175137,1,0,0
147839,175138,0,1,0


In [8]:
s_b_sentiment = reduce_to_one_choice(s_b_sentiment)
s_b_sentiment

Unnamed: 0,sentence_id,neg,pos,neut
0,9385,0,1,0
1,9425,0,0,1
2,9447,0,0,1
3,9448,0,0,1
4,9613,0,1,0
...,...,...,...,...
2794,175035,0,1,1
2795,175089,0,0,1
2796,175106,0,0,1
2797,175107,0,1,0


## Chi-Square Test

In [9]:

# Example of the Chi-Squared Test
from scipy.stats import chi2_contingency, combine_pvalues
import pandas as pd

chi_square_p_values = {}

In [10]:
for feature in features:
    observed = our_sentiment[feature].to_numpy()
    expected = s_b_sentiment[feature].to_numpy()
    merged_df = pd.merge(our_sentiment, s_b_sentiment, on='sentence_id')
    contingency_table = pd.DataFrame({'observed': merged_df[feature + '_x'].value_counts(), 'expected': merged_df[feature + '_y'].value_counts()})
    contingency_table = contingency_table.fillna(0)
    res = chi2_contingency(contingency_table)
    chi_square_p_values[feature] = round(res[1], 4)

In [11]:
chi_square_p_values

{'pos': 0.0, 'neg': 0.0, 'neut': 0.0007}

### Fisher method: Combine p-values of chi-squared

In [12]:
statistic, pval = combine_pvalues(list(chi_square_p_values.values()), method='fisher')
print('statistic: ', statistic)
print('p-value: ', pval)
print(f"Smaller than 0.05? {pval < 0.05}")

statistic:  inf
p-value:  0.0
Smaller than 0.05? True


  statistic = -2 * np.sum(np.log(pvalues))


# Kolmogorov-Smirnov-Test

In [13]:
from scipy.stats import ks_2samp, norm
from statsmodels.stats.multitest import multipletests
import pandas as pd

kolmogorov_smirnov_p_values = {}

In [14]:
for feature in features:
    observed = our_sentiment[feature].to_numpy()
    expected = s_b_sentiment[feature].to_numpy()
    expected = expected / expected.sum() * observed.sum()
    stat, p = ks_2samp(observed, expected)
    kolmogorov_smirnov_p_values[feature] = round(p, 4)

In [15]:
kolmogorov_smirnov_p_values

{'pos': 0.0, 'neg': 0.0, 'neut': 0.0}

### Fisher method: Combine p-values of kolmogorov-smirnov-test

In [16]:
statistic, pval = combine_pvalues(list(kolmogorov_smirnov_p_values.values()), method='fisher')
print('p-value: ', pval)
print(f"Smaller than 0.05? {pval < 0.05}")

p-value:  0.0
Smaller than 0.05? True
