## Chi-Square Test

In [729]:

# Example of the Chi-Squared Test
from scipy.stats import chi2_contingency, combine_pvalues
import pandas as pd
import numpy as np
import random

our_sentiment = pd.read_csv('dataset/sentiment_test.csv')
s_b_sentiment = pd.read_csv('dataset/sentiment.csv')

features = ['pos', 'neg', 'neut']
chi_square_p_values = {}

In [730]:
for feature in features:
    observed = our_sentiment[feature].to_numpy()
    expected = s_b_sentiment[feature].to_numpy()
    merged_df = pd.merge(our_sentiment, s_b_sentiment, on='sentence_id')
    contingency_table = pd.DataFrame({'observed': merged_df['pos_x'].value_counts(), 'expected': merged_df['pos_y'].value_counts()})
    contingency_table = contingency_table.fillna(0)
    res = chi2_contingency(contingency_table)
    chi_square_p_values[feature] = res[1]

In [731]:
chi_square_p_values

{'pos': 5.531185435643268e-44,
 'neg': 5.531185435643268e-44,
 'neut': 5.531185435643268e-44}

### Fisher method: Combine p-values of chi-squared

In [732]:
statistic, pval = combine_pvalues(list(chi_square_p_values.values()), method='fisher')
print('p-value: ', pval)
print(f"Smaller than 0.05? {pval < 0.05}")

p-value:  7.60539536047689e-126
Smaller than 0.05? True


# Kolmogorov-Smirnov-Test

In [733]:
from scipy.stats import ks_2samp, norm
from statsmodels.stats.multitest import multipletests
import pandas as pd
import numpy as np

our_sentiment = pd.read_csv('dataset/sentiment_test.csv')
s_b_sentiment = pd.read_csv('dataset/sentiment.csv')

features = ['pos', 'neg', 'neut']
kolmogorov_smirnov_p_values = {}

In [734]:
z_sum = 0
for feature in features:
    observed = our_sentiment[feature].to_numpy()
    expected = s_b_sentiment[feature].to_numpy()
    expected = expected / expected.sum() * observed.sum()
    stat, p = ks_2samp(observed, expected)
    z_sum += norm.ppf(1 - p)
    kolmogorov_smirnov_p_values[feature] = p

In [735]:
kolmogorov_smirnov_p_values

{'pos': 4.769864577993995e-13, 'neg': 1.0, 'neut': 1.0}

### Fisher method: Combine p-values of kolmogorov-smirnov-test

In [736]:
statistic, pval = combine_pvalues(list(kolmogorov_smirnov_p_values.values()), method='fisher')
print('p-value: ', pval)
print(f"Smaller than 0.05? {pval < 0.05}")

p-value:  2.059800613850802e-10
Smaller than 0.05? True
