## Chi-Square Test

In [811]:

# Example of the Chi-Squared Test
from scipy.stats import chi2_contingency, combine_pvalues
import pandas as pd
import numpy as np
import random

import warnings

# Paths of the two sentiment datasets that are compared below.
# FIXME(review): both paths point at the same CSV, so every test compares the
# data with itself and trivially yields p = 1.0. Point S_B_SENTIMENT_PATH at
# the second dataset.
OUR_SENTIMENT_PATH = 'dataset/sentiment.csv'
S_B_SENTIMENT_PATH = 'dataset/sentiment.csv'

if OUR_SENTIMENT_PATH == S_B_SENTIMENT_PATH:
    # Make the degenerate self-comparison visible instead of silently
    # reporting "no difference".
    warnings.warn(
        "our_sentiment and s_b_sentiment are loaded from the same file; "
        "all test results will be trivial (p = 1.0)."
    )

our_sentiment = pd.read_csv(OUR_SENTIMENT_PATH)
s_b_sentiment = pd.read_csv(S_B_SENTIMENT_PATH)

# Columns that are compared between the two datasets.
features = ['pos', 'neg', 'neut']
# feature name -> p-value of its chi-squared test (filled by the loop in the next cell).
chi_square_p_values = {}

In [812]:
# Pair the rows of both datasets on their shared sentence id. The merge does
# not depend on the feature, so it is computed once rather than per iteration.
# Overlapping column names receive pandas' default suffixes: '_x' for
# our_sentiment and '_y' for s_b_sentiment.
merged_df = pd.merge(our_sentiment, s_b_sentiment, on='sentence_id')

for feature in features:
    # Rows: the distinct values of the feature; columns: how often each value
    # occurs in our data ('observed') and in the other dataset ('expected').
    contingency_table = pd.DataFrame({
        'observed': merged_df[feature + '_x'].value_counts(),
        'expected': merged_df[feature + '_y'].value_counts(),
    })
    # A value present in only one dataset has no count in the other one: use 0.
    contingency_table = contingency_table.fillna(0)
    res = chi2_contingency(contingency_table)
    print(res)
    # The second element of the result is the p-value.
    chi_square_p_values[feature] = res[1]

(0.0, 1.0, 4, array([[2326., 2326.],
       [ 193.,  193.],
       [ 172.,  172.],
       [  97.,   97.],
       [  11.,   11.]]))
(0.0, 1.0, 4, array([[2.51e+03, 2.51e+03],
       [1.30e+02, 1.30e+02],
       [1.04e+02, 1.04e+02],
       [5.30e+01, 5.30e+01],
       [2.00e+00, 2.00e+00]]))
(0.0, 1.0, 4, array([[1468., 1468.],
       [ 549.,  549.],
       [ 527.,  527.],
       [ 232.,  232.],
       [  23.,   23.]]))


In [813]:
# Display the per-feature p-values collected by the chi-squared loop.
chi_square_p_values

{'pos': 1.0, 'neg': 1.0, 'neut': 1.0}

### Fisher's method: combine the p-values of the chi-squared tests

In [814]:
# Merge the per-feature chi-squared p-values into one overall p-value with
# Fisher's method and compare it against the 0.05 significance level.
# NOTE(review): Fisher's method assumes the combined tests are independent —
# confirm that this holds for the pos/neg/neut features.
chi_square_p_list = list(chi_square_p_values.values())
statistic, pval = combine_pvalues(chi_square_p_list, method='fisher')
is_significant = pval < 0.05

print('statistic: ', statistic)
print('p-value: ', pval)
print(f"Smaller than 0.05? {is_significant}")

statistic:  -0.0
p-value:  1.0
Smaller than 0.05? False


## Kolmogorov-Smirnov Test

In [815]:
from scipy.stats import ks_2samp, norm
from statsmodels.stats.multitest import multipletests
import pandas as pd
import numpy as np

import warnings

# Paths of the two sentiment datasets that are compared below.
# FIXME(review): both paths point at the same CSV, so every test compares the
# data with itself and trivially yields p = 1.0. Point S_B_SENTIMENT_PATH at
# the second dataset.
OUR_SENTIMENT_PATH = 'dataset/sentiment.csv'
S_B_SENTIMENT_PATH = 'dataset/sentiment.csv'

if OUR_SENTIMENT_PATH == S_B_SENTIMENT_PATH:
    # Make the degenerate self-comparison visible instead of silently
    # reporting "no difference".
    warnings.warn(
        "our_sentiment and s_b_sentiment are loaded from the same file; "
        "all test results will be trivial (p = 1.0)."
    )

our_sentiment = pd.read_csv(OUR_SENTIMENT_PATH)
s_b_sentiment = pd.read_csv(S_B_SENTIMENT_PATH)

# Columns that are compared between the two datasets.
features = ['pos', 'neg', 'neut']
# feature name -> p-value of its Kolmogorov-Smirnov test (filled by the loop in the next cell).
kolmogorov_smirnov_p_values = {}

In [816]:
for feature in features:
    observed = our_sentiment[feature].to_numpy()
    expected = s_b_sentiment[feature].to_numpy()
    # The two-sample KS test compares the empirical distributions of the raw
    # samples, so both arrays are passed in unchanged. Rescaling one sample so
    # that its sum matches the other (a requirement of scipy.stats.chisquare,
    # not of ks_2samp) would alter that sample's distribution and distort the
    # test statistic.
    stat, p = ks_2samp(observed, expected)
    kolmogorov_smirnov_p_values[feature] = p

In [817]:
# Display the per-feature p-values collected by the Kolmogorov-Smirnov loop.
kolmogorov_smirnov_p_values

{'pos': 1.0, 'neg': 1.0, 'neut': 1.0}

### Fisher's method: combine the p-values of the Kolmogorov-Smirnov tests

In [818]:
# Merge the per-feature Kolmogorov-Smirnov p-values into one overall p-value
# with Fisher's method and compare it against the 0.05 significance level.
# NOTE(review): Fisher's method assumes the combined tests are independent —
# confirm that this holds for the pos/neg/neut features.
ks_p_list = list(kolmogorov_smirnov_p_values.values())
statistic, pval = combine_pvalues(ks_p_list, method='fisher')

print('p-value: ', pval)
print(f"Smaller than 0.05? {pval < 0.05}")

p-value:  1.0
Smaller than 0.05? False
