## Notebook Containing Model Evaluation Pipelines
### ACL22 SRW confidential submission
---

In [1]:
import os
import pickle
import random
import warnings
from math import sqrt 
from math import sqrt
from random import choices
from statistics import mean, stdev

warnings.simplefilter('ignore')

import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import norm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertModel, BertTokenizer
from transformers import logging
logging.set_verbosity_error()

from helpers import bootstrap, get_bert_average_across_text_tokens

In [2]:
# authorship profiling internal vars

experiment_name = 'final-refined-set-paper' #folder name for saving new outputs

# Folder that holds the pickled models and receives the evaluation outputs.
write_model_dir = 'models/'+experiment_name

# makedirs (unlike mkdir) also creates missing parent folders and is a no-op
# when the folder already exists, so the cell is safe to re-run.
os.makedirs(write_model_dir, exist_ok=True)

# The evaluation cells below write their bootstrap F1 samples into this
# sub-folder; create it up front so those to_csv calls cannot fail on a
# fresh checkout.
os.makedirs(os.path.join(write_model_dir, 'eval_dfs'), exist_ok=True)

In [3]:
"""
1. gender profiling validation mixed instagram + twitter
"""

# Evaluate the LR model first and then the SVM model (both trained on the
# mixed instagram + twitter data) on the held-out Facebook profiling set.
# The data preparation is identical for both models, so it is done once.

raw_test_csv = 'acl22-data/intermediate-data/cleaned-data-authorship-profiling/final-refined-set/facebook_celeb_profiling.csv'
test_csv = 'acl22-data/final-data/cleaned-data-authorship-profiling/final-refined-set/facebook_celeb_profiling_test.csv'
train_csv = 'acl22-data/final-data/cleaned-data-authorship-profiling/final-refined-set/mixed_celeb_profiling_folded.csv'

# Encode the gender labels (F -> 0, M -> 1) and persist the final test file.
# .loc is used instead of chained indexing (df['target'][mask] = ...), which
# may write to a temporary copy and leave the frame unchanged.
raw_test_df = pd.read_csv(raw_test_csv)
raw_test_df.loc[raw_test_df['target'] == 'F', 'target'] = 0
raw_test_df.loc[raw_test_df['target'] == 'M', 'target'] = 1
raw_test_df[['post', 'target']].to_csv(test_csv)

test_df = pd.read_csv(test_csv)
y_val = test_df['target'].to_numpy(dtype = int)
female_pct = str(len(test_df['target'][test_df['target']==0])/len(test_df['target'])*100)[0:5]

X = pd.read_csv(train_csv)
X = X[["post","target"]]
feature_label = 'post'
X_train, y_train = X[feature_label], X["target"]

# Word-bigram TF-IDF features, re-fitted on the training posts.
# NOTE(review): presumably this reproduces the exact vocabulary the pickled
# models were trained on -- confirm; persisting the fitted vectorizer next to
# the model would remove that assumption.
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

# fit_transform already returns the transformed training matrix; a second
# transform() pass over the same data is not needed.
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(test_df["post"])

confidence_level=0.95

for position, model_tag in enumerate(('LR', 'SVM')):
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # produced by this project.
    with open(f'models/final-refined-set-paper/final-profiling-{model_tag}-mix.pkl', 'rb') as f:
        model = pickle.load(f)
    # The second model is separated from the previous report by a blank line.
    print(('' if position == 0 else '\n ') + str(model), '\n')

    print(f"There are {female_pct}% of Female profiles in the dataset. \n")
    print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
    print("n_samples val: %d, n_features val: %d" % X_test_tf.shape, '\n')

    y_pred = model.predict(X_test_tf)

    accuracy = accuracy_score(y_val, y_pred)
    # NOTE(review): roc_auc_score receives hard class labels here, so the
    # value equals balanced accuracy rather than a ranking-based AUC; pass
    # decision scores if a true AUC is wanted.
    roc_auc = roc_auc_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    results = {'accuracy':accuracy, 'roc_auc':roc_auc, 'f1':f1}
    print(results, '\n')
    print(confusion_matrix(y_val, y_pred), '\n')

    # Bootstrap the F1 score; the per-replicate F1 values are saved for the
    # hypothesis tests in the final cell.
    lower, median, upper, f1s = bootstrap(y_val, y_pred, B=1000, confidence_level=confidence_level)
    print("%.3f, %s%% Bootstrap confidence interval: [%.3f, %.3f]" % (median, confidence_level*100, lower, upper), '\n')
    pd.DataFrame(f1s).T.to_csv(f'models/final-refined-set-paper/eval_dfs/f1s_profiling_{model_tag}_mix.csv')

LogisticRegression(class_weight='balanced', solver='newton-cg') 

There are 50.0% of Female profiles in the dataset. 

n_samples train: 372, n_features train: 10000
n_samples val: 50, n_features val: 10000 

{'accuracy': 0.64, 'roc_auc': 0.6399999999999999, 'f1': 0.7272727272727272} 

[[ 8 17]
 [ 1 24]] 

0.730, 95.0% Bootstrap confidence interval: [0.593, 0.829] 


 SVC(C=1.2, class_weight='balanced', kernel='linear') 

There are 50.0% of Female profiles in the dataset. 

n_samples train: 372, n_features train: 10000
n_samples val: 50, n_features val: 10000 

{'accuracy': 0.6, 'roc_auc': 0.6000000000000001, 'f1': 0.6969696969696971} 

[[ 7 18]
 [ 2 23]] 

0.694, 95.0% Bootstrap confidence interval: [0.540, 0.817] 



In [4]:
"""
2. gender profiling validation twitter only
"""

# Evaluate the LR model first and then the SVM model (both trained on the
# twitter-only data) on the held-out Facebook profiling set.
# The data preparation is identical for both models, so it is done once.

test_csv = 'acl22-data/final-data/cleaned-data-authorship-profiling/final-refined-set/facebook_celeb_profiling_test.csv'
train_csv = 'acl22-data/final-data/cleaned-data-authorship-profiling/final-refined-set/twitter_celeb_profiling_folded.csv'

test_df = pd.read_csv(test_csv)
y_val = test_df['target'].to_numpy(dtype = int)
female_pct = str(len(test_df['target'][test_df['target']==0])/len(test_df['target'])*100)[0:5]

X = pd.read_csv(train_csv)
X = X[["post","target"]]
feature_label = 'post'
X_train, y_train = X[feature_label], X["target"]

# Word-bigram TF-IDF features, re-fitted on the training posts.
# NOTE(review): presumably this reproduces the exact vocabulary the pickled
# models were trained on -- confirm; persisting the fitted vectorizer next to
# the model would remove that assumption.
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

# fit_transform already returns the transformed training matrix; a second
# transform() pass over the same data is not needed.
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(test_df["post"])

confidence_level=0.95

for position, model_tag in enumerate(('LR', 'SVM')):
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # produced by this project.
    with open(f'models/final-refined-set-paper/final-profiling-{model_tag}-twitter.pkl', 'rb') as f:
        model = pickle.load(f)
    # The second model is separated from the previous report by a blank line.
    print(('' if position == 0 else '\n ') + str(model), '\n')

    print(f"There are {female_pct}% of Female profiles in the dataset. \n")
    print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
    print("n_samples val: %d, n_features val: %d" % X_test_tf.shape, '\n')

    y_pred = model.predict(X_test_tf)

    accuracy = accuracy_score(y_val, y_pred)
    # NOTE(review): roc_auc_score receives hard class labels here, so the
    # value equals balanced accuracy rather than a ranking-based AUC; pass
    # decision scores if a true AUC is wanted.
    roc_auc = roc_auc_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    results = {'accuracy':accuracy, 'roc_auc':roc_auc, 'f1':f1}
    print(results, '\n')
    print(confusion_matrix(y_val, y_pred), '\n')

    # Bootstrap the F1 score; the per-replicate F1 values are saved for the
    # hypothesis tests in the final cell.
    lower, median, upper, f1s = bootstrap(y_val, y_pred, B=1000, confidence_level=confidence_level)
    print("%.3f, %s%% Bootstrap confidence interval: [%.3f, %.3f]" % (median, confidence_level*100, lower, upper), '\n')
    pd.DataFrame(f1s).T.to_csv(f'models/final-refined-set-paper/eval_dfs/f1s_profiling_{model_tag}_twitter.csv')



LogisticRegression(class_weight='balanced', penalty='none', solver='saga') 

There are 50.0% of Female profiles in the dataset. 

n_samples train: 372, n_features train: 10000
n_samples val: 50, n_features val: 10000 

{'accuracy': 0.68, 'roc_auc': 0.6800000000000002, 'f1': 0.7142857142857142} 

[[14 11]
 [ 5 20]] 

0.715, 95.0% Bootstrap confidence interval: [0.549, 0.836] 


 SVC(C=1, class_weight='balanced', kernel='sigmoid') 

There are 50.0% of Female profiles in the dataset. 

n_samples train: 372, n_features train: 10000
n_samples val: 50, n_features val: 10000 

{'accuracy': 0.6, 'roc_auc': 0.6, 'f1': 0.6551724137931034} 

[[11 14]
 [ 6 19]] 

0.655, 95.0% Bootstrap confidence interval: [0.500, 0.787] 



In [5]:
"""
3. authorship attribution validation mixed instagram + twitter
"""

# Evaluate the LR model first and then the SVM model (both trained on the
# mixed instagram + twitter data) on the held-out Facebook attribution set.
# Both models consume the same TF-IDF + BERT feature matrix, so the features
# are built once and reused.

raw_test_csv = 'acl22-data/intermediate-data/cleaned-data-authorship-attribution/final-refined-set/facebook_celeb_attribution.csv'
test_csv = 'acl22-data/final-data/cleaned-data-authorship-attribution/final-refined-set/facebook_celeb_attribution_test.csv'
train_csv = 'acl22-data/final-data/cleaned-data-authorship-attribution/final-refined-set/mixed_celeb_attribution_folded.csv'

# Persist the final test file (post + target only) from the intermediate data.
raw_test_df = pd.read_csv(raw_test_csv, index_col = 'Unnamed: 0')
raw_test_df[['post', 'target']].to_csv(test_csv)

test_df = pd.read_csv(test_csv)
y_val = test_df['target'].to_numpy(dtype = int)

X = pd.read_csv(train_csv)
X = X[["post","target"]]
feature_label = 'post'
X_train, y_train = X[feature_label], X["target"]

# Word-bigram TF-IDF features, re-fitted on the training posts.
# NOTE(review): presumably this reproduces the exact vocabulary the pickled
# models were trained on -- confirm; persisting the fitted vectorizer next to
# the model would remove that assumption.
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

# fit_transform already returns the transformed training matrix; a second
# transform() pass over the same data is not needed.
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(test_df["post"])
X_test_tf_nonsparse = pd.DataFrame.sparse.from_spmatrix(X_test_tf)

# Load the BERT tokenizer and model once; reloading them for every post (as
# a per-row from_pretrained call would) repeats the same expensive work.
# NOTE(review): assumes get_bert_average_across_text_tokens does not mutate
# the tokenizer/model it is given -- confirm in helpers.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased')

test_df = test_df.reset_index(drop=True)
bert_feats_list = [
    pd.DataFrame(get_bert_average_across_text_tokens(string = test_df['post'].loc[i],
                                                     tokenizer = bert_tokenizer,
                                                     model_bert = bert_model)).T
    for i in range(0, len(X_test_tf_nonsparse))
]
bert_df = pd.concat(bert_feats_list)

# NOTE(review): the frames are merged on their index. If every per-post frame
# carries index 0, bert_df's index is all zeros and this merge would not pair
# TF-IDF row i with BERT row i -- check bert_df.index, and keep this in sync
# with how the training features were assembled.
X_test_both = pd.merge(X_test_tf_nonsparse, bert_df, left_index=True, right_index=True, suffixes=('_tfidf', '_bert'))

confidence_level=0.95

for model_tag in ('LR', 'SVM'):
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # produced by this project.
    with open(f'models/final-refined-set-paper/final-attribution-{model_tag}-mix.pkl', 'rb') as f:
        model = pickle.load(f)
    print(model, '\n')

    print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
    print("n_samples val: %d, n_features val: %d" % X_test_tf.shape, '\n')

    y_pred = model.predict(X_test_both)

    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='micro')

    results = {'accuracy':accuracy, 'f1':f1}
    print(results, '\n')
    print(confusion_matrix(y_val, y_pred), '\n')

    # Bootstrap the F1 score; the per-replicate F1 values are saved for the
    # hypothesis tests in the final cell.
    lower, median, upper, f1s = bootstrap(y_val, y_pred, B=1000, confidence_level=confidence_level, multiclass=True)
    print("%.3f, %s%% Bootstrap confidence interval: [%.3f, %.3f]" % (median, confidence_level*100, lower, upper), '\n')
    pd.DataFrame(f1s).T.to_csv(f'models/final-refined-set-paper/eval_dfs/f1s_attribution_{model_tag}_mix.csv')




LogisticRegression(class_weight='balanced', solver='liblinear') 

n_samples train: 372, n_features train: 3737
n_samples val: 60, n_features val: 3737 

{'accuracy': 0.65, 'f1': 0.65} 

[[ 7  9  4]
 [ 0 15  5]
 [ 0  3 17]] 

0.650, 95.0% Bootstrap confidence interval: [0.533, 0.767] 

SVC(C=0.7, class_weight='balanced', kernel='linear') 

n_samples train: 372, n_features train: 3737
n_samples val: 60, n_features val: 3737 

{'accuracy': 0.75, 'f1': 0.75} 

[[11  6  3]
 [ 0 16  4]
 [ 0  2 18]] 

0.750, 95.0% Bootstrap confidence interval: [0.650, 0.850] 



In [6]:
"""
4. authorship attribution validation twitter only
"""

# Evaluate the LR model first and then the SVM model (both trained on the
# twitter-only data) on the held-out Facebook attribution set.
# Both models consume the same TF-IDF + BERT feature matrix, so the features
# are built once and reused.

raw_test_csv = 'acl22-data/intermediate-data/cleaned-data-authorship-attribution/final-refined-set/facebook_celeb_attribution.csv'
test_csv = 'acl22-data/final-data/cleaned-data-authorship-attribution/final-refined-set/facebook_celeb_attribution_test.csv'
train_csv = 'acl22-data/final-data/cleaned-data-authorship-attribution/final-refined-set/twitter_celeb_attribution_folded.csv'

# Persist the final test file (post + target only) from the intermediate data.
raw_test_df = pd.read_csv(raw_test_csv, index_col = 'Unnamed: 0')
raw_test_df[['post', 'target']].to_csv(test_csv)

test_df = pd.read_csv(test_csv)
y_val = test_df['target'].to_numpy(dtype = int)

X = pd.read_csv(train_csv)
X = X[["post","target"]]
feature_label = 'post'
X_train, y_train = X[feature_label], X["target"]

# Word-bigram TF-IDF features, re-fitted on the training posts.
# NOTE(review): presumably this reproduces the exact vocabulary the pickled
# models were trained on -- confirm; persisting the fitted vectorizer next to
# the model would remove that assumption.
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True, 
    strip_accents='unicode',
    ngram_range=(2, 2),
    max_features=10000)

# fit_transform already returns the transformed training matrix; a second
# transform() pass over the same data is not needed.
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(test_df["post"])
X_test_tf_nonsparse = pd.DataFrame.sparse.from_spmatrix(X_test_tf)

# Load the BERT tokenizer and model once; reloading them for every post (as
# a per-row from_pretrained call would) repeats the same expensive work.
# NOTE(review): assumes get_bert_average_across_text_tokens does not mutate
# the tokenizer/model it is given -- confirm in helpers.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased')

test_df = test_df.reset_index(drop=True)
bert_feats_list = [
    pd.DataFrame(get_bert_average_across_text_tokens(string = test_df['post'].loc[i],
                                                     tokenizer = bert_tokenizer,
                                                     model_bert = bert_model)).T
    for i in range(0, len(X_test_tf_nonsparse))
]
bert_df = pd.concat(bert_feats_list)

# NOTE(review): the frames are merged on their index. If every per-post frame
# carries index 0, bert_df's index is all zeros and this merge would not pair
# TF-IDF row i with BERT row i -- check bert_df.index, and keep this in sync
# with how the training features were assembled.
X_test_both = pd.merge(X_test_tf_nonsparse, bert_df, left_index=True, right_index=True, suffixes=('_tfidf', '_bert'))

confidence_level=0.95

for model_tag in ('LR', 'SVM'):
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # produced by this project.
    with open(f'models/final-refined-set-paper/final-attribution-{model_tag}-twitter.pkl', 'rb') as f:
        model = pickle.load(f)
    print(model, '\n')

    print("n_samples train: %d, n_features train: %d" % X_train_tf.shape)
    print("n_samples val: %d, n_features val: %d" % X_test_tf.shape, '\n')

    y_pred = model.predict(X_test_both)

    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='micro')

    results = {'accuracy':accuracy, 'f1':f1}
    print(results, '\n')
    print(confusion_matrix(y_val, y_pred), '\n')

    # Bootstrap the F1 score; the per-replicate F1 values are saved for the
    # hypothesis tests in the final cell.
    lower, median, upper, f1s = bootstrap(y_val, y_pred, B=1000, confidence_level=confidence_level, multiclass=True)
    print("%.3f, %s%% Bootstrap confidence interval: [%.3f, %.3f]" % (median, confidence_level*100, lower, upper), '\n')
    pd.DataFrame(f1s).T.to_csv(f'models/final-refined-set-paper/eval_dfs/f1s_attribution_{model_tag}_twitter.csv')




LogisticRegression(class_weight='balanced', solver='newton-cg') 

n_samples train: 372, n_features train: 3193
n_samples val: 60, n_features val: 3193 

{'accuracy': 0.6, 'f1': 0.6} 

[[ 6  8  6]
 [ 1 16  3]
 [ 0  6 14]] 

0.600, 95.0% Bootstrap confidence interval: [0.483, 0.717] 

SVC(C=0.6, class_weight='balanced', kernel='linear') 

n_samples train: 372, n_features train: 3193
n_samples val: 60, n_features val: 3193 

{'accuracy': 0.6166666666666667, 'f1': 0.6166666666666667} 

[[ 8  5  7]
 [ 3 14  3]
 [ 0  5 15]] 

0.617, 95.0% Bootstrap confidence interval: [0.500, 0.733] 



In [7]:
"""
5. hypothesis testing and practical significance

One sided Welch's T-test, which doesn't assume equal variances. Using bootstrap to generate confidence intervals.
Null is that the difference is zero
Alternative is that multi-platform modeling is superior

Cohen's d employed for effect size.
"""

# Compare the mixed-platform model against the twitter-only model for every
# task/classifier pair. Each entry is (label used in the report, file stem of
# the saved bootstrap F1 samples).
comparisons = [
    ('authorship attribution SVM', 'attribution_SVM'),
    ('authorship attribution LR', 'attribution_LR'),
    ('authorship profiling SVM', 'profiling_SVM'),
    ('authorship profiling LR', 'profiling_LR'),
]

# Bonferroni factor: one test per comparison (4 here).
n_comparisons = len(comparisons)
n_resamples = 1000

# Seeded generator so the resampled p-value intervals are reproducible across
# kernel restarts; it is shared so successive draws differ from each other.
resample_rng = np.random.RandomState(0)

for label, stem in comparisons:
    f1s_twitter = pd.read_csv(f'models/final-refined-set-paper/eval_dfs/f1s_{stem}_twitter.csv', index_col = 'Unnamed: 0')
    f1s_mix = pd.read_csv(f'models/final-refined-set-paper/eval_dfs/f1s_{stem}_mix.csv', index_col = 'Unnamed: 0')

    # Resample both sets of F1 values with replacement and run a one-sided
    # Welch's t-test (alternative: mix > twitter) on every resample.
    # NOTE(review): the sample size of this test is the number of bootstrap
    # replicates, so the p-value shrinks as more replicates are drawn --
    # confirm this is the intended test before reporting it.
    p_val_list = []
    for _ in range(n_resamples):
        mix_resample = f1s_mix.T[0].sample(frac=1, replace=True, random_state=resample_rng).to_list()
        twitter_resample = f1s_twitter.T[0].sample(frac=1, replace=True, random_state=resample_rng).to_list()
        p_val = stats.ttest_ind(mix_resample, twitter_resample, equal_var=False, alternative = 'greater')[1]
        p_val_list.append(p_val)
    p_val_list.sort()

    # Bonferroni correction; a corrected p-value is a probability, so it is
    # capped at 1. (Argument order keeps a NaN p-value visible as NaN.)
    # Positions 25 / 500 / 975 of the 1000 sorted values give the 2.5th
    # percentile, the median and the 97.5th percentile.
    lower = min(p_val_list[25] * n_comparisons, 1.0)
    median = min(p_val_list[500] * n_comparisons, 1.0)
    upper = min(p_val_list[975] * n_comparisons, 1.0)
    print(f"Mix vs. twitter {label}: \n      Bonferonni corrected P_val for Welch's T-test: {median}, with a 95% confidence interval of [{lower},{upper}]\n")

    # Cohen's d with the pooled standard deviation of the two F1 samples
    # (first and only row of each saved frame).
    c0 = f1s_mix.values.tolist()[0]
    c1 = f1s_twitter.values.tolist()[0]
    cohens_d = (mean(c0) - mean(c1)) / (sqrt((stdev(c0) ** 2 + stdev(c1) ** 2) / 2))
    print('      The Cohens d effect size is', cohens_d)


Mix vs. twitter authorship attribution SVM: 
      Bonferonni corrected P_val for Welch's T-test: 0.0, with a 95% confidence interval of [0.0,0.0]

      The Cohens d effect size is 2.2621726227507217
Mix vs. twitter authorship attribution LR: 
      Bonferonni corrected P_val for Welch's T-test: 1.9881530707118724e-71, with a 95% confidence interval of [1.74065854739331e-85,1.1882912739943515e-57]

      The Cohens d effect size is 0.8307029048234137
Mix vs. twitter authorship profiling SVM: 
      Bonferonni corrected P_val for Welch's T-test: 7.07880319054289e-30, with a 95% confidence interval of [5.973494804257549e-40,3.7500356952609115e-21]

      The Cohens d effect size is 0.5182348084916546
Mix vs. twitter authorship profiling LR: 
      Bonferonni corrected P_val for Welch's T-test: 2.1030858084979975e-07, with a 95% confidence interval of [2.2564229801414682e-12,0.0012105692679097546]

      The Cohens d effect size is 0.238442055551052
