In [None]:
### install boostsa if it is not installed yet
### pip install boostsa

In [47]:
from boostsa import Bootstrap
import pickle
import pandas as pd
import numpy as np
import itertools
import os
import math
import shutil

# Boostsa package background
Paper describing the package: https://aclanthology.org/2022.acl-demo.12.pdf
Documentation: https://boostsa.readthedocs.io/en/latest/guide.html#getting-started

Long story short:
The performance gap between two models can be modeled as a random variable. When that variable follows a normal distribution, Student's t-test can be used to assess whether the difference in performance is significant. In NLP tasks the normality assumption may not hold. Therefore, randomized, sample-based, non-parametric tests such as bootstrap sampling are better suited for NLP.

Bootstrapping: random test sets are sampled with replacement from the whole test set. For each sub-sample, the performance difference between the two models (δ_sub-sample) is computed and compared with the difference observed on the whole test set (δ_sample). The p-value, which is a measure of significance, is calculated by counting how many times δ_sub-sample is at least twice as large as δ_sample. This count is then divided by the total number of iterations (bootstrap samples) to get the p-value. The difference is considered significant when this happens rarely, i.e. when the p-value is small (typically below 0.05).

# Downloading data and preprocessing 

In [129]:
### Loading the Italian (it) test-set predictions of every model.
### Each model's output ends up in its own DataFrame; the merge cells below join
### them on `user_id`.
path = '/g100_work/IscrC_mental/data/user_classification/trained_models/it/'

# gold labels
# NOTE: pickle.load can execute code stored in the file - only load trusted files.
file = 'gold_labels_test.pkl'
# The handle gets its own name so that `file` keeps holding the file name.
with open(path+file, 'rb') as gold_handle:
    gold_labels_it = pickle.load(gold_handle)

# M3 bio
file = 'm3_test_age_gender.csv'
m3_bio_it = pd.read_csv(path+file)

# M3 bio + pictures
# m3_bio_picture_it = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/m3_scores_bio_image.pkl')
file = 'm3_bio_image_test_age_gender.csv'
m3_bio_picture_it = pd.read_csv(path+file)


# XLM bio
XLM_bio_gender_it = pd.read_csv('/g100_work/IscrC_mental/data/user_classification/trained_models/it/xlm_bio_only_test_gender.csv')
XLM_bio_age_it = pd.read_csv('/g100_work/IscrC_mental/data/user_classification/trained_models/it/xlm_bio_only_test_age.csv')

# XLM bios + pictures
# The .npy arrays carry no header, so the column names are assigned by hand
# (assumes the first array column is the user id - TODO confirm).
# NOTE(review): float64 represents integers exactly only up to 2**53; if the
# user ids are larger, distinct ids may collapse to the same key - verify.

XLM_bio_picture_age_it = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/age/XLM_probs_age_test.npy'))
XLM_bio_picture_age_it.columns = ['user_id', 'pred_age_0_19_prob', 'pred_age_20_29_prob', 'pred_age_30_39_prob', 'pred_age_40_100_prob']
XLM_bio_picture_age_it['user_id'] = XLM_bio_picture_age_it['user_id'].astype(float)

XLM_bio_picture_gender_it = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/gender/XLM_probs_gender_test.npy'))
XLM_bio_picture_gender_it.columns = ['user_id', 'p_is_female', 'p_is_male']
# Cast the gender frame's OWN ids. The previous version assigned the ids of the
# age frame here, which silently mislabels rows whenever the two files are not
# in exactly the same order.
XLM_bio_picture_gender_it['user_id'] = XLM_bio_picture_gender_it['user_id'].astype(float)


# FLAN-T5 bio
file = 'flan_bio_only_test_gender.csv'
flan_bio_gender = pd.read_csv(path+file)

file = 'flan_bio_only_test_age.csv'
flan_bio_age = pd.read_csv(path+file)

# FLAN-T5 bio + tweets

file = 'flan_bio_tweets_test_gender.csv'
flan_bio_tweets_gender = pd.read_csv(path+file)

file = 'flan_bio_tweets_test_age.csv'
flan_bio_tweets_age = pd.read_csv(path+file)


# GPT 3.5 bio

file = 'gpt35_bio_test_age.csv'
gpt_bio_age = pd.read_csv(path+file)

file = 'gpt35_bio_test_gender.csv'
gpt_bio_gender = pd.read_csv(path+file)


# GPT 3.5 bio + tweets (the files are the *_bio_tweets_* ones)

file = 'gpt35_bio_tweets_test_age.csv'
gpt_bio_tweets_age = pd.read_csv(path+file)

file = 'gpt35_bio_tweets_test_gender.csv'
gpt_bio_tweets_gender = pd.read_csv(path+file)


# CV predictions (presumably the image-based model - TODO confirm).
# Original author's note: how = 'outer'
file = 'cv_test.pkl'
cv_it = pd.read_pickle(path+file)


In [130]:
### Checking whether every model's results are mergeable (gender predictions).
### Each step OUTER-joins one more model on user_id and prints the running row
### count; a growing count means that model contains user_ids that the frame
### built so far does not have.

gold_labels_it['user_id'] = gold_labels_it['user_id'].astype(float)
gender_it = gold_labels_it[['user_id', 'gold_gender']]
print(len(gender_it))

# M3 bio
m3_bio_it['user_id'] = m3_bio_it['user_id'].astype(float)
gender_it = (
    gender_it
    .merge(m3_bio_it[['user_id', 'pred_male']], how='outer', on='user_id')
    .rename(columns={'pred_male': 'M3_bio_male'})
)
print(len(gender_it))

# M3 bio + pictures
m3_bio_picture_it['user_id'] = m3_bio_picture_it['user_id'].astype(float)
gender_it = (
    gender_it
    .merge(m3_bio_picture_it[['user_id', 'pred_male']], how='outer', on='user_id')
    .rename(columns={'pred_male': 'M3_bio_picture_male'})
)
print(len(gender_it))

# XLM bio
gender_it = (
    gender_it
    .merge(XLM_bio_gender_it[['user_id', 'pred_male']], how='outer', on='user_id')
    .rename(columns={'pred_male': 'XLM_bio_male'})
)
print(len(gender_it))

# XLM bio + pictures: predicted male when p(male) is strictly larger than p(female)
XLM_bio_picture_gender_it['XLM_bio_picture_male'] = (
    XLM_bio_picture_gender_it['p_is_male']
    .gt(XLM_bio_picture_gender_it['p_is_female'])
    .astype(int)
)
gender_it = gender_it.merge(
    XLM_bio_picture_gender_it[['user_id', 'XLM_bio_picture_male']], how='outer', on='user_id')
print(len(gender_it))

# FLAN-T5 bio: the textual prediction becomes a 0/1 flag
flan_bio_gender['flan_bio_male'] = flan_bio_gender['prediction'].eq('male').astype(int)
gender_it = gender_it.merge(
    flan_bio_gender[['user_id', 'flan_bio_male']], how='outer', on='user_id')
print(len(gender_it))

# FLAN-T5 bio + tweets (column name spelt 'faln' as in the rest of the notebook)
flan_bio_tweets_gender['faln_bio_tw_male'] = flan_bio_tweets_gender['prediction'].eq('male').astype(int)
gender_it = gender_it.merge(
    flan_bio_tweets_gender[['user_id', 'faln_bio_tw_male']], how='outer', on='user_id')
print(len(gender_it))

# GPT 3.5 bio
gpt_bio_gender['gpt_bio_male'] = gpt_bio_gender['prediction'].eq('male').astype(int)
gender_it = gender_it.merge(
    gpt_bio_gender[['user_id', 'gpt_bio_male']], how='outer', on='user_id')
print(len(gender_it))

# GPT 3.5 bio + tweets
gpt_bio_tweets_gender['gpt_bio_tw_male'] = gpt_bio_tweets_gender['prediction'].eq('male').astype(int)
gender_it = gender_it.merge(
    gpt_bio_tweets_gender[['user_id', 'gpt_bio_tw_male']], how='outer', on='user_id')
print(len(gender_it))

# CV model: its label is capitalised ('Male')
cv_it['user_id'] = cv_it['user_id'].astype(float)
cv_it['cv_male'] = cv_it['pred_gender_label'].eq('Male').astype(int)
gender_it = gender_it.merge(cv_it[['user_id', 'cv_male']], how='outer', on='user_id')
print(len(gender_it))

# Gold label as a 0/1 flag. With outer joins, rows that have no gold label
# also end up with gold_male == 0.
gender_it = (
    gender_it
    .assign(gold_male=gender_it['gold_gender'].eq('male').astype(int))
    .drop(columns=['gold_gender'])
)
print(len(gender_it))

1119
1119
1119
1151
1151
1152
1152
1152
1152
1171
1171


In [131]:
### GENDER IT INNER
### Builds `gender_it`, the table the significance tests run on: one row per
### user that has a gold label AND a prediction from every model (inner joins
### on user_id), one 0/1 "is male" column per model, plus `gold_male`.
### The running row count is printed after every join.

# The join key is cast to float on these frames.
# NOTE(review): float64 represents integers exactly only up to 2**53; if the
# user ids are larger, distinct ids may collapse to the same key - verify.
for frame in (gold_labels_it, m3_bio_it, m3_bio_picture_it, cv_it):
    frame['user_id'] = frame['user_id'].astype(float)

# Derive the 0/1 label columns with vectorised comparisons instead of row-wise apply.
# XLM bio + pictures: male when p(male) is strictly larger than p(female).
XLM_bio_picture_gender_it['XLM_bio_picture_male'] = (
    XLM_bio_picture_gender_it['p_is_male'] > XLM_bio_picture_gender_it['p_is_female']
).astype(int)

# FLAN / GPT frames store the textual prediction; 'male' -> 1, anything else -> 0.
# 'faln_bio_tw_male' is misspelt on purpose: the name is kept because it shows
# up in the saved outputs and result folders of this notebook.
for frame, label_col in (
    (flan_bio_gender, 'flan_bio_male'),
    (flan_bio_tweets_gender, 'faln_bio_tw_male'),
    (gpt_bio_gender, 'gpt_bio_male'),
    (gpt_bio_tweets_gender, 'gpt_bio_tw_male'),
):
    frame[label_col] = (frame['prediction'] == 'male').astype(int)

# The CV frame uses a capitalised label.
cv_it['cv_male'] = (cv_it['pred_gender_label'] == 'Male').astype(int)

# (source frame, column to take from it, column name in gender_it), in join order.
model_columns = (
    (m3_bio_it, 'pred_male', 'M3_bio_male'),
    (m3_bio_picture_it, 'pred_male', 'M3_bio_picture_male'),
    (XLM_bio_gender_it, 'pred_male', 'XLM_bio_male'),
    (XLM_bio_picture_gender_it, 'XLM_bio_picture_male', 'XLM_bio_picture_male'),
    (flan_bio_gender, 'flan_bio_male', 'flan_bio_male'),
    (flan_bio_tweets_gender, 'faln_bio_tw_male', 'faln_bio_tw_male'),
    (gpt_bio_gender, 'gpt_bio_male', 'gpt_bio_male'),
    (gpt_bio_tweets_gender, 'gpt_bio_tw_male', 'gpt_bio_tw_male'),
    (cv_it, 'cv_male', 'cv_male'),
)

gender_it = gold_labels_it[['user_id', 'gold_gender']]
print(len(gender_it))

for frame, source_col, model_col in model_columns:
    # Rename before merging so that two 'pred_male' columns never meet.
    predictions = frame[['user_id', source_col]].rename(columns={source_col: model_col})
    gender_it = gender_it.merge(predictions, on='user_id', how='inner')
    print(len(gender_it))

# As in the original cell, only this model's column is cast to int explicitly.
gender_it['M3_bio_picture_male'] = gender_it['M3_bio_picture_male'].astype(int)

# Gold label as a 0/1 flag; the textual column is no longer needed afterwards.
gender_it['gold_male'] = (gender_it['gold_gender'] == 'male').astype(int)
gender_it = gender_it.drop(columns=['gold_gender'])
print(len(gender_it))

1119
1119
1119
1087
1087
1086
1086
1086
1086
1086
1086


In [132]:
### Row count of every input frame, printed as "<label> <rows>"
frame_sizes = (
    ('gold labels', gold_labels_it),
    ('m3 bio it', m3_bio_it),
    ('m3_bio_picture_it', m3_bio_picture_it),
    ('XLM_bio_gender_it', XLM_bio_gender_it),
    ('XLM_bio_picture_gender_it', XLM_bio_picture_gender_it),
    ('cv_it', cv_it),
    ('flan bio', flan_bio_gender),
    ('flan bio tweets', flan_bio_tweets_gender),
    ('gpt bio', gpt_bio_gender),
    ('gpt tweets bio', gpt_bio_tweets_gender),
)
for label, frame in frame_sizes:
    print(f'{label} {len(frame)}')

gold labels 1119
m3 bio it 1119
m3_bio_picture_it 1119
XLM_bio_gender_it 1119
XLM_bio_picture_gender_it 1119
cv_it 1138
flan bio 1119
flan bio tweets 1119
gpt bio 1119
gpt tweets bio 1119


In [133]:
### Inner-merge every pair of frames on user_id and record how many rows survive.
### The DF numbers in the output are positions in `dataframes`.
dataframes = [
    gold_labels_it,
    m3_bio_it,
    m3_bio_picture_it,
    XLM_bio_gender_it,
    XLM_bio_picture_gender_it,
    cv_it,
    flan_bio_gender,
    flan_bio_tweets_gender,
    gpt_bio_gender,
    gpt_bio_tweets_gender,
]

merged_lengths = []
pair_labels = []

# combinations() yields (0, 1), (0, 2), ... - every unordered pair exactly once.
for i, j in itertools.combinations(range(len(dataframes)), 2):
    merged_df = dataframes[i].merge(dataframes[j], on='user_id', how='inner')
    length = len(merged_df)
    print(f'Merged DataFrame {i} with DataFrame {j}: Length = {length}')
    merged_lengths.append(length)
    pair_labels.append(f'DF {i} with DF {j}')

# Same information as the printout, kept as a table.
result_df = pd.DataFrame({'Pair': pair_labels, 'Merged Length': merged_lengths})

Merged DataFrame 0 with DataFrame 1: Length = 1119
Merged DataFrame 0 with DataFrame 2: Length = 1119
Merged DataFrame 0 with DataFrame 3: Length = 1087
Merged DataFrame 0 with DataFrame 4: Length = 1119
Merged DataFrame 0 with DataFrame 5: Length = 1119
Merged DataFrame 0 with DataFrame 6: Length = 1086
Merged DataFrame 0 with DataFrame 7: Length = 1086
Merged DataFrame 0 with DataFrame 8: Length = 1086
Merged DataFrame 0 with DataFrame 9: Length = 1086
Merged DataFrame 1 with DataFrame 2: Length = 1119
Merged DataFrame 1 with DataFrame 3: Length = 1087
Merged DataFrame 1 with DataFrame 4: Length = 1119
Merged DataFrame 1 with DataFrame 5: Length = 1119
Merged DataFrame 1 with DataFrame 6: Length = 1086
Merged DataFrame 1 with DataFrame 7: Length = 1086
Merged DataFrame 1 with DataFrame 8: Length = 1086
Merged DataFrame 1 with DataFrame 9: Length = 1086
Merged DataFrame 2 with DataFrame 3: Length = 1087
Merged DataFrame 2 with DataFrame 4: Length = 1119
Merged DataFrame 2 with DataFra

# ITALIAN, GENDER
Let's run the significance tests on the users that are present in every model's results (the inner-joined table).

In [135]:
# Every column of gender_it except the join key and the gold label is a model.
columns_to_drop = ['user_id', 'gold_male']
column_names = list(gender_it.columns)
models = [name for name in column_names if name not in columns_to_drop]
models

['M3_bio_male',
 'M3_bio_picture_male',
 'XLM_bio_male',
 'XLM_bio_picture_male',
 'flan_bio_male',
 'faln_bio_tw_male',
 'gpt_bio_male',
 'gpt_bio_tw_male',
 'cv_male']

In [137]:
### All unordered pairs of models; each pair is compared once
model_combinations = [pair for pair in itertools.combinations(models, 2)]
for pair in model_combinations:
    print(pair)

('M3_bio_male', 'M3_bio_picture_male')
('M3_bio_male', 'XLM_bio_male')
('M3_bio_male', 'XLM_bio_picture_male')
('M3_bio_male', 'flan_bio_male')
('M3_bio_male', 'faln_bio_tw_male')
('M3_bio_male', 'gpt_bio_male')
('M3_bio_male', 'gpt_bio_tw_male')
('M3_bio_male', 'cv_male')
('M3_bio_picture_male', 'XLM_bio_male')
('M3_bio_picture_male', 'XLM_bio_picture_male')
('M3_bio_picture_male', 'flan_bio_male')
('M3_bio_picture_male', 'faln_bio_tw_male')
('M3_bio_picture_male', 'gpt_bio_male')
('M3_bio_picture_male', 'gpt_bio_tw_male')
('M3_bio_picture_male', 'cv_male')
('XLM_bio_male', 'XLM_bio_picture_male')
('XLM_bio_male', 'flan_bio_male')
('XLM_bio_male', 'faln_bio_tw_male')
('XLM_bio_male', 'gpt_bio_male')
('XLM_bio_male', 'gpt_bio_tw_male')
('XLM_bio_male', 'cv_male')
('XLM_bio_picture_male', 'flan_bio_male')
('XLM_bio_picture_male', 'faln_bio_tw_male')
('XLM_bio_picture_male', 'gpt_bio_male')
('XLM_bio_picture_male', 'gpt_bio_tw_male')
('XLM_bio_picture_male', 'cv_male')
('flan_bio_male', 

In [138]:
### Empty models x models table that the significance loop fills in
row_names = column_names = models
output = pd.DataFrame(columns=column_names, index=row_names)

In [144]:
### Bootstrapped significance test for every pair of models.
### For each pair, the first model is passed as h0 (baseline) and the second as
### h1; boostsa writes its outcome files into gender_results/<h0>_vs_<h1>/ and
### the numbers are read back from results.tsv to fill `output`:
###   output[h0, h1]      -> F1 difference followed by the significance marker
###   output[model, model] -> F1 of that model

# makedirs(exist_ok=True) instead of mkdir: re-running the cell must not fail
# with FileExistsError on folders created by a previous run.
# Assumes boostsa overwrites results.tsv in an existing folder - TODO confirm.
os.makedirs('gender_results', exist_ok=True)

# The gold labels are the same for every comparison, so convert them once.
gold_targets = gender_it['gold_male'].tolist()

# NOTE(review): no random seed is set in this cell, so the bootstrap counts
# are presumably not reproducible run to run - confirm against boostsa.
for i in model_combinations:
    h0_model, h1_model = i
    file_name = f'{h0_model}_vs_{h1_model}'
    file_path = 'gender_results/'+file_name
    os.makedirs(file_path, exist_ok=True)

    boot = Bootstrap(save_outcomes=True, dir_out=file_path)
    boot.test(targs=gold_targets, h0_preds=gender_it[h0_model].tolist(), h1_preds=gender_it[h1_model].tolist(), n_loops=1000, sample_size=.2, verbose=True)

    new_file_path = file_path+'/results.tsv'
    results = pd.read_csv(new_file_path, sep='\t')

    # This cell reads row 0 as the h0 model and row 1 as the h1 model; the
    # difference and its significance marker are taken from the h1 row.
    d_f1 = results.loc[1, 'd_f1']
    s_f1 = results.loc[1, 's_f1']
    # pd.isna for both fields: unlike math.isnan it does not raise on strings or None.
    diff = '' if pd.isna(d_f1) else str(d_f1)
    signif = '' if pd.isna(s_f1) else str(s_f1)
    output.loc[h0_model, h1_model] = diff + signif
    ## adding F-scores on the diagonal
    output.loc[h0_model, h0_model] = str(results.loc[0, 'f1'])
    output.loc[h1_model, h1_model] = str(results.loc[1, 'f1'])


### Interpreting the verbose log:
### "count sample diff f1 is twice tot diff f1 ... 13 / 1000  p < 0.013" means that in
### 13 of the 1000 bootstrap samples the h1 - h0 difference was at least twice as large
### as the difference measured on the whole dataset.
### The value after "p <" is that count divided by the number of loops: the estimated
### probability of observing such a difference if the null hypothesis were true.
### H0 - the second model (h1) is not better than the first model (h0, the baseline);
### H1 - the second model beats the baseline.
### A small p-value (typically < 0.05) suggests that the observed difference is
### statistically significant and not due to random chance.

data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
F-measure...... - h0: 0.5051  - h1: 0.8114  - diff: 0.3063
precision...... - h0: 0.7281  - h1: 0.8745  - diff: 0.1464
recall......... - h0: 0.5527  - h1: 0.7902  - diff: 0.2375
accuracy....... - h0: 0.6759  - h1: 0.8435  - diff: 0.1676


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.34it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 13   / 1000    p < 0.013  [38;5;9m*[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
F-measure...... - h0: 0.5051  - h1: 0.6364  - diff: 0.1313
precision...... - h0: 0.7281  - h1: 0.7426  - diff: 0.0145
recall......... - h0: 0.5527  - h1: 0.6356  - diff: 0.0829
accuracy....... - h0: 0.6759  - h1: 0.7247  - diff: 0.0488


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.69it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 371  / 1000    p < 0.371  
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 14   / 1000    p < 0.014  [38;5;9m*[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
F-measure...... - h0: 0.5051  - h1: 0.8715  - diff: 0.3664
precision...... - h0: 0.7281  - h1: 0.8783  - diff: 0.1502
recall......... - h0: 0.5527  - h1: 0.8660  - diff: 0.3133
accuracy....... - h0: 0.6759  - h1: 0.8840  - diff: 0.2081


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.16it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 15   / 1000    p < 0.015  [38;5;9m*[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
F-measure...... - h0: 0.5051  - h1: 0.5008  - diff: -0.0043
precision...... - h0: 0.7281  - h1: 0.5296  - diff: -0.1985
recall......... - h0: 0.5527  - h1: 0.5187  - diff: -0.0340
accuracy....... - h0: 0.6759  - h1: 0.6105  - diff: -0.0654


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.40it/s]



count sample diff f1   is twice tot diff f1....... 543  / 1000    p < 0.543  
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 850  / 1000    p < 0.85   
count sample diff acc  is twice tot diff acc...... 990  / 1000    p < 0.99   [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
F-measure...... - h0: 0.5051  - h1: 0.5410  - diff: 0.0359
precision...... - h0: 0.7281  - h1: 0.5415  - diff: -0.1866
recall......... - h0: 0.5527  - h1: 0.5437  - diff: -0.0090
accuracy....... - h0: 0.6759  - h1: 0.5663  - diff: -0.1096


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.50it/s]



count sample diff f1   is twice tot diff f1....... 236  / 1000    p < 0.236  
count sample diff prec is twice tot diff prec..... 998  / 1000    p < 0.998  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 609  / 1000    p < 0.609  
count sample diff acc  is twice tot diff acc...... 988  / 1000    p < 0.988  [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
F-measure...... - h0: 0.5051  - h1: 0.5098  - diff: 0.0047
precision...... - h0: 0.7281  - h1: 0.5432  - diff: -0.1849
recall......... - h0: 0.5527  - h1: 0.5268  - diff: -0.0259
accuracy....... - h0: 0.6759  - h1: 0.6188  - diff: -0.0571


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.53it/s]



count sample diff f1   is twice tot diff f1....... 440  / 1000    p < 0.44   
count sample diff prec is twice tot diff prec..... 994  / 1000    p < 0.994  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 771  / 1000    p < 0.771  
count sample diff acc  is twice tot diff acc...... 965  / 1000    p < 0.965  [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
F-measure...... - h0: 0.5051  - h1: 0.5528  - diff: 0.0477
precision...... - h0: 0.7281  - h1: 0.5595  - diff: -0.1686
recall......... - h0: 0.5527  - h1: 0.5648  - diff: 0.0121
accuracy....... - h0: 0.6759  - h1: 0.5645  - diff: -0.1114


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.49it/s]



count sample diff f1   is twice tot diff f1....... 166  / 1000    p < 0.166  
count sample diff prec is twice tot diff prec..... 998  / 1000    p < 0.998  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 402  / 1000    p < 0.402  
count sample diff acc  is twice tot diff acc...... 992  / 1000    p < 0.992  [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 61 perc 5.62%', 'class 1 freq 1025 perc 94.38%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.5051  - h1: 0.7096  - diff: 0.2045
precision...... - h0: 0.7281  - h1: 0.7634  - diff: 0.0353
recall......... - h0: 0.5527  - h1: 0.6977  - diff: 0.1450
accuracy....... - h0: 0.6759  - h1: 0.7615  - diff: 0.0856


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.52it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 305  / 1000    p < 0.305  
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 5    / 1000    p < 0.005  [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
h1 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
F-measure...... - h0: 0.8114  - h1: 0.6364  - diff: -0.1750
precision...... - h0: 0.8745  - h1: 0.7426  - diff: -0.1319
recall......... - h0: 0.7902  - h1: 0.6356  - diff: -0.1546
accuracy....... - h0: 0.8435  - h1: 0.7247  - diff: -0.1188


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.37it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
h1 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
F-measure...... - h0: 0.8114  - h1: 0.8715  - diff: 0.0601
precision...... - h0: 0.8745  - h1: 0.8783  - diff: 0.0038
recall......... - h0: 0.7902  - h1: 0.8660  - diff: 0.0758
accuracy....... - h0: 0.8435  - h1: 0.8840  - diff: 0.0405


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.34it/s]



count sample diff f1   is twice tot diff f1....... 50   / 1000    p < 0.05   
count sample diff prec is twice tot diff prec..... 438  / 1000    p < 0.438  
count sample diff rec  is twice tot diff rec ..... 21   / 1000    p < 0.021  [38;5;9m*[0m
count sample diff acc  is twice tot diff acc...... 87   / 1000    p < 0.087  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
h1 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
F-measure...... - h0: 0.8114  - h1: 0.5008  - diff: -0.3106
precision...... - h0: 0.8745  - h1: 0.5296  - diff: -0.3449
recall......... - h0: 0.7902  - h1: 0.5187  - diff: -0.2715
accuracy....... - h0: 0.8435  - h1: 0.6105  - diff: -0.2330


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.49it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
h1 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
F-measure...... - h0: 0.8114  - h1: 0.5410  - diff: -0.2704
precision...... - h0: 0.8745  - h1: 0.5415  - diff: -0.3330
recall......... - h0: 0.7902  - h1: 0.5437  - diff: -0.2465
accuracy....... - h0: 0.8435  - h1: 0.5663  - diff: -0.2772


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.28it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
h1 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
F-measure...... - h0: 0.8114  - h1: 0.5098  - diff: -0.3016
precision...... - h0: 0.8745  - h1: 0.5432  - diff: -0.3313
recall......... - h0: 0.7902  - h1: 0.5268  - diff: -0.2634
accuracy....... - h0: 0.8435  - h1: 0.6188  - diff: -0.2247


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.15it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
h1 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
F-measure...... - h0: 0.8114  - h1: 0.5528  - diff: -0.2586
precision...... - h0: 0.8745  - h1: 0.5595  - diff: -0.3150
recall......... - h0: 0.7902  - h1: 0.5648  - diff: -0.2254
accuracy....... - h0: 0.8435  - h1: 0.5645  - diff: -0.2790


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.88it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 251 perc 23.11%', 'class 1 freq 835 perc 76.89%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.8114  - h1: 0.7096  - diff: -0.1018
precision...... - h0: 0.8745  - h1: 0.7634  - diff: -0.1111
recall......... - h0: 0.7902  - h1: 0.6977  - diff: -0.0925
accuracy....... - h0: 0.8435  - h1: 0.7615  - diff: -0.0820


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.92it/s]



count sample diff f1   is twice tot diff f1....... 996  / 1000    p < 0.996  [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 996  / 1000    p < 0.996  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 994  / 1000    p < 0.994  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 999  / 1000    p < 0.999  [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
h1 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
F-measure...... - h0: 0.6364  - h1: 0.8715  - diff: 0.2351
precision...... - h0: 0.7426  - h1: 0.8783  - diff: 0.1357
recall......... - h0: 0.6356  - h1: 0.8660  - diff: 0.2304
accuracy....... - h0: 0.7247  - h1: 0.8840  - diff: 0.1593


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.02it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 1    / 1000    p < 0.001  [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
h1 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
F-measure...... - h0: 0.6364  - h1: 0.5008  - diff: -0.1356
precision...... - h0: 0.7426  - h1: 0.5296  - diff: -0.2130
recall......... - h0: 0.6356  - h1: 0.5187  - diff: -0.1169
accuracy....... - h0: 0.7247  - h1: 0.6105  - diff: -0.1142


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.10it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
h1 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
F-measure...... - h0: 0.6364  - h1: 0.5410  - diff: -0.0954
precision...... - h0: 0.7426  - h1: 0.5415  - diff: -0.2011
recall......... - h0: 0.6356  - h1: 0.5437  - diff: -0.0919
accuracy....... - h0: 0.7247  - h1: 0.5663  - diff: -0.1584


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.79it/s]



count sample diff f1   is twice tot diff f1....... 976  / 1000    p < 0.976  [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 984  / 1000    p < 0.984  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
h1 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
F-measure...... - h0: 0.6364  - h1: 0.5098  - diff: -0.1266
precision...... - h0: 0.7426  - h1: 0.5432  - diff: -0.1994
recall......... - h0: 0.6356  - h1: 0.5268  - diff: -0.1088
accuracy....... - h0: 0.7247  - h1: 0.6188  - diff: -0.1059


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.20it/s]



count sample diff f1   is twice tot diff f1....... 999  / 1000    p < 0.999  [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 999  / 1000    p < 0.999  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 999  / 1000    p < 0.999  [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
h1 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
F-measure...... - h0: 0.6364  - h1: 0.5528  - diff: -0.0836
precision...... - h0: 0.7426  - h1: 0.5595  - diff: -0.1831
recall......... - h0: 0.6356  - h1: 0.5648  - diff: -0.0708
accuracy....... - h0: 0.7247  - h1: 0.5645  - diff: -0.1602


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.36it/s]



count sample diff f1   is twice tot diff f1....... 948  / 1000    p < 0.948  
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 935  / 1000    p < 0.935  
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 164 perc 15.10%', 'class 1 freq 922 perc 84.90%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.6364  - h1: 0.7096  - diff: 0.0732
precision...... - h0: 0.7426  - h1: 0.7634  - diff: 0.0208
recall......... - h0: 0.6356  - h1: 0.6977  - diff: 0.0621
accuracy....... - h0: 0.7247  - h1: 0.7615  - diff: 0.0368


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.26it/s]



count sample diff f1   is twice tot diff f1....... 58   / 1000    p < 0.058  
count sample diff prec is twice tot diff prec..... 365  / 1000    p < 0.365  
count sample diff rec  is twice tot diff rec ..... 64   / 1000    p < 0.064  
count sample diff acc  is twice tot diff acc...... 165  / 1000    p < 0.165  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
h1 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
F-measure...... - h0: 0.8715  - h1: 0.5008  - diff: -0.3707
precision...... - h0: 0.8783  - h1: 0.5296  - diff: -0.3487
recall......... - h0: 0.8660  - h1: 0.5187  - diff: -0.3473
accuracy....... - h0: 0.8840  - h1: 0.6105  - diff: -0.2735


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.36it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
h1 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
F-measure...... - h0: 0.8715  - h1: 0.5410  - diff: -0.3305
precision...... - h0: 0.8783  - h1: 0.5415  - diff: -0.3368
recall......... - h0: 0.8660  - h1: 0.5437  - diff: -0.3223
accuracy....... - h0: 0.8840  - h1: 0.5663  - diff: -0.3177


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.19it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
h1 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
F-measure...... - h0: 0.8715  - h1: 0.5098  - diff: -0.3617
precision...... - h0: 0.8783  - h1: 0.5432  - diff: -0.3351
recall......... - h0: 0.8660  - h1: 0.5268  - diff: -0.3392
accuracy....... - h0: 0.8840  - h1: 0.6188  - diff: -0.2652


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.16it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
h1 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
F-measure...... - h0: 0.8715  - h1: 0.5528  - diff: -0.3187
precision...... - h0: 0.8783  - h1: 0.5595  - diff: -0.3188
recall......... - h0: 0.8660  - h1: 0.5648  - diff: -0.3012
accuracy....... - h0: 0.8840  - h1: 0.5645  - diff: -0.3195


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.03it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 361 perc 33.24%', 'class 1 freq 725 perc 66.76%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.8715  - h1: 0.7096  - diff: -0.1619
precision...... - h0: 0.8783  - h1: 0.7634  - diff: -0.1149
recall......... - h0: 0.8660  - h1: 0.6977  - diff: -0.1683
accuracy....... - h0: 0.8840  - h1: 0.7615  - diff: -0.1225


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.67it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 995  / 1000    p < 0.995  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
h1 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
F-measure...... - h0: 0.5008  - h1: 0.5410  - diff: 0.0402
precision...... - h0: 0.5296  - h1: 0.5415  - diff: 0.0119
recall......... - h0: 0.5187  - h1: 0.5437  - diff: 0.0250
accuracy....... - h0: 0.6105  - h1: 0.5663  - diff: -0.0442


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.12it/s]



count sample diff f1   is twice tot diff f1....... 160  / 1000    p < 0.16   
count sample diff prec is twice tot diff prec..... 382  / 1000    p < 0.382  
count sample diff rec  is twice tot diff rec ..... 240  / 1000    p < 0.24   
count sample diff acc  is twice tot diff acc...... 875  / 1000    p < 0.875  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
h1 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
F-measure...... - h0: 0.5008  - h1: 0.5098  - diff: 0.0090
precision...... - h0: 0.5296  - h1: 0.5432  - diff: 0.0136
recall......... - h0: 0.5187  - h1: 0.5268  - diff: 0.0081
accuracy....... - h0: 0.6105  - h1: 0.6188  - diff: 0.0083


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.18it/s]



count sample diff f1   is twice tot diff f1....... 396  / 1000    p < 0.396  
count sample diff prec is twice tot diff prec..... 368  / 1000    p < 0.368  
count sample diff rec  is twice tot diff rec ..... 386  / 1000    p < 0.386  
count sample diff acc  is twice tot diff acc...... 359  / 1000    p < 0.359  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
h1 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
F-measure...... - h0: 0.5008  - h1: 0.5528  - diff: 0.0520
precision...... - h0: 0.5296  - h1: 0.5595  - diff: 0.0299
recall......... - h0: 0.5187  - h1: 0.5648  - diff: 0.0461
accuracy....... - h0: 0.6105  - h1: 0.5645  - diff: -0.0460


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.06it/s]



count sample diff f1   is twice tot diff f1....... 116  / 1000    p < 0.116  
count sample diff prec is twice tot diff prec..... 274  / 1000    p < 0.274  
count sample diff rec  is twice tot diff rec ..... 120  / 1000    p < 0.12   
count sample diff acc  is twice tot diff acc...... 847  / 1000    p < 0.847  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 190 perc 17.50%', 'class 1 freq 896 perc 82.50%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.5008  - h1: 0.7096  - diff: 0.2088
precision...... - h0: 0.5296  - h1: 0.7634  - diff: 0.2338
recall......... - h0: 0.5187  - h1: 0.6977  - diff: 0.1790
accuracy....... - h0: 0.6105  - h1: 0.7615  - diff: 0.1510


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.89it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
h1 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
F-measure...... - h0: 0.5410  - h1: 0.5098  - diff: -0.0312
precision...... - h0: 0.5415  - h1: 0.5432  - diff: 0.0017
recall......... - h0: 0.5437  - h1: 0.5268  - diff: -0.0169
accuracy....... - h0: 0.5663  - h1: 0.6188  - diff: 0.0525


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.04it/s]



count sample diff f1   is twice tot diff f1....... 756  / 1000    p < 0.756  
count sample diff prec is twice tot diff prec..... 467  / 1000    p < 0.467  
count sample diff rec  is twice tot diff rec ..... 669  / 1000    p < 0.669  
count sample diff acc  is twice tot diff acc...... 79   / 1000    p < 0.079  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
h1 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
F-measure...... - h0: 0.5410  - h1: 0.5528  - diff: 0.0118
precision...... - h0: 0.5415  - h1: 0.5595  - diff: 0.0180
recall......... - h0: 0.5437  - h1: 0.5648  - diff: 0.0211
accuracy....... - h0: 0.5663  - h1: 0.5645  - diff: -0.0018


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.11it/s]



count sample diff f1   is twice tot diff f1....... 356  / 1000    p < 0.356  
count sample diff prec is twice tot diff prec..... 287  / 1000    p < 0.287  
count sample diff rec  is twice tot diff rec ..... 270  / 1000    p < 0.27   
count sample diff acc  is twice tot diff acc...... 519  / 1000    p < 0.519  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 444 perc 40.88%', 'class 1 freq 642 perc 59.12%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.5410  - h1: 0.7096  - diff: 0.1686
precision...... - h0: 0.5415  - h1: 0.7634  - diff: 0.2219
recall......... - h0: 0.5437  - h1: 0.6977  - diff: 0.1540
accuracy....... - h0: 0.5663  - h1: 0.7615  - diff: 0.1952


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.58it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
h1 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
F-measure...... - h0: 0.5098  - h1: 0.5528  - diff: 0.0430
precision...... - h0: 0.5432  - h1: 0.5595  - diff: 0.0163
recall......... - h0: 0.5268  - h1: 0.5648  - diff: 0.0380
accuracy....... - h0: 0.6188  - h1: 0.5645  - diff: -0.0543


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.75it/s]



count sample diff f1   is twice tot diff f1....... 149  / 1000    p < 0.149  
count sample diff prec is twice tot diff prec..... 371  / 1000    p < 0.371  
count sample diff rec  is twice tot diff rec ..... 156  / 1000    p < 0.156  
count sample diff acc  is twice tot diff acc...... 899  / 1000    p < 0.899  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 187 perc 17.22%', 'class 1 freq 899 perc 82.78%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.5098  - h1: 0.7096  - diff: 0.1998
precision...... - h0: 0.5432  - h1: 0.7634  - diff: 0.2202
recall......... - h0: 0.5268  - h1: 0.6977  - diff: 0.1709
accuracy....... - h0: 0.6188  - h1: 0.7615  - diff: 0.1427


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 122.01it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 387 perc 35.64%', 'class 1 freq 699 perc 64.36%']
h0 preds count: ['class 0 freq 524 perc 48.25%', 'class 1 freq 562 perc 51.75%']
h1 preds count: ['class 0 freq 240 perc 22.10%', 'class 1 freq 846 perc 77.90%']
F-measure...... - h0: 0.5528  - h1: 0.7096  - diff: 0.1568
precision...... - h0: 0.5595  - h1: 0.7634  - diff: 0.2039
recall......... - h0: 0.5648  - h1: 0.6977  - diff: 0.1329
accuracy....... - h0: 0.5645  - h1: 0.7615  - diff: 0.1970


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 121.81it/s]


count sample diff f1   is twice tot diff f1....... 2    / 1000    p < 0.002  [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 5    / 1000    p < 0.005  [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m





In [146]:
# Write the pairwise F-score comparison table to a CSV file in the current working directory.
output.to_csv('F_score_diff_for_gender.csv')

In [145]:
# Display the table: each model's own F-score on the diagonal, and the F-score
# difference followed by the significance marker in the cells above the diagonal.
output

Unnamed: 0,M3_bio_male,M3_bio_picture_male,XLM_bio_male,XLM_bio_picture_male,flan_bio_male,faln_bio_tw_male,gpt_bio_male,gpt_bio_tw_male,cv_male
M3_bio_male,0.5051,0.3063**,0.1313**,0.3664**,-0.0043,0.0359,0.0047,0.0477,0.2045**
M3_bio_picture_male,,0.8114,-0.175!,0.0601,-0.3106!,-0.2704!,-0.3016!,-0.2586!,-0.1018!
XLM_bio_male,,,0.6364,0.2351**,-0.1356!,-0.0954!,-0.1266!,-0.0836,0.0732
XLM_bio_picture_male,,,,0.8715,-0.3707!,-0.3305!,-0.3617!,-0.3187!,-0.1619!
flan_bio_male,,,,,0.5008,0.0402,0.009,0.052,0.2088**
faln_bio_tw_male,,,,,,0.541,-0.0312,0.0118,0.1686**
gpt_bio_male,,,,,,,0.5098,0.043,0.1998**
gpt_bio_tw_male,,,,,,,,0.5528,0.1568**
cv_male,,,,,,,,,0.7096


In [None]:
#### In `output` I added the difference in F-score together with its significance level.
#### In the package, significance levels are defined as follows: two stars ** indicate significance with p≤.01;
#### one star * indicates significance with p≤.05.

In [149]:
# Start from an empty square table (one row and one column per model);
# cells stay NaN until they are filled in.
row_names = column_names = models
output = pd.DataFrame(columns=column_names, index=row_names)

In [151]:
### the same for ACCURACY
# Re-read the results saved for every model pair and copy each model's accuracy
# onto the diagonal of `output` (row 0 of results.tsv is h0, row 1 is h1).
# NOTE(review): unlike the F-score table, no accuracy differences / significance
# markers are written to the off-diagonal cells here - confirm this is intended.
for i in model_combinations:
    first_model, second_model = i[0], i[1]
    file_name = f'{first_model}_vs_{second_model}'
    file_path = 'gender_results/' + file_name
    new_file_path = file_path + '/results.tsv'
    results = pd.read_csv(new_file_path, sep='\t')
    accuracy = results['acc']
    output.loc[first_model, first_model] = str(accuracy[0])
    output.loc[second_model, second_model] = str(accuracy[1])

output

Unnamed: 0,M3_bio_male,M3_bio_picture_male,XLM_bio_male,XLM_bio_picture_male,flan_bio_male,faln_bio_tw_male,gpt_bio_male,gpt_bio_tw_male,cv_male
M3_bio_male,0.6759,,,,,,,,
M3_bio_picture_male,,0.8435,,,,,,,
XLM_bio_male,,,0.7247,,,,,,
XLM_bio_picture_male,,,,0.884,,,,,
flan_bio_male,,,,,0.6105,,,,
faln_bio_tw_male,,,,,,0.5663,,,
gpt_bio_male,,,,,,,0.6188,,
gpt_bio_tw_male,,,,,,,,0.5645,
cv_male,,,,,,,,,0.7615


# ITALIAN, AGE

In [159]:
### downloading data it
# Loads the gold labels and every model's test-set predictions for Italian users.
path = '/g100_work/IscrC_mental/data/user_classification/trained_models/it/'

# gold labels
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
file = 'gold_labels_test.pkl'
with open(path+file, 'rb') as gold_labels_fh:  # separate handle name so `file` is not shadowed
    gold_labels_it = pickle.load(gold_labels_fh)

# M3 bio
file = 'm3_test_age_gender.csv'
m3_bio_it = pd.read_csv(path+file)

# M3 bio + pictures
# m3_bio_picture_it = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/m3_scores_bio_image.pkl')
file = 'm3_bio_image_test_age_gender.csv'
m3_bio_picture_it = pd.read_csv(path+file)


# XLM bio
XLM_bio_gender_it = pd.read_csv('/g100_work/IscrC_mental/data/user_classification/trained_models/it/xlm_bio_only_test_gender.csv')
XLM_bio_age_it = pd.read_csv('/g100_work/IscrC_mental/data/user_classification/trained_models/it/xlm_bio_only_test_age.csv')

# XLM bios  + pictures
# Each .npy array is loaded as a frame whose first column is the user id and whose
# remaining columns are the per-class probabilities named below.

XLM_bio_picture_age_it = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/age/XLM_probs_age_test.npy'))
XLM_bio_picture_age_it.columns = ['user_id', 'pred_age_0_19_prob', 'pred_age_20_29_prob', 'pred_age_30_39_prob', 'pred_age_40_100_prob']
XLM_bio_picture_age_it.user_id = XLM_bio_picture_age_it.user_id.astype(float)

XLM_bio_picture_gender_it = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/gender/XLM_probs_gender_test.npy'))
XLM_bio_picture_gender_it.columns = ['user_id', 'p_is_female', 'p_is_male']
# Cast the gender frame's OWN ids. The previous code assigned the age frame's
# user_id here, which would mismatch ids and probabilities whenever the two
# .npy files are not in the same row order.
XLM_bio_picture_gender_it.user_id = XLM_bio_picture_gender_it.user_id.astype(float)


# FLAN-T5 bio
file = 'flan_bio_only_test_gender.csv'
flan_bio_gender = pd.read_csv(path+file)

file = 'flan_bio_only_test_age.csv'
flan_bio_age = pd.read_csv(path+file)

# FLAN-T5 bio + tweets

file = 'flan_bio_tweets_test_gender.csv'
flan_bio_tweets_gender = pd.read_csv(path+file)

file = 'flan_bio_tweets_test_age.csv'
flan_bio_tweets_age =  pd.read_csv(path+file)


# GPT 3.5 bio

file = 'gpt35_bio_test_age.csv'
gpt_bio_age = pd.read_csv(path+file)

file = 'gpt35_bio_test_gender.csv'
gpt_bio_gender = pd.read_csv(path+file)


# GPT 3.5 bio + tweets

file =  'gpt35_bio_tweets_test_age.csv'
gpt_bio_tweets_age = pd.read_csv(path+file)

file = 'gpt35_bio_tweets_test_gender.csv'
gpt_bio_tweets_gender = pd.read_csv(path+file)


# CV how = 'outer'
file = 'cv_test.pkl'
cv_it = pd.read_pickle(path+file)


In [161]:
# Inspect the M3 bio + picture frame (columns: user_id, male, age_class, pred_male, pred_age_class).
m3_bio_picture_it

Unnamed: 0,user_id,male,age_class,pred_male,pred_age_class
0,7.071362e+06,1,3,1.0,3.0
1,9.420092e+06,0,2,0.0,2.0
2,1.174941e+07,0,2,1.0,2.0
3,1.408858e+07,0,3,0.0,3.0
4,1.428183e+07,1,3,1.0,1.0
...,...,...,...,...,...
1114,1.492599e+18,1,1,1.0,3.0
1115,1.500578e+18,0,2,0.0,2.0
1116,1.511727e+18,1,3,1.0,3.0
1117,1.578278e+18,0,1,0.0,0.0


In [None]:
#### Use inner joins when calculating the results

In [162]:
# Build `age_it`: the gold age label plus one prediction column per model, joined on user_id.
# `merge` uses its default inner join, so a user missing from any model is dropped;
# the length printed after every step shows where rows are lost.
gold_labels_it['user_id'] = gold_labels_it['user_id'].astype(float)
age_it = gold_labels_it[['user_id', 'gold_age']]
print(len(age_it))

# M3, bio only
m3_bio_it['user_id'] = m3_bio_it['user_id'].astype(float)
age_it = (
    age_it
    .merge(m3_bio_it[['user_id', 'pred_age_class']], on='user_id')
    .rename(columns={'pred_age_class': 'M3_bio_age'})
)
print(len(age_it))

# M3, bio + picture (the predicted class is cast to int before joining)
m3_bio_picture_it['user_id'] = m3_bio_picture_it['user_id'].astype(float)
m3_bio_picture_it['pred_age_class'] = m3_bio_picture_it['pred_age_class'].astype(int)
age_it = (
    age_it
    .merge(m3_bio_picture_it[['user_id', 'pred_age_class']], on='user_id')
    .rename(columns={'pred_age_class': 'M3_bio_picture_age'})
)
print(len(age_it))

# XLM, bio only
age_it = (
    age_it
    .merge(XLM_bio_age_it[['user_id', 'pred_age_class']], on='user_id')
    .rename(columns={'pred_age_class': 'XLM_bio_age'})
)
print(len(age_it))

# XLM, bio + picture: the predicted class is the age bucket with the highest probability
age_probability_columns = [
    'pred_age_0_19_prob',
    'pred_age_20_29_prob',
    'pred_age_30_39_prob',
    'pred_age_40_100_prob',
]
age_category_mapping = {
    column: bucket for bucket, column in enumerate(age_probability_columns)
}
max_age_category = XLM_bio_picture_age_it[age_probability_columns].idxmax(axis=1)
XLM_bio_picture_age_it['XLM_bio_picture_age'] = max_age_category.map(age_category_mapping)
age_it = age_it.merge(
    XLM_bio_picture_age_it[['user_id', 'XLM_bio_picture_age']],
    on='user_id',
)
print(len(age_it))

# FLAN-T5 and GPT-3.5 frames all expose their prediction in a `prediction` column
for _model_column, _model_preds in (
    ('flan_bio_age', flan_bio_age),
    ('flan_bio_tweets_age', flan_bio_tweets_age),
    ('gpt_bio_age', gpt_bio_age),
    ('gpt_bio_tweets_age', gpt_bio_tweets_age),
):
    age_it = (
        age_it
        .merge(_model_preds[['user_id', 'prediction']], on='user_id')
        .rename(columns={'prediction': _model_column})
    )
    print(len(age_it))

# CV
cv_it['user_id'] = cv_it['user_id'].astype(float)
age_it = (
    age_it
    .merge(cv_it[['user_id', 'pred_age_label_id']], on='user_id')
    .rename(columns={'pred_age_label_id': 'CV_bio_age'})
)
print(len(age_it))

1119
1119
1119
1087
1087
1086
1086
1086
1086
1086


In [163]:
# Every column of `age_it` other than the join key and the gold label is treated as a model.
columns_to_drop = ['user_id', 'gold_age']
column_names = list(age_it.columns)
models = list(filter(lambda name: name not in columns_to_drop, column_names))
models

['M3_bio_age',
 'M3_bio_picture_age',
 'XLM_bio_age',
 'XLM_bio_picture_age',
 'flan_bio_age',
 'flan_bio_tweets_age',
 'gpt_bio_age',
 'gpt_bio_tweets_age',
 'CV_bio_age']

In [164]:
# All unordered pairs of models (each pair appears once, in the order of `models`).
# `itertools` is already imported in the notebook's import cell, so it is not
# re-imported here.
model_combinations = list(itertools.combinations(models, 2))
for combo in model_combinations:
    print(combo)

('M3_bio_age', 'M3_bio_picture_age')
('M3_bio_age', 'XLM_bio_age')
('M3_bio_age', 'XLM_bio_picture_age')
('M3_bio_age', 'flan_bio_age')
('M3_bio_age', 'flan_bio_tweets_age')
('M3_bio_age', 'gpt_bio_age')
('M3_bio_age', 'gpt_bio_tweets_age')
('M3_bio_age', 'CV_bio_age')
('M3_bio_picture_age', 'XLM_bio_age')
('M3_bio_picture_age', 'XLM_bio_picture_age')
('M3_bio_picture_age', 'flan_bio_age')
('M3_bio_picture_age', 'flan_bio_tweets_age')
('M3_bio_picture_age', 'gpt_bio_age')
('M3_bio_picture_age', 'gpt_bio_tweets_age')
('M3_bio_picture_age', 'CV_bio_age')
('XLM_bio_age', 'XLM_bio_picture_age')
('XLM_bio_age', 'flan_bio_age')
('XLM_bio_age', 'flan_bio_tweets_age')
('XLM_bio_age', 'gpt_bio_age')
('XLM_bio_age', 'gpt_bio_tweets_age')
('XLM_bio_age', 'CV_bio_age')
('XLM_bio_picture_age', 'flan_bio_age')
('XLM_bio_picture_age', 'flan_bio_tweets_age')
('XLM_bio_picture_age', 'gpt_bio_age')
('XLM_bio_picture_age', 'gpt_bio_tweets_age')
('XLM_bio_picture_age', 'CV_bio_age')
('flan_bio_age', 'flan

In [165]:
# Empty model-by-model grid (every cell starts as NaN); rows and columns are
# both labelled with the model names.
row_names = column_names = models
output = pd.DataFrame(columns=column_names, index=row_names)

In [168]:
# Pairwise bootstrap comparison of all age models.
#
# For every (h0, h1) pair a Bootstrap run writes its files into
# age_results/<h0>_vs_<h1>/. The F1 difference and its significance marker are
# then read back from results.tsv and stored in output[h0, h1]; each model's
# own F1 goes on the diagonal.
#
# NOTE(review): the printed log shows p close to 1 (marked '!') for pairs where
# h1 scores lower than h0, so the test looks one-directional - confirm how such
# cells of `output` should be interpreted.
# NOTE(review): no random seed is set here, so the resampling counts may differ
# between runs - confirm whether boostsa exposes a way to seed it.

# exist_ok=True: re-running this cell must not fail on directories left behind
# by a previous run.
os.makedirs('age_results', exist_ok=True)

for i in model_combinations:
    h0_model, h1_model = i
    file_name = f'{h0_model}_vs_{h1_model}'
    file_path = 'age_results/' + file_name
    os.makedirs(file_path, exist_ok=True)

    boot = Bootstrap(save_outcomes=True, dir_out=file_path)
    boot.test(
        targs=age_it['gold_age'].tolist(),
        h0_preds=age_it[h0_model].tolist(),
        h1_preds=age_it[h1_model].tolist(),
        n_loops=1000,
        sample_size=.2,
        verbose=True,
    )

    new_file_path = file_path + '/results.tsv'
    results = pd.read_csv(new_file_path, sep='\t')

    # Row 0 is used as the h0 entry and row 1 as the h1 entry; the difference
    # and significance columns are read from the h1 row.
    # pd.isna is used for both checks: it accepts any cell type, whereas
    # math.isnan raises TypeError on non-float values.
    d_f1 = results.loc[1, 'd_f1']
    s_f1 = results.loc[1, 's_f1']
    diff = '' if pd.isna(d_f1) else str(d_f1)
    signif = '' if pd.isna(s_f1) else str(s_f1)
    output.loc[h0_model, h1_model] = diff + signif

    ## adding F-scores
    output.loc[h0_model, h0_model] = str(results.loc[0, 'f1'])
    output.loc[h1_model, h1_model] = str(results.loc[1, 'f1'])

data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
F-measure...... - h0: 0.1763  - h1: 0.4272  - diff: 0.2509
precision...... - h0: 0.3379  - h1: 0.4410  - diff: 0.1031
recall......... - h0: 0.3120  - h1: 0.4265  - diff: 0.1145
accuracy....... - h0: 0.1786  - h1: 0.5405  - diff: 0.3619


bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 123.58it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 52   / 1000    p < 0.052  
count sample diff rec  is twice tot diff rec ..... 3    / 1000    p < 0.003  [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
F-measure...... - h0: 0.1763  - h1: 0.3389  - diff: 0.1626
precision...... - h0: 0.3379  - h1: 0.3981  - diff: 0.0602
recall......... - h0: 0.3120  - h1: 0.34

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.91it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 240  / 1000    p < 0.24   
count sample diff rec  is twice tot diff rec ..... 218  / 1000    p < 0.218  
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
F-measure...... - h0: 0.1763  - h1: 0.5986  - diff: 0.4223
precision...... - h0: 0.3379  - h1: 0.6389  - diff: 0.3010
recall......... - h0: 0.3120  - h1: 0.5786  - diff: 0.2

bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.74it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
F-measure...... - h0: 0.1763  - h1: 0.1454  - diff: -0.0309
precision...... - h0: 0.3379  - h1: 0.2867  - diff: -0.0512
recall......... - h0: 0.3

  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.15it/s]



count sample diff f1   is twice tot diff f1....... 789  / 1000    p < 0.789  
count sample diff prec is twice tot diff prec..... 746  / 1000    p < 0.746  
count sample diff rec  is twice tot diff rec ..... 836  / 1000    p < 0.836  
count sample diff acc  is twice tot diff acc...... 792  / 1000    p < 0.792  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
F-measure...... - h0: 0.1763  - h1: 0.2621  - diff: 0.0858
precision...... - h0: 0.3379  - h1: 0.4440  - diff: 0.1061
recall......... - h0: 0.3120  - h1: 0.2947  - diff: -0.0173
accuracy....... - h0: 0.1786

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 26   / 1000    p < 0.026  [38;5;9m*[0m
count sample diff prec is twice tot diff prec..... 168  / 1000    p < 0.168  
count sample diff rec  is twice tot diff rec ..... 708  / 1000    p < 0.708  
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
F-measure...... - h0: 0.1763  - h1: 0.1753  - diff: -0.0010
precision...... - h0: 0.3379  - h1: 0.2810  - diff: -0.0569
recall......... - h0: 0.3120  - h1: 0.2852  - diff: -0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.73it/s]



count sample diff f1   is twice tot diff f1....... 507  / 1000    p < 0.507  
count sample diff prec is twice tot diff prec..... 763  / 1000    p < 0.763  
count sample diff rec  is twice tot diff rec ..... 713  / 1000    p < 0.713  
count sample diff acc  is twice tot diff acc...... 590  / 1000    p < 0.59   
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
F-measure...... - h0: 0.1763  - h1: 0.2103  - diff: 0.0340
precision...... - h0: 0.3379  - h1: 0.2541  - diff: -0.0838
recall......... - h0: 0.3120  - h1: 0.2563  - diff: -0.0557
accuracy....... - h0: 

bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.15it/s]



count sample diff f1   is twice tot diff f1....... 170  / 1000    p < 0.17   
count sample diff prec is twice tot diff prec..... 913  / 1000    p < 0.913  
count sample diff rec  is twice tot diff rec ..... 892  / 1000    p < 0.892  
count sample diff acc  is twice tot diff acc...... 152  / 1000    p < 0.152  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 830 perc 76.43%', 'class 1 freq 62 perc 5.71%', 'class 2 freq 130 perc 11.97%', 'class 3 freq 64 perc 5.89%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.1763  - h1: 0.2796  - diff: 0.1033
precision...... - h0: 0.3379  - h1: 0.2935  - diff: -0.0444
recall......... - h0: 0.3120  - h1: 0.3191  - diff: 0.0071
accuracy....... - h0: 0.1

bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.17it/s]



count sample diff f1   is twice tot diff f1....... 3    / 1000    p < 0.003  [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 749  / 1000    p < 0.749  
count sample diff rec  is twice tot diff rec ..... 418  / 1000    p < 0.418  
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
h1 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
F-measure...... - h0: 0.4272  - h1: 0.3389  - diff: -0.0883
precision...... - h0: 0.4410  - h1: 0.3981  - diff: -0.0429
recall......... - h0: 0.4265  - h1: 0.3403  - dif

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.85it/s]



count sample diff f1   is twice tot diff f1....... 969  / 1000    p < 0.969  [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 709  / 1000    p < 0.709  
count sample diff rec  is twice tot diff rec ..... 975  / 1000    p < 0.975  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 749  / 1000    p < 0.749  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
h1 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
F-measure...... - h0: 0.4272  - h1: 0.5986  - diff: 0.1714
precision...... - h0: 0.4410  - h1: 0.6389  - diff: 0.1979
recall......... - h0: 0.4265  - h1: 0.5786  - diff: 0

bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.10it/s]



count sample diff f1   is twice tot diff f1....... 1    / 1000    p < 0.001  [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 2    / 1000    p < 0.002  [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 2    / 1000    p < 0.002  [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
h1 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
F-measure...... - h0: 0.4272  - h1: 0.1454  - diff: -0.2818
precision...... - h0: 0.4410  - h1: 0.2867  - diff: -0.1543
recall......... - h0:

bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.18it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 988  / 1000    p < 0.988  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
h1 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
F-measure...... - h0: 0.4272  - h1: 0.2621  - diff: -0.1651
precision...... - h0: 0.4410  - h1: 0.4440  - diff: 0.0030
recall......... - h0: 0.426

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 999  / 1000    p < 0.999  [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 393  / 1000    p < 0.393  
count sample diff rec  is twice tot diff rec ..... 999  / 1000    p < 0.999  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 847  / 1000    p < 0.847  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
h1 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
F-measure...... - h0: 0.4272  - h1: 0.1753  - diff: -0.2519
precision...... - h0: 0.4410  - h1: 0.2810  - diff: -0.1600
recall......... - h0: 0.4265  - h1: 0.2852  - diff:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.59it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 975  / 1000    p < 0.975  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 996  / 1000    p < 0.996  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
h1 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
F-measure...... - h0: 0.4272  - h1: 0.2103  - diff: -0.2169
precision...... - h0: 0.4410  - h1: 0.2541  - diff: -0.1869
recall......... - h0:

bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.95it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 998  / 1000    p < 0.998  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 123 perc 11.33%', 'class 1 freq 140 perc 12.89%', 'class 2 freq 171 perc 15.75%', 'class 3 freq 652 perc 60.04%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.4272  - h1: 0.2796  - diff: -0.1476
precision...... - h0: 0.4410  - h1: 0.2935  - diff: -0.1475
recall......... - h0: 0

bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.53it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 999  / 1000    p < 0.999  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 987  / 1000    p < 0.987  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
h1 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
F-measure...... - h0: 0.3389  - h1: 0.5986  - diff: 0.2597
precision...... - h0: 0.3981  - h1: 0.6389  - diff: 0.2408
recall......... - h0: 0.340

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.67it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff rec  is twice tot diff rec ..... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
h1 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
F-measure...... - h0: 0.3389  - h1: 0.1454  - diff: -0.1935
precision...... - h0: 0.3981  - h1: 0.2867  - diff: -0.1114
recall......... - h0: 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.81it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 908  / 1000    p < 0.908  
count sample diff rec  is twice tot diff rec ..... 953  / 1000    p < 0.953  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
h1 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
F-measure...... - h0: 0.3389  - h1: 0.2621  - diff: -0.0768
precision...... - h0: 0.3981  - h1: 0.4440  - diff: 0.0459
recall......... - h0: 0.3403  - h1: 0.2947 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 956  / 1000    p < 0.956  [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 280  / 1000    p < 0.28   
count sample diff rec  is twice tot diff rec ..... 907  / 1000    p < 0.907  
count sample diff acc  is twice tot diff acc...... 695  / 1000    p < 0.695  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
h1 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
F-measure...... - h0: 0.3389  - h1: 0.1753  - diff: -0.1636
precision...... - h0: 0.3981  - h1: 0.2810  - diff: -0.1171
recall......... - h0: 0.3403  - h1: 0.2852  - diff: -0.0551
accurac

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.48it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 899  / 1000    p < 0.899  
count sample diff rec  is twice tot diff rec ..... 899  / 1000    p < 0.899  
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
h1 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
F-measure...... - h0: 0.3389  - h1: 0.2103  - diff: -0.1286
precision...... - h0: 0.3981  - h1: 0.2541  - diff: -0.1440
recall......... - h0: 0.3403  - h1: 0.2563  - diff:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.31it/s]



count sample diff f1   is twice tot diff f1....... 998  / 1000    p < 0.998  [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 973  / 1000    p < 0.973  [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 966  / 1000    p < 0.966  [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 23 perc 2.12%', 'class 1 freq 152 perc 14.00%', 'class 2 freq 116 perc 10.68%', 'class 3 freq 795 perc 73.20%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.3389  - h1: 0.2796  - diff: -0.0593
precision...... - h0: 0.3981  - h1: 0.2935  - diff: -0.1046
recall......... - h0: 0.3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.15it/s]



count sample diff f1   is twice tot diff f1....... 887  / 1000    p < 0.887  
count sample diff prec is twice tot diff prec..... 906  / 1000    p < 0.906  
count sample diff rec  is twice tot diff rec ..... 670  / 1000    p < 0.67   
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
h1 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
F-measure...... - h0: 0.5986  - h1: 0.1454  - diff: -0.4532
precision...... - h0: 0.6389  - h1: 0.2867  - diff: -0.3522
recall......... - h0: 0.5786  - h1: 0.2760  - diff: -0.3026
accuracy.

  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.24it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
h1 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
F-measure...... - h0: 0.5986  - h1: 0.2621  - diff: -0.3365
precision...... - h0: 0.6389  - h1: 0.4440  - diff: -0.1949
recall......... - h0: 0.5786

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 870  / 1000    p < 0.87   
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
h1 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
F-measure...... - h0: 0.5986  - h1: 0.1753  - diff: -0.4233
precision...... - h0: 0.6389  - h1: 0.2810  - diff: -0.3579
recall......... - h0: 0.5786  - h1: 0.2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.03it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
h1 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
F-measure...... - h0: 0.5986  - h1: 0.2103  - diff: -0.3883
precision...... - h0: 0.6389  - h1: 0.2541  - diff: -0.3848
recall......... - h0: 0

bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.15it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 64 perc 5.89%', 'class 1 freq 238 perc 21.92%', 'class 2 freq 216 perc 19.89%', 'class 3 freq 568 perc 52.30%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.5986  - h1: 0.2796  - diff: -0.3190
precision...... - h0: 0.6389  - h1: 0.2935  - diff: -0.3454
recall......... - h0: 0.5

bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.05it/s]



count sample diff f1   is twice tot diff f1....... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff rec  is twice tot diff rec ..... 1000 / 1000    p < 1.0    [38;5;8m![0m
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
h1 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
F-measure...... - h0: 0.1454  - h1: 0.2621  - diff: 0.1167
precision...... - h0: 0.2867  - h1: 0.4440  - diff: 0.1573
recall......... - h0: 0.2760  - 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 98   / 1000    p < 0.098  
count sample diff rec  is twice tot diff rec ..... 287  / 1000    p < 0.287  
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
h1 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
F-measure...... - h0: 0.1454  - h1: 0.1753  - diff: 0.0299
precision...... - h0: 0.2867  - h1: 0.2810  - diff: -0.0057
recall......... - h0: 0.2760  - h1: 0.2852  - diff: 0.

  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.51it/s]



count sample diff f1   is twice tot diff f1....... 156  / 1000    p < 0.156  
count sample diff prec is twice tot diff prec..... 562  / 1000    p < 0.562  
count sample diff rec  is twice tot diff rec ..... 407  / 1000    p < 0.407  
count sample diff acc  is twice tot diff acc...... 285  / 1000    p < 0.285  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
h1 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
F-measure...... - h0: 0.1454  - h1: 0.2103  - diff: 0.0649
precision...... - h0: 0.2867  - h1: 0.2541  - diff: -0.0326
recall......... - h0: 0.2760  - h1: 0.2563  - diff: -0.0197
accuracy....... - h0: 

  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.30it/s]



count sample diff f1   is twice tot diff f1....... 29   / 1000    p < 0.029  [38;5;9m*[0m
count sample diff prec is twice tot diff prec..... 688  / 1000    p < 0.688  
count sample diff rec  is twice tot diff rec ..... 682  / 1000    p < 0.682  
count sample diff acc  is twice tot diff acc...... 38   / 1000    p < 0.038  [38;5;9m*[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 812 perc 74.77%', 'class 1 freq 173 perc 15.93%', 'class 2 freq 44 perc 4.05%', 'class 3 freq 57 perc 5.25%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.1454  - h1: 0.2796  - diff: 0.1342
precision...... - h0: 0.2867  - h1: 0.2935  - diff: 0.0068
recall......... - h0: 0.2760  - h1: 0.3191  - diff: 0.043

bootstrap: 100%|███████████████████████████| 1000/1000 [00:07<00:00, 125.68it/s]



count sample diff f1   is twice tot diff f1....... 0    / 1000    p < 0.0    [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 461  / 1000    p < 0.461  
count sample diff rec  is twice tot diff rec ..... 172  / 1000    p < 0.172  
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
h1 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
F-measure...... - h0: 0.2621  - h1: 0.1753  - diff: -0.0868
precision...... - h0: 0.4440  - h1: 0.2810  - diff: -0.1630
recall......... - h0: 0.2947  - h1: 0.2852  - diff: -0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 990  / 1000    p < 0.99   [38;5;8m![0m
count sample diff prec is twice tot diff prec..... 900  / 1000    p < 0.9    
count sample diff rec  is twice tot diff rec ..... 599  / 1000    p < 0.599  
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
h1 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
F-measure...... - h0: 0.2621  - h1: 0.2103  - diff: -0.0518
precision...... - h0: 0.4440  - h1: 0.2541  - diff: -0.1899
recall......... - h0: 0.2947  - h1: 0.2563  - diff: -0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 928  / 1000    p < 0.928  
count sample diff prec is twice tot diff prec..... 935  / 1000    p < 0.935  
count sample diff rec  is twice tot diff rec ..... 844  / 1000    p < 0.844  
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 7 perc 0.64%', 'class 1 freq 179 perc 16.48%', 'class 2 freq 11 perc 1.01%', 'class 3 freq 889 perc 81.86%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.2621  - h1: 0.2796  - diff: 0.0175
precision...... - h0: 0.4440  - h1: 0.2935  - diff: -0.1505
recall......... - h0: 0.2947  - h1: 0.3191  - diff: 0.0244
accuracy....

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


count sample diff f1   is twice tot diff f1....... 327  / 1000    p < 0.327  
count sample diff prec is twice tot diff prec..... 890  / 1000    p < 0.89   
count sample diff rec  is twice tot diff rec ..... 276  / 1000    p < 0.276  
count sample diff acc  is twice tot diff acc...... 1000 / 1000    p < 1.0    [38;5;8m![0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
h1 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
F-measure...... - h0: 0.1753  - h1: 0.2103  - diff: 0.0350
precision...... - h0: 0.2810  - h1: 0.2541  - diff: -0.0269
recall......... - h0: 0.2852  - h1: 0.2563  - diff: -0.0289
accura

bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.95it/s]



count sample diff f1   is twice tot diff f1....... 128  / 1000    p < 0.128  
count sample diff prec is twice tot diff prec..... 663  / 1000    p < 0.663  
count sample diff rec  is twice tot diff rec ..... 735  / 1000    p < 0.735  
count sample diff acc  is twice tot diff acc...... 81   / 1000    p < 0.081  
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 611 perc 56.26%', 'class 1 freq 248 perc 22.84%', 'class 2 freq 197 perc 18.14%', 'class 3 freq 30 perc 2.76%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.1753  - h1: 0.2796  - diff: 0.1043
precision...... - h0: 0.2810  - h1: 0.2935  - diff: 0.0125
recall......... - h0: 0.2852  - h1: 0.3191  - diff: 0.0339
accuracy....... - h0: 0.

  _warn_prf(average, modifier, msg_start, len(result))
bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 123.74it/s]



count sample diff f1   is twice tot diff f1....... 2    / 1000    p < 0.002  [38;5;9m**[0m
count sample diff prec is twice tot diff prec..... 426  / 1000    p < 0.426  
count sample diff rec  is twice tot diff rec ..... 242  / 1000    p < 0.242  
count sample diff acc  is twice tot diff acc...... 0    / 1000    p < 0.0    [38;5;9m**[0m
data shape:  (1086, 1)
sample size: 217
h0: h0 - h1: h1
targs count:    ['class 0 freq 105 perc 9.67%', 'class 1 freq 214 perc 19.71%', 'class 2 freq 214 perc 19.71%', 'class 3 freq 553 perc 50.92%']
h0 preds count: ['class 0 freq 271 perc 24.95%', 'class 1 freq 387 perc 35.64%', 'class 2 freq 287 perc 26.43%', 'class 3 freq 141 perc 12.98%']
h1 preds count: ['class 0 freq 256 perc 23.57%', 'class 1 freq 372 perc 34.25%', 'class 2 freq 86 perc 7.92%', 'class 3 freq 372 perc 34.25%']
F-measure...... - h0: 0.2103  - h1: 0.2796  - diff: 0.0693
precision...... - h0: 0.2541  - h1: 0.2935  - diff: 0.0394
recall......... - h0: 0.2563  - h1: 0.3191  - diff:

bootstrap: 100%|███████████████████████████| 1000/1000 [00:08<00:00, 124.73it/s]


count sample diff f1   is twice tot diff f1....... 38   / 1000    p < 0.038  [38;5;9m*[0m
count sample diff prec is twice tot diff prec..... 192  / 1000    p < 0.192  
count sample diff rec  is twice tot diff rec ..... 127  / 1000    p < 0.127  
count sample diff acc  is twice tot diff acc...... 1    / 1000    p < 0.001  [38;5;9m**[0m





In [72]:
# Save `output` to 'F_score_diff_for_age.csv'. The path is relative, so the file
# is written to the kernel's current working directory.
# NOTE(review): `output` is built in an earlier cell that is not visible here;
# presumably it holds the pairwise bootstrap comparison results for the age
# task -- confirm against the cell that defines it.
# NOTE(review): if `output` is a pandas DataFrame, to_csv writes the index as the
# first column by default -- confirm that is intended for downstream readers.
output.to_csv('F_score_diff_for_age.csv')