Creating a balanced test set for flat training (cross validation)

In [1]:
import os
import yaml
import pickle

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
cd ../../src/

/Users/cock/kDrive/PhD/Projects/Labs/beerslaw-lab/src


# Data


In [3]:
# experiment
with open('../data/experiment_keys/over10.pkl', 'rb') as fp:
    over10 = pickle.load(fp)

# data
with open('../data/post_test/rankings_scored.pkl', 'rb') as fp:
    rankings = pickle.load(fp)
    rankings = rankings[rankings['username'].isin(over10)]
    
# vector map
map_path = '../data/experiment_keys/permutation_maps/vector_binary.yaml'
with open(map_path) as fp:
    vector_map = yaml.load(fp, Loader=yaml.FullLoader)

In [4]:
rankings['prior_4cat_knowledge'] = rankings['prior_4cat'].apply(lambda x: x[0] if x != 'none' else 0)
rankings['total_score'] = rankings['q1_score'] + rankings['q2_score'] + rankings['q3_score'] + rankings['q4_score'] + rankings['q5_score'] + rankings['q6_score']
rankings['vector_binary'] = rankings['ranking'].apply(lambda x: vector_map['map'][x])
rankings['vector_score'] = rankings['vector_binary'].apply(lambda x: int(str(x)[0]) + int(str(x)[1]) + int(str(x)[2]))
rankings['vector_binconcepts'] = rankings['vector_score'].apply(lambda x: int(x >= 2))

In [5]:
# stratification columns (ideal separation)
stratification = [
    'ranking', 'year', 'gender', 'prior_4cat_knowledge', 'total_score', 'vector_binary',
    'vector_score', 'vector_binconcepts'
]


In [6]:
train, test = train_test_split(rankings, test_size=0.20, random_state=0, stratify=rankings[['ranking']])

In [7]:
for strat in stratification:
    
    test_strat = test[[strat, 'username']].groupby(strat).nunique().reset_index()
    test_strat['username'] = test_strat['username'] / sum(test_strat['username'])
    test_strat.columns = [strat, 'test']
    
    train_strat = train[[strat, 'username']].groupby(strat).nunique().reset_index()
    train_strat['username'] = train_strat['username'] / sum(train_strat['username'])
    train_strat.columns = [strat, 'train']
    
    strat_df = test_strat.merge(train_strat, on=strat, how='inner')
    
    print(strat)
    display(strat_df)
    print()

ranking


Unnamed: 0,ranking,test,train
0,123,0.019231,0.029412
1,213,0.076923,0.083333
2,231,0.115385,0.112745
3,312,0.038462,0.039216
4,321,0.019231,0.02451
5,1023,0.019231,0.009804
6,1032,0.019231,0.009804
7,1203,0.019231,0.009804
8,1320,0.019231,0.019608
9,2013,0.076923,0.063725



year


Unnamed: 0,year,test,train
0,1st,0.384615,0.377451
1,2nd,0.365385,0.446078
2,3rd,0.25,0.176471



gender


Unnamed: 0,gender,test,train
0,1,0.596154,0.509804
1,2,0.365385,0.446078
2,4,0.038462,0.02451



prior_4cat_knowledge


Unnamed: 0,prior_4cat_knowledge,test,train
0,0,0.115385,0.112745
1,0,0.307692,0.259804
2,1,0.153846,0.230392
3,2,0.173077,0.063725
4,3,0.25,0.333333



total_score


Unnamed: 0,total_score,test,train
0,0,0.057692,0.053922
1,1,0.115385,0.147059
2,2,0.115385,0.122549
3,3,0.173077,0.171569
4,4,0.307692,0.25
5,5,0.153846,0.20098
6,6,0.076923,0.053922



vector_binary


Unnamed: 0,vector_binary,test,train
0,0,0.346154,0.338235
1,1,0.096154,0.102941
2,10,0.038462,0.044118
3,11,0.134615,0.142157
4,100,0.076923,0.083333
5,101,0.115385,0.112745
6,110,0.076923,0.063725
7,111,0.115385,0.112745



vector_score


Unnamed: 0,vector_score,test,train
0,0,0.346154,0.338235
1,1,0.211538,0.230392
2,2,0.326923,0.318627
3,3,0.115385,0.112745



vector_binconcepts


Unnamed: 0,vector_binconcepts,test,train
0,0,0.557692,0.568627
1,1,0.442308,0.431373





In [8]:
with open('../data/experiment_keys/flatstrat_testusernames.pkl', 'wb') as fp:
    pickle.dump(list(test['username']), fp)
with open('../data/experiment_keys/flatstrat_trainusernames.pkl', 'wb') as fp:
    pickle.dump(list(train['username']), fp)

In [9]:
print('There are {} train instances, and {} test instances'.format(len(train), len(test)))

There are 204 train instances, and 52 test instances
