Creating a balanced test set for flat training (cross validation)

In [1]:
import os
import yaml
import pickle

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
cd ../../src/

/Users/cock/kDrive/PhD/Projects/Labs/beerslaw-lab/src


# Data


In [3]:
# experiment
with open('../data/experiment_keys/over10.pkl', 'rb') as fp:
    over10 = pickle.load(fp)

# data
with open('../data/post_test/rankings_scored.pkl', 'rb') as fp:
    rankings = pickle.load(fp)
    rankings = rankings[rankings['username'].isin(over10)]
    
# vector map
map_path = '../data/experiment_keys/permutation_maps/vector_binary.yaml'
with open(map_path) as fp:
    vector_map = yaml.load(fp, Loader=yaml.FullLoader)

In [4]:
rankings['prior_4cat_knowledge'] = rankings['prior_4cat'].apply(lambda x: x[0] if x != 'none' else 0)
rankings['total_score'] = rankings['q1_score'] + rankings['q2_score'] + rankings['q3_score'] + rankings['q4_score'] + rankings['q5_score'] + rankings['q6_score']
rankings['vector_binary'] = rankings['ranking'].apply(lambda x: vector_map['map'][x])
rankings['vector_score'] = rankings['vector_binary'].apply(lambda x: int(str(x)[0]) + int(str(x)[1]) + int(str(x)[2]))
rankings['vector_binconcepts'] = rankings['vector_score'].apply(lambda x: int(x >= 2))

In [5]:
# stratification columns (ideal separation)
stratification = [
    'ranking', 'year', 'gender', 'prior_4cat_knowledge', 'total_score', 'vector_binary',
    'vector_score', 'vector_binconcepts'
]


In [6]:
train, test = train_test_split(rankings, test_size=0.20, random_state=0, stratify=rankings[['ranking']])

In [7]:
for strat in stratification:
    
    test_strat = test[[strat, 'username']].groupby(strat).nunique().reset_index()
    test_strat['username'] = test_strat['username'] / sum(test_strat['username'])
    test_strat.columns = [strat, 'test']
    
    train_strat = train[[strat, 'username']].groupby(strat).nunique().reset_index()
    train_strat['username'] = train_strat['username'] / sum(train_strat['username'])
    train_strat.columns = [strat, 'train']
    
    strat_df = test_strat.merge(train_strat, on=strat, how='inner')
    
    print(strat)
    display(strat_df)
    print()

ranking


Unnamed: 0,ranking,test,train
0,123,0.019608,0.029557
1,213,0.078431,0.083744
2,231,0.117647,0.1133
3,312,0.039216,0.039409
4,321,0.019608,0.024631
5,1023,0.019608,0.009852
6,1032,0.019608,0.009852
7,1203,0.019608,0.009852
8,1320,0.019608,0.019704
9,2013,0.078431,0.064039



year


Unnamed: 0,year,test,train
0,1st,0.372549,0.374384
1,2nd,0.431373,0.433498
2,3rd,0.196078,0.192118



gender


Unnamed: 0,gender,test,train
0,1,0.588235,0.507389
1,2,0.352941,0.453202
2,4,0.058824,0.019704



prior_4cat_knowledge


Unnamed: 0,prior_4cat_knowledge,test,train
0,0,0.098039,0.118227
1,0,0.254902,0.26601
2,1,0.176471,0.226601
3,2,0.176471,0.064039
4,3,0.294118,0.325123



total_score


Unnamed: 0,total_score,test,train
0,0,0.078431,0.044335
1,1,0.098039,0.152709
2,2,0.098039,0.123153
3,3,0.294118,0.142857
4,4,0.254902,0.26601
5,5,0.156863,0.20197
6,6,0.019608,0.068966



vector_binary


Unnamed: 0,vector_binary,test,train
0,0,0.333333,0.339901
1,1,0.098039,0.098522
2,10,0.039216,0.044335
3,11,0.137255,0.142857
4,100,0.078431,0.083744
5,101,0.117647,0.1133
6,110,0.078431,0.064039
7,111,0.117647,0.1133



vector_score


Unnamed: 0,vector_score,test,train
0,0,0.333333,0.339901
1,1,0.215686,0.226601
2,2,0.333333,0.320197
3,3,0.117647,0.1133



vector_binconcepts


Unnamed: 0,vector_binconcepts,test,train
0,0,0.54902,0.566502
1,1,0.45098,0.433498





In [8]:
with open('../data/experiment_keys/flatstrat_testusernames.pkl', 'wb') as fp:
    pickle.dump(list(test['username']), fp)
with open('../data/experiment_keys/flatstrat_trainusernames.pkl', 'wb') as fp:
    pickle.dump(list(train['username']), fp)

In [9]:
print('There are {} train instances, and {} test instances'.format(len(train), len(test)))

There are 203 train instances, and 51 test instances
