<h3 style="color:royalblue"> This notebook's purpose is to prepare the data for the logistic regression to be conducted in R.</h3>

In [1]:
cd ../../../src

/Users/cock/kDrive/PhD/Projects/Labs/beerslaw-lab/src


In [2]:
import os
import re
import dill
import pickle

import pandas as pd 

from extractors.sequencer.flat.simplemorestates_secondsflat import SimpleMoreStateSecondsFlat
from extractors.pipeline_maker import PipelineMaker

# Make the sequences

In [3]:
settings = {
    'data': {
        'pipeline': {
            'sequencer_dragasclick': True,
            'break_threshold': 0.6
        }
    }
}

In [4]:
# Where to find the simulation objects
parsed_simulation_path = '../data/beerslaw/parsed simulations/'
files = os.listdir(parsed_simulation_path)
files = [f for f in files if 'simulation' in f]
files = [f for f in files if '_t2v_' in f]

# Load demographics information
with open('../data/beerslaw/post_test/rankings.pkl', 'rb') as fp:
    ranks = pickle.load(fp)
    ranks = ranks.set_index('username')

# useful regex
id_regex = re.compile('lid([^_]+)_')

In [5]:
# Load sequencer
sequencer = SimpleMoreStateSecondsFlat(settings)
sequencer.set_rankings(ranks)

In [6]:
# Parse
id_dictionary = {'index':{}, 'sequences': {}}
i = 0
while len(files) != 0:
    file = files[0]
    lid = id_regex.findall(file)[0]
    print(file)
    print('   {}'.format(lid))
    
    
    # Retrieve demographics
    try:
        permutation = ranks.loc[lid]['ranking']
        gender = ranks.loc[lid]['gender']
        year = ranks.loc[lid]['year']
        field = ranks.loc[lid]['field']
        language = ranks.loc[lid]['language']
        print('    ranking: {}, gender: {}, year: {}, field: {}, language: {}'.format(
            permutation, gender, year, field, language
        ))
    except KeyError:
        print('    No ranking or demographics')
        files_noranking = []
        files_noranking.append('permmissing_lid' + str(lid) + '_t1v_simulation.pkl')
        files_noranking.append('permmissing_lid' + str(lid) + '_t2v_simulation.pkl')
        files_noranking.append('permmissing_lid' + str(lid) + '_t3v_simulation.pkl')
        files_noranking.append('permwrong field_lid' + str(lid) + '_t1v_simulation.pkl')
        files_noranking.append('permwrong field_lid' + str(lid) + '_t2v_simulation.pkl')
        files_noranking.append('permwrong field_lid' + str(lid) + '_t3v_simulation.pkl')
        for f in files_noranking:
            if f in files:
                files.remove(f)
                print(f)
        continue
        
    
    # Retrieve actions    
    file_path = 'perm{}_lid{}_t2v_simulation.pkl'.format(permutation, lid)
    try:
        with open(parsed_simulation_path + file_path, 'rb') as fp:
            sim = dill.load(fp)
            sim.set_permutation(permutation)
            sim.save()
        if file_path in files:
            files.remove(file_path)
        file_path1 = 'perm{}_lid{}_t1v_simulation.pkl'.format(permutation, lid)
        if file_path1 in files:
            files.remove(file_path1)
        file_path3 = 'perm{}_lid{}_t3v_simulation.pkl'.format(permutation, lid)
        if file_path3 in files:
            files.remove(file_path3)
        labels, begins, ends = sequencer.get_sequences(sim, lid)
        last_timestamp = sim.get_last_timestamp()
    except FileNotFoundError:
        labels, begins, ends = [], [], []
        last_timestamp = 0
    except TypeError:
        labels, begins, ends = [], [], []
        last_timestamp = 0

    sim_dict = {
        'sequence': labels,
        'begin': begins,
        'end': ends,
        'permutation': permutation,
        'last_timestamp': last_timestamp,
        'learner_id': str(lid),
        'gender': gender,
        'year': year,
        'field': field,
        'language': language
    }
    path = '../data/beerslaw/logistic_regression/sequenced/simplemorestates_secondsflat/p{}_lid{}_t2_sequenced.pkl'.format(
        permutation, lid,
    )
    with open(path, 'wb') as fp:
        pickle.dump(sim_dict, fp)

    id_dictionary['sequences'][i] = {
        'path': path,
        'length': len(labels),
        'learner_id': str(lid),
        'gender': gender,
        'year': year,
        'field': field,
        'language': language
    }
    id_dictionary['index'][str(lid)] = i
    i += 1
    
with open('../data/beerslaw/logistic_regression/sequenced/simplemorestates_secondsflat/id_dictionary.pkl', 'wb') as fp:
    pickle.dump(id_dictionary, fp)

perm2013_lidsvdphyjs_t2v_simulation.pkl
   svdphyjs
    ranking: 2013, gender: 2, year: 1st, field: Chemistry, language: Deutsch
perm2103_lidwyj76ntd_t2v_simulation.pkl
   wyj76ntd
    ranking: 2103, gender: 1, year: 1st, field: Pharma Chemistry, language: Deutsch
perm3210_lidnmptx7nj_t2v_simulation.pkl
   nmptx7nj
    ranking: 3210, gender: 1, year: 1st, field: Chemistry, language: Français
perm3120_lidgc663sap_t2v_simulation.pkl
   gc663sap
    ranking: 3120, gender: 1, year: 2nd, field: Chemistry, language: Français
perm0213_lid8nh4zvcp_t2v_simulation.pkl
   8nh4zvcp
    ranking: 0213, gender: 1, year: 3rd, field: Chemistry, Textiles, language: Deutsch
perm3012_lidn3977erj_t2v_simulation.pkl
   n3977erj
    ranking: 3012, gender: 2, year: 1st, field: Fast track, language: Français
perm3210_liduad7cmhv_t2v_simulation.pkl
   uad7cmhv
    ranking: 3210, gender: 1, year: 1st, field: Chemistry, language: Français
permmissing_lidrwu8yjw9_t2v_simulation.pkl
   rwu8yjw9
    No ranking or de

perm3120_lidwpszzhxa_t2v_simulation.pkl
   wpszzhxa
    ranking: 3120, gender: 2, year: 2nd, field: Chemistry, language: Deutsch
perm2103_lidcrak75dx_t2v_simulation.pkl
   crak75dx
    ranking: 2103, gender: 2, year: 2nd, field: Biology, language: Deutsch
perm3012_lidaddf7f7d_t2v_simulation.pkl
   addf7f7d
    ranking: 3012, gender: 1, year: 1st, field: Chemistry, language: Français
permmissing_lidx6e99ywy_t2v_simulation.pkl
   x6e99ywy
    No ranking or demographics
permmissing_lidx6e99ywy_t2v_simulation.pkl
perm2301_lid84nmc3df_t2v_simulation.pkl
   84nmc3df
    ranking: 2301, gender: 1, year: 2nd, field: Chemistry, language: Français
permmissing_lidtf97geej_t2v_simulation.pkl
   tf97geej
    No ranking or demographics
permmissing_lidtf97geej_t2v_simulation.pkl
perm3210_lidurwu33jd_t2v_simulation.pkl
   urwu33jd
    ranking: 3210, gender: 1, year: 1st, field: Chemistry, language: Français
perm2031_lid4k4kc2k6_t2v_simulation.pkl
   4k4kc2k6
    ranking: 2031, gender: 1, year: 2nd, fie

perm3120_lidmwfdr4ys_t2v_simulation.pkl
   mwfdr4ys
    ranking: 3120, gender: 2, year: 1st, field: Chemistry, language: Deutsch
perm2301_liddnvedphf_t2v_simulation.pkl
   dnvedphf
    ranking: 2301, gender: 2, year: 1st, field: Chemistry, language: Deutsch
perm0231_lid6tg95rzr_t2v_simulation.pkl
   6tg95rzr
    ranking: 0231, gender: 2, year: 3rd, field: Chemistry, language: Deutsch
perm0231_lidfj5tdybn_t2v_simulation.pkl
   fj5tdybn
    ranking: 0231, gender: 1, year: 3rd, field: Chemistry, Textiles, language: Deutsch
perm3021_lid7xhcecye_t2v_simulation.pkl
   7xhcecye
    ranking: 3021, gender: 2, year: 1st, field: Chemistry, Textiles, language: Deutsch
perm3120_lidxj86wyup_t2v_simulation.pkl
   xj86wyup
    ranking: 3120, gender: 2, year: 2nd, field: Chemistry, language: Français
perm3210_lidwguewwkp_t2v_simulation.pkl
   wguewwkp
    ranking: 3210, gender: 1, year: 1st, field: Fast track, language: Français
perm0231_lidtsvcrpeg_t2v_simulation.pkl
   tsvcrpeg
    ranking: 0231, gen

permmissing_lidh8624awf_t2v_simulation.pkl
   h8624awf
    No ranking or demographics
permmissing_lidh8624awf_t2v_simulation.pkl
perm2031_lidh75kp27p_t2v_simulation.pkl
   h75kp27p
    ranking: 2031, gender: 1, year: 3rd, field: Chemistry, language: Deutsch
perm2130_lid26z3wbqz_t2v_simulation.pkl
   26z3wbqz
    ranking: 2130, gender: 2, year: 3rd, field: Chemistry, language: Deutsch
perm3012_lid2ep3hayy_t2v_simulation.pkl
   2ep3hayy
    ranking: 3012, gender: 2, year: 2nd, field: Chemistry, language: Deutsch
perm1023_lidrwax4gk7_t2v_simulation.pkl
   rwax4gk7
    ranking: 1023, gender: 1, year: 1st, field: Chemistry, language: Deutsch
perm2103_lidybxahrra_t2v_simulation.pkl
   ybxahrra
    ranking: 2103, gender: 2, year: 2nd, field: Chemistry, language: Deutsch
perm0123_lid6h5vmwys_t2v_simulation.pkl
   6h5vmwys
    ranking: 0123, gender: 1, year: 1st, field: Chemistry, language: Deutsch
perm2031_lidrgeb7wr9_t2v_simulation.pkl
   rgeb7wr9
    ranking: 2031, gender: 1, year: 1st, fiel

perm0321_lidxmh5qd3z_t2v_simulation.pkl
   xmh5qd3z
    ranking: 0321, gender: 2, year: 2nd, field: Chemistry, language: Français
perm2310_lidedmrufua_t2v_simulation.pkl
   edmrufua
    ranking: 2310, gender: 1, year: 2nd, field: Chemistry, language: Français
permmissing_lidn238pypu_t2v_simulation.pkl
   n238pypu
    No ranking or demographics
permmissing_lidn238pypu_t2v_simulation.pkl
perm3012_lidzs35fg8g_t2v_simulation.pkl
   zs35fg8g
    ranking: 3012, gender: 1, year: 3rd, field: Chemistry, language: Deutsch
perm2031_lid9p9gwu88_t2v_simulation.pkl
   9p9gwu88
    ranking: 2031, gender: 1, year: 2nd, field: Chemistry, language: Deutsch
perm3120_lidxpyjyx4m_t2v_simulation.pkl
   xpyjyx4m
    ranking: 3120, gender: 2, year: 2nd, field: Chemistry, language: Deutsch
perm0231_lidp5t7d3sb_t2v_simulation.pkl
   p5t7d3sb
    ranking: 0231, gender: 2, year: 2nd, field: Chemistry, language: Deutsch
perm3012_lidn6zakrku_t2v_simulation.pkl
   n6zakrku
    ranking: 3012, gender: 2, year: 1st, fi

perm2031_lidfu6nsdhs_t2v_simulation.pkl
   fu6nsdhs
    ranking: 2031, gender: 1, year: 2nd, field: Chemistry, language: Deutsch
perm3210_lid55yavcue_t2v_simulation.pkl
   55yavcue
    ranking: 3210, gender: 1, year: 2nd, field: Chemistry, language: Deutsch
perm2103_lidfw2ajjmt_t2v_simulation.pkl
   fw2ajjmt
    ranking: 2103, gender: 2, year: 1st, field: Biology, language: Français
perm0321_lid7zjqat99_t2v_simulation.pkl
   7zjqat99
    ranking: 0321, gender: 1, year: 2nd, field: Chemistry, language: Français
perm3102_lid9pjrsbth_t2v_simulation.pkl
   9pjrsbth
    ranking: 3102, gender: 1, year: 2nd, field: Chemistry, language: Français
perm3120_lid982cf4dn_t2v_simulation.pkl
   982cf4dn
    ranking: 3120, gender: 2, year: 1st, field: Chemistry, language: Deutsch
perm2130_lidnmgve3yy_t2v_simulation.pkl
   nmgve3yy
    ranking: 2130, gender: 2, year: 2nd, field: Chemistry, language: Deutsch
perm2310_lidjkbx6axr_t2v_simulation.pkl
   jkbx6axr
    ranking: 2310, gender: 1, year: 2nd, fie

perm3201_lidzgs84jfy_t2v_simulation.pkl
   zgs84jfy
    ranking: 3201, gender: 1, year: 2nd, field: Chemistry, language: Deutsch
permmissing_lidww2m3qch_t2v_simulation.pkl
   ww2m3qch
    No ranking or demographics
permmissing_lidww2m3qch_t2v_simulation.pkl
permmissing_lidp5ac3mu7_t2v_simulation.pkl
   p5ac3mu7
    No ranking or demographics
permmissing_lidp5ac3mu7_t2v_simulation.pkl
permmissing_lidb25u3a4u_t2v_simulation.pkl
   b25u3a4u
    No ranking or demographics
permmissing_lidb25u3a4u_t2v_simulation.pkl
perm3120_lidjcrv9m9j_t2v_simulation.pkl
   jcrv9m9j
    ranking: 3120, gender: 1, year: 1st, field: Chemistry, language: Deutsch
perm0213_lid9sgu2tbg_t2v_simulation.pkl
   9sgu2tbg
    ranking: 0213, gender: 1, year: 1st, field: Chemistry, language: Français
perm3012_lidnfshnktq_t2v_simulation.pkl
   nfshnktq
    ranking: 3012, gender: 2, year: 2nd, field: Biology, language: Deutsch
perm2031_lid9xc2v9vu_t2v_simulation.pkl
   9xc2v9vu
    ranking: 2031, gender: 1, year: 2nd, field

perm3120_lidt3dwkezr_t2v_simulation.pkl
   t3dwkezr
    ranking: 3120, gender: 2, year: 3rd, field: Chemistry, language: Deutsch
perm1230_lidqjfn2j8z_t2v_simulation.pkl
   qjfn2j8z
    ranking: 1230, gender: 1, year: 1st, field: Pharma Chemistry, language: Deutsch
perm2013_lid47ce49e4_t2v_simulation.pkl
   47ce49e4
    ranking: 2013, gender: 2, year: 3rd, field: Chemistry, Textiles, language: Deutsch
perm1023_lidg4xpwwac_t2v_simulation.pkl
   g4xpwwac
    ranking: 1023, gender: 1, year: 1st, field: Chemistry, language: Deutsch
perm3012_lid9wdm7zue_t2v_simulation.pkl
   9wdm7zue
    ranking: 3012, gender: 2, year: 1st, field: Chemistry, language: Français
perm3201_lidszvqb37f_t2v_simulation.pkl
   szvqb37f
    ranking: 3201, gender: 2, year: 1st, field: Chemistry, language: Deutsch
perm2130_lidhrhu4yph_t2v_simulation.pkl
   hrhu4yph
    ranking: 2130, gender: 1, year: 1st, field: Biology, language: Français
permmissing_lidktd99zh9_t2v_simulation.pkl
   ktd99zh9
    No ranking or demogra

# Sequence to Features

In [6]:
# settings
settings_pipeline = {
    'experiment':{
        'class_name': 'binconcepts',
        'root_name': 'logistic_regression_R',
        'name': 'logistic_regression',
    },
    'data': {
        'pipeline': {
            'sequencer': 'simplemorestates_secondsflat',
            'sequencer_dragasclick': True,
            'break_threshold': 0.6,
            'concatenator': {
                'type': 'chemconcat',
                'tasks': ['2']
            },
            'demographic_filter': 'chemlab',
            'event_filter': 'nofilt',
            'break_threshold': 0.6,
            'break_filter': 'nobrfilt',
            'adjuster': 'full'
        },
        'min_length': 0,
        'filters': {'interactionlimit':10},
        'adjuster': {'limit':300}
    },
    'ML': {
        'pipeline': {
            'scorer': 'none',
        },
    },
    'paths': {'sequenced_simulations': '../data/beerslaw/logistic_regression/sequenced/'}
}

## Action Count

In [7]:
# Parameters for action count
settings_pipeline['data']['pipeline']['encoder'] = '1hot'
settings_pipeline['data']['pipeline']['aggregator'] = 'aveagg'

# Create data
pipeline = PipelineMaker(settings_pipeline)
sequences, labels, indices, id_dictionary = pipeline.build_data()

# Create DF
data_r = []
for i, seq in enumerate(sequences):
    row = [id_dictionary['sequences'][indices[i]]['learner_id']] + list(seq) + [labels[i]]
    data_r.append(row)

columns = ['lid'] + list(sequencer.get_states()) + ['binconcepts']
df_r = pd.DataFrame(data_r, columns=columns)
df_r['permutation'] = df_r['lid'].apply(lambda x: ranks.loc[x]['ranking'])
df_r['language'] = df_r['lid'].apply(lambda x: ranks.loc[x]['language'])
df_r['field'] = df_r['lid'].apply(lambda x: ranks.loc[x]['field'])
df_r['year'] = df_r['lid'].apply(lambda x: ranks.loc[x]['year'])
df_r['gender'] = df_r['lid'].apply(lambda x: ranks.loc[x]['gender'])

# Action count
df_r.to_csv('../data/beerslaw/logistic_regression/action_count.tsv', sep='\t')

In [8]:
df_r

Unnamed: 0,lid,greengreen_other,greenred_other,notgreennotred_other,noobserved_other,greengreen_concentration,greenred_concentration,notgreennotred_concentration,noobserved_concentration,greengreen_width,...,greengreen_break,greenred_break,notgreennotred_break,noobserved_break,binconcepts,permutation,language,field,year,gender
0,svdphyjs,0.115789,0.063158,0.021053,0.031579,0.021053,0.052632,0.010526,0.000000,0.010526,...,0.094737,0.084211,0.042105,0.052632,1,2013,Deutsch,Chemistry,1st,2
1,gc663sap,0.000000,0.015625,0.000000,0.031250,0.000000,0.000000,0.000000,0.109375,0.000000,...,0.000000,0.031250,0.000000,0.234375,1,3120,Français,Chemistry,2nd,1
2,8nh4zvcp,0.000000,0.019608,0.000000,0.098039,0.000000,0.000000,0.000000,0.313725,0.000000,...,0.000000,0.029412,0.000000,0.245098,0,0213,Deutsch,"Chemistry, Textiles",3rd,1
3,5f4q4ng5,0.000000,0.013699,0.075342,0.150685,0.000000,0.000000,0.027397,0.068493,0.000000,...,0.000000,0.000000,0.068493,0.205479,0,3021,Français,Chemistry,1st,1
4,ujpk3gf4,0.000000,0.044444,0.000000,0.044444,0.000000,0.088889,0.000000,0.177778,0.000000,...,0.000000,0.133333,0.000000,0.111111,0,3012,Deutsch,"Chemistry, Textiles",3rd,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,m749z8u9,0.000000,0.008516,0.000000,0.000000,0.590024,0.107056,0.000000,0.000000,0.002433,...,0.197080,0.055961,0.000000,0.003650,1,2031,Deutsch,"Chemistry, Textiles",3rd,2
250,88kjzd8b,0.000000,0.016949,0.000000,0.033898,0.118644,0.203390,0.000000,0.000000,0.067797,...,0.067797,0.169492,0.016949,0.016949,1,0231,Deutsch,Chemistry,3rd,2
251,zkrr45y5,0.000000,0.000000,0.000000,0.025641,0.000000,0.000000,0.000000,0.128205,0.000000,...,0.025641,0.025641,0.000000,0.217949,1,2013,Deutsch,"Chemistry, Textiles",1st,1
252,26mtbtye,0.000000,0.000000,0.000000,0.045455,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.181818,0,3021,Français,Chemistry,2nd,1


## Action Span

In [24]:
# Parameters for action span
settings_pipeline['data']['pipeline']['encoder'] = 'actionspan'
settings_pipeline['data']['pipeline']['aggregator'] = 'normagg'

# Create data
pipeline = PipelineMaker(settings_pipeline)
sequences, labels, indices, id_dictionary = pipeline.build_data()

# Create DF
data_r = []
for i, seq in enumerate(sequences):
    row = [id_dictionary['sequences'][indices[i]]['learner_id']] + list(seq) + [labels[i]]
    data_r.append(row)

columns = ['lid'] + list(sequencer.get_states()) + ['binconcepts']
df_r = pd.DataFrame(data_r, columns=columns)
df_r['permutation'] = df_r['lid'].apply(lambda x: ranks.loc[x]['ranking'])
df_r['language'] = df_r['lid'].apply(lambda x: ranks.loc[x]['language'])
df_r['field'] = df_r['lid'].apply(lambda x: ranks.loc[x]['field'])
df_r['year'] = df_r['lid'].apply(lambda x: ranks.loc[x]['year'])
df_r['gender'] = df_r['lid'].apply(lambda x: ranks.loc[x]['gender'])

# Action count
df_r.to_csv('../data/beerslaw/logistic_regression/action_span.tsv', sep='\t')