<h3 style="color:royalblue"> This notebook's purpose is to prepare the data for the logistic regression to be conducted in R.</h3>

In [1]:
cd ../../../src

/Users/cock/kDrive/PhD/Projects/Labs/beerslaw-lab/src


In [2]:
import os
import re
import dill
import pickle

import pandas as pd 

from extractors.sequencer.flat.simplemorestates_secondsflat import SimpleMoreStateSecondsFlat
from extractors.pipeline_maker import PipelineMaker

# Make the sequences

In [3]:
settings = {
    'data': {
        'pipeline': {
            'sequencer_dragasclick': True,
            'break_threshold': 0.6
        }
    }
}

In [4]:
# Where to find the simulation objects
parsed_simulation_path = '../data/beerslaw/parsed simulations/'
files = os.listdir(parsed_simulation_path)
files = [f for f in files if 'simulation' in f]
files = [f for f in files if '_t2v_' in f]

# Load demographics information
with open('../data/beerslaw/post_test/rankings.pkl', 'rb') as fp:
    ranks = pickle.load(fp)
    ranks = ranks.set_index('username')

# useful regex
id_regex = re.compile('lid([^_]+)_')

In [5]:
# Load sequencer
sequencer = SimpleMoreStateSecondsFlat(settings)
sequencer.set_rankings(ranks)

In [None]:
# Parse
id_dictionary = {'index':{}, 'sequences': {}}
i = 0
while len(files) != 0:
    file = files[0]
    lid = id_regex.findall(file)[0]
    print(file)
    print('   {}'.format(lid))
    
    
    # Retrieve demographics
    try:
        permutation = ranks.loc[lid]['ranking']
        gender = ranks.loc[lid]['gender']
        year = ranks.loc[lid]['year']
        field = ranks.loc[lid]['field']
        language = ranks.loc[lid]['language']
        print('    ranking: {}, gender: {}, year: {}, field: {}, language: {}'.format(
            permutation, gender, year, field, language
        ))
    except KeyError:
        print('    No ranking or demographics')
        files_noranking = []
        files_noranking.append('permmissing_lid' + str(lid) + '_t1v_simulation.pkl')
        files_noranking.append('permmissing_lid' + str(lid) + '_t2v_simulation.pkl')
        files_noranking.append('permmissing_lid' + str(lid) + '_t3v_simulation.pkl')
        files_noranking.append('permwrong field_lid' + str(lid) + '_t1v_simulation.pkl')
        files_noranking.append('permwrong field_lid' + str(lid) + '_t2v_simulation.pkl')
        files_noranking.append('permwrong field_lid' + str(lid) + '_t3v_simulation.pkl')
        for f in files_noranking:
            if f in files:
                files.remove(f)
                print(f)
        continue
        
    
    # Retrieve actions    
    file_path = 'perm{}_lid{}_t2v_simulation.pkl'.format(permutation, lid)
    try:
        with open(parsed_simulation_path + file_path, 'rb') as fp:
            sim = dill.load(fp)
            sim.set_permutation(permutation)
            sim.save()
        if file_path in files:
            files.remove(file_path)
        file_path1 = 'perm{}_lid{}_t1v_simulation.pkl'.format(permutation, lid)
        if file_path1 in files:
            files.remove(file_path1)
        file_path3 = 'perm{}_lid{}_t3v_simulation.pkl'.format(permutation, lid)
        if file_path3 in files:
            files.remove(file_path3)
        labels, begins, ends = sequencer.get_sequences(sim, lid)
        last_timestamp = sim.get_last_timestamp()
    except FileNotFoundError:
        labels, begins, ends = [], [], []
        last_timestamp = 0
    except TypeError:
        labels, begins, ends = [], [], []
        last_timestamp = 0

    sim_dict = {
        'sequence': labels,
        'begin': begins,
        'end': ends,
        'permutation': permutation,
        'last_timestamp': last_timestamp,
        'learner_id': str(lid),
        'gender': gender,
        'year': year,
        'field': field,
        'language': language
    }
    path = '../data/beerslaw/logistic_regression/sequenced/simplemorestates_secondsflat/p{}_lid{}_t2_sequenced.pkl'.format(
        permutation, lid,
    )
    with open(path, 'wb') as fp:
        pickle.dump(sim_dict, fp)

    id_dictionary['sequences'][i] = {
        'path': path,
        'length': len(labels),
        'learner_id': str(lid),
        'gender': gender,
        'year': year,
        'field': field,
        'language': language
    }
    id_dictionary['index'][str(lid)] = i
    i += 1
    
with open('../data/beerslaw/logistic_regression/sequenced/simplemorestates_secondsflat/id_dictionary.pkl', 'wb') as fp:
    pickle.dump(id_dictionary, fp)

# Sequence to Features

In [6]:
# settings
settings_pipeline = {
    'experiment':{
        'class_name': 'binconcepts',
        'root_name': 'logistic_regression_R',
        'name': 'logistic_regression',
    },
    'data': {
        'pipeline': {
            'sequencer': 'simplemorestates_secondsflat',
            'sequencer_dragasclick': True,
            'break_threshold': 0.6,
            'concatenator': {
                'type': 'chemconcat',
                'tasks': ['2']
            },
            'demographic_filter': 'chemlab',
            'event_filter': 'nofilt',
            'break_threshold': 0.6,
            'break_filter': 'nobrfilt',
            'adjuster': 'full'
        },
        'min_length': 0,
        'filters': {'interactionlimit':10},
        'adjuster': {'limit':300}
    },
    'ML': {
        'pipeline': {
            'scorer': 'none',
        },
    },
    'paths': {'sequenced_simulations': '../data/beerslaw/logistic_regression/sequenced/'}
}

## Action Count

In [9]:
# Parameters for action count
settings_pipeline['data']['pipeline']['encoder'] = '1hot'
settings_pipeline['data']['pipeline']['aggregator'] = 'aveagg'

# Create data
pipeline = PipelineMaker(settings_pipeline)
sequences, labels, indices, id_dictionary = pipeline.build_data()

# Create DF
data_r = []
for i, seq in enumerate(sequences):
    row = [id_dictionary['sequences'][indices[i]]['learner_id']] + list(seq) + [labels[i]]
    data_r.append(row)

columns = ['lid'] + list(sequencer.get_states()) + ['binconcepts']
df_r = pd.DataFrame(data_r, columns=columns)
df_r['permutation'] = df_r['lid'].apply(lambda x: ranks.loc[x]['ranking'])

# Demographics
language_map = {
    'Deutsch': 0, 'Français': 1
}
field_map = {
    'Chemistry': 0, 'Chemistry, Textiles': 1, 'Biology': 2, 'Fast track': 3, 'Pharma Chemistry': 4
}
year_map = {
    '1st': 0,
    '2nd': 1,
    '3rd': 2
}
gender_map = {
    1: 0,
    2: 1, 
    3: 2,
    4: 3
}
df_r['language'] = df_r['lid'].apply(lambda x: language_map[ranks.loc[x]['language']])
df_r['german'] = df_r['language'].apply(lambda x: int(x==0))
df_r['french'] = df_r['language'].apply(lambda x: int(x==1))

df_r['field'] = df_r['lid'].apply(lambda x: field_map[ranks.loc[x]['field']])
df_r['chemistry'] = df_r['field'].apply(lambda x: int(x==0))
df_r['textiles'] = df_r['field'].apply(lambda x: int(x==1))
df_r['biology'] = df_r['field'].apply(lambda x: int(x==2))
df_r['fast'] = df_r['field'].apply(lambda x: int(x==3))
df_r['pharma'] = df_r['field'].apply(lambda x: int(x==4))

df_r['year'] = df_r['lid'].apply(lambda x: year_map[ranks.loc[x]['year']])
df_r['firsty'] = df_r['year'].apply(lambda x: int(x==0))
df_r['secondy'] = df_r['year'].apply(lambda x: int(x==1))
df_r['thirdy'] = df_r['year'].apply(lambda x: int(x==2))

df_r['gender'] = df_r['lid'].apply(lambda x: gender_map[ranks.loc[x]['gender']])
df_r['male'] = df_r['gender'].apply(lambda x: int(x==0))
df_r['female'] = df_r['gender'].apply(lambda x: int(x==1))
df_r['other'] = df_r['gender'].apply(lambda x: int(x>1))

# Action count
df_r.to_csv('../data/beerslaw/logistic_regression/action_count.tsv', sep='\t')

In [12]:
df_r['french']

0      0
1      1
2      0
3      1
4      0
      ..
249    0
250    0
251    0
252    1
253    1
Name: french, Length: 254, dtype: int64

## Action Span

In [11]:
# Parameters for action span
settings_pipeline['data']['pipeline']['encoder'] = 'actionspan'
settings_pipeline['data']['pipeline']['aggregator'] = 'normagg'

# Create data
pipeline = PipelineMaker(settings_pipeline)
sequences, labels, indices, id_dictionary = pipeline.build_data()

# Create DF
data_r = []
for i, seq in enumerate(sequences):
    row = [id_dictionary['sequences'][indices[i]]['learner_id']] + list(seq) + [labels[i]]
    data_r.append(row)

columns = ['lid'] + list(sequencer.get_states()) + ['binconcepts']
df_r = pd.DataFrame(data_r, columns=columns)
df_r['permutation'] = df_r['lid'].apply(lambda x: ranks.loc[x]['ranking'])


# Demographics
language_map = {
    'Deutsch': 0, 'Français': 1
}
field_map = {
    'Chemistry': 0, 'Chemistry, Textiles': 1, 'Biology': 2, 'Fast Track': 3, 'Pharma Chemistry': 4
}
year_map = {
    '1st': 0,
    '2nd': 1,
    '3rd': 2
}
gender_map = {
    1: 0,
    2: 1, 
    3: 2,
    4: 3
}
df_r['language'] = df_r['lid'].apply(lambda x: language_map[ranks.loc[x]['language']])
df_r['german'] = df_r['language'].apply(lambda x: int(x==0))
df_r['french'] = df_r['language'].apply(lambda x: int(x==1))

df_r['field'] = df_r['lid'].apply(lambda x: field_map[ranks.loc[x]['field']])
df_r['chemistry'] = df_r['field'].apply(lambda x: int(x==0))
df_r['textiles'] = df_r['field'].apply(lambda x: int(x==1))
df_r['biology'] = df_r['field'].apply(lambda x: int(x==2))
df_r['fast'] = df_r['field'].apply(lambda x: int(x==3))
df_r['pharma'] = df_r['field'].apply(lambda x: int(x==4))

df_r['year'] = df_r['lid'].apply(lambda x: year_map[ranks.loc[x]['year']])
df_r['firsty'] = df_r['year'].apply(lambda x: int(x==0))
df_r['secondy'] = df_r['year'].apply(lambda x: int(x==1))
df_r['thirdy'] = df_r['year'].apply(lambda x: int(x==2))

df_r['gender'] = df_r['lid'].apply(lambda x: gender_map[ranks.loc[x]['gender']])
df_r['male'] = df_r['gender'].apply(lambda x: int(x==0))
df_r['female'] = df_r['gender'].apply(lambda x: int(x==1))
df_r['other'] = df_r['gender'].apply(lambda x: int(x>1))



# Action count
df_r.to_csv('../data/beerslaw/logistic_regression/action_count.tsv', sep='\t')

[30.431999999999988, 10.731000000000051, 6.547000000000139, 7.370999999999967, 4.385000000000019, 7.5359999999999445, 1.2869999999999777, 0, 2.559000000000026, 12.544999999999959, 2.905999999999949, 1.7459999999999951, 0, 3.961999999999989, 0.049999999999954525, 7.083999999999989, 5.364999999999981, 3.2660000000000196, 2.3209999999999695, 8.224999999999994, 0, 57.670999999999964, 34.206000000000074, 0, 113.02, 38.60300000000004, 71.387, 22.376999999999953, 21.665999999999997]
[0.06376558937910687, 0.022485164945688727, 0.013718234544723372, 0.015444800187742993, 0.00918809507844982, 0.015790532385677774, 0.0026967111438915992, 0.0, 0.005361992087971089, 0.02628612377631747, 0.0060890773769611386, 0.003658475258146698, 0.0, 0.008301763443744111, 0.00010476733270742786, 0.014843435698001856, 0.011241534799517193, 0.006843402172455453, 0.004863299584283161, 0.017234226230387544, 0.0, 0.12084073689151127, 0.07167342765187089, 0.0, 0.2368160788520853, 0.0808866668901704, 0.1495805115998391,

[0, 6.602000000000089, 0, 24.361999999999995, 0, 0, 0, 28.4089999999998, 0, 5.04200000000003, 0, 22.239000000000004, 0, 0, 0, 20.48100000000011, 0, 0, 0, 16.93000000000025, 0, 0, 0, 0, 168.557, 0, 26.537999999999897, 0, 373.4229999999998]
[0.0, 0.009532431491965712, 0.0, 0.03517556740491753, 0.0, 0.0, 0.0, 0.04101891036886525, 0.0, 0.007279993877990118, 0.0, 0.032110230831539334, 0.0, 0.0, 0.0, 0.029571906905020928, 0.0, 0.0, 0.0, 0.02444472359269611, 0.0, 0.0, 0.0, 0.0, 0.2433744403197884, 0.0, 0.03831742910236015, 0.0, 0.5391743661048565]

[0, 0.05000000000001137, 0, 0, 0, 14.484000000000862, 10.229000000000099, 0, 0, 18.2950000000001, 6.046000000000049, 0.9890000000000043, 0, 14.610000000000042, 0, 13.302999999999997, 0, 10.42900000000003, 3.9460000000000264, 0, 0, 0, 78.63899999999998, 22.487000000000023, 105.14000000000001, 0, 115.14499999999973, 21.54599999999988, 19.456000000000003]
[0.0, 0.00010993988487097736, 0.0, 0.0, 0.0, 0.031847385849419374, 0.022491501646899652, 0.0, 0.0

KeyError: 'Fast track'