In [1]:
import os
import pickle
import yaml

import numpy as np
import pandas as pd

# Demographics

In [2]:
# Load the per-learner rankings table.
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so this path must only ever point at a trusted, locally generated file.
rankings_path = '../../data/beerslaw/features/raw/rankings.pkl'
with open(rankings_path, 'rb') as fp:
    ranks = pickle.load(fp)

# Keep only the columns used downstream and index the table by username, so a
# learner's row can be fetched with demographics.loc[<learner id>].
demographic_columns = [
    'username', 'language', 'field', 'level', 'year', 'gender', 'ranking'
]
demographics = ranks[demographic_columns].set_index('username')

In [3]:
# Label maps used to turn a learner's ranking into a binary class index.
# Each YAML file is read below through its 'map' and 'target_index' entries.
# NOTE(review): yaml.FullLoader is more permissive than yaml.safe_load; keep
# these files trusted, or confirm they contain only plain YAML and switch.
binconcepts_path = '../../data/beerslaw/features/raw/labelmaps/nconcepts_binary.yaml'
binvector_path = '../../data/beerslaw/features/raw/labelmaps/vector_binary.yaml'

with open(binconcepts_path, 'r') as fp:
    binconcepts = yaml.load(fp, Loader=yaml.FullLoader)

with open(binvector_path, 'r') as fp:
    binvector = yaml.load(fp, Loader=yaml.FullLoader)

# Simple States

In [4]:
# Raw "simple states" artefacts: the sequences, the indices they belong to,
# and the id dictionary that resolves an index to its learner.
# NOTE(security): pickle.load must only be used on trusted files.
folder = '../../data/beerslaw/features/raw/simplestates/'

with open(os.path.join(folder, 'sequences.pkl'), 'rb') as fp:
    sequences = pickle.load(fp)

with open(os.path.join(folder, 'indices.pkl'), 'rb') as fp:
    indices = pickle.load(fp)

with open(os.path.join(folder, 'id_dictionary.pkl'), 'rb') as fp:
    idd = pickle.load(fp)

In [5]:
# Build one record per learner, combining the learner's sequence with their
# demographics and binary labels. Records are stored under dense 0-based
# integer keys (j), so skipped learners leave no gaps.
# Assumes sequences is positionally aligned with indices - TODO confirm.
full_data = {}

j = 0
for i, idx in enumerate(indices):
    lid = idd['sequences'][idx]['learner_id']

    # Fetch the learner's demographics row once instead of repeating the
    # .loc lookup for every attribute read below.
    learner = demographics.loc[lid]

    # Learners whose field is 'Fast track' are left out of the dataset.
    if learner['field'] == 'Fast track':
        continue

    ranking = learner['ranking']

    # Two-step lookup: ranking -> label via 'map', then label -> index via
    # 'target_index'.
    binconcepts_label = binconcepts['map'][ranking]
    binconcepts_label = binconcepts['target_index'][binconcepts_label]

    binvector_label = binvector['map'][ranking]
    binvector_label = binvector['target_index'][binvector_label]

    student = {
        'sequence': sequences[i],
        'idx': idx,
        'learner_id': lid,
        'gender': learner['gender'],
        'language': learner['language'],
        'field': learner['field'],
        'year': learner['year'],
        'ranking': ranking,
        'binconcepts': binconcepts_label,
        'binvector': binvector_label
    }
    full_data[j] = student
    j += 1

# Metadata entry stored next to the integer-keyed records: the demographic
# attributes present in every record. Code iterating over full_data must skip
# this string key.
full_data['available_demographics'] = [
    'language', 'gender', 'field', 'year'
]

In [6]:
# Persist the assembled simple-states records (plus the
# 'available_demographics' entry) as a single pickle.
output_path = '../../data/beerslaw/features/simplestates_sequences.pkl'
with open(output_path, 'wb') as fp:
    pickle.dump(full_data, fp)

# Simple More States

In [7]:
# Raw "simple more states" artefacts: the sequences, the indices they belong
# to, and the id dictionary that resolves an index to its learner.
# These rebind sequences, indices and idd from the previous section.
# NOTE(security): pickle.load must only be used on trusted files.
folder = '../../data/beerslaw/features/raw/simplemorestates/'

with open(os.path.join(folder, 'sequences.pkl'), 'rb') as fp:
    sequences = pickle.load(fp)

with open(os.path.join(folder, 'indices.pkl'), 'rb') as fp:
    indices = pickle.load(fp)

with open(os.path.join(folder, 'id_dictionary.pkl'), 'rb') as fp:
    idd = pickle.load(fp)

In [8]:
# Build one record per learner for the "simple more states" sequences,
# combining the sequence with the learner's demographics and binary labels.
# Records are stored under dense 0-based integer keys (j), so skipped
# learners leave no gaps.
# Assumes sequences is positionally aligned with indices - TODO confirm.
full_data = {}

j = 0
for i, idx in enumerate(indices):
    lid = idd['sequences'][idx]['learner_id']

    # Fetch the learner's demographics row once instead of repeating the
    # .loc lookup for every attribute read below.
    learner = demographics.loc[lid]

    # Learners whose field is 'Fast track' are left out of the dataset.
    if learner['field'] == 'Fast track':
        continue

    ranking = learner['ranking']

    # Two-step lookup: ranking -> label via 'map', then label -> index via
    # 'target_index'.
    binconcepts_label = binconcepts['map'][ranking]
    binconcepts_label = binconcepts['target_index'][binconcepts_label]

    binvector_label = binvector['map'][ranking]
    binvector_label = binvector['target_index'][binvector_label]

    student = {
        'sequence': sequences[i],
        'idx': idx,
        'learner_id': lid,
        'gender': learner['gender'],
        'language': learner['language'],
        'field': learner['field'],
        'year': learner['year'],
        'ranking': ranking,
        'binconcepts': binconcepts_label,
        'binvector': binvector_label
    }
    full_data[j] = student
    j += 1

# Metadata entry stored next to the integer-keyed records: the demographic
# attributes present in every record. Code iterating over full_data must skip
# this string key.
full_data['available_demographics'] = [
    'language', 'gender', 'field', 'year'
]

In [9]:
# Persist the assembled simple-more-states records (plus the
# 'available_demographics' entry) as a single pickle.
output_path = '../../data/beerslaw/features/simplemorestates_sequences.pkl'
with open(output_path, 'wb') as fp:
    pickle.dump(full_data, fp)

In [10]:
# Sanity check: list the distinct 'field' values that remain in the dataset
# (the 'Fast track' learners were dropped while building full_data).
# The 'available_demographics' entry is metadata, not a learner record, so it
# is skipped.
fields = [
    record['field']
    for key, record in full_data.items()
    if key != 'available_demographics'
]
np.unique(fields)

array(['Biology', 'Chemistry', 'Chemistry, Textiles', 'Pharma Chemistry'],
      dtype='<U19')