In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.api.types import is_numeric_dtype

from networkx.drawing.nx_agraph import read_dot
import tqdm

In [36]:
import sys

# Put the milestone-04 directory at the front of the module search path so that
# modules stored there can be imported (presumably where the `utils` helpers
# imported later in this notebook live - confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# prepends a duplicate entry to sys.path every time.
MILESTONE_04_DIR = '../milestone-04'
if MILESTONE_04_DIR not in sys.path:
    sys.path.insert(0, MILESTONE_04_DIR)

In [37]:
# Import the tables of the data set as dataframes.
# Choose which sample to load with the flag below instead of editing an `if 0:`:
#   True  -> the `calcularis_small_*` tables shipped with milestone-04
#            (presumably the random sample - confirm)
#   False -> the uniform sample stored in ./data_uniform
USE_SMALL_DATASET = False

if USE_SMALL_DATASET:
    DATA_DIR = '../milestone-04/data'

    # You can use the nrows=X argument in pd.read_csv to truncate your data
    users = pd.read_csv(f'{DATA_DIR}/calcularis_small_users.csv', index_col=0)
    events = pd.read_csv(f'{DATA_DIR}/calcularis_small_events.csv', index_col=0)
    subtasks = pd.read_csv(f'{DATA_DIR}/calcularis_small_subtasks.csv', index_col=0)
else:
    DATA_DIR = './data_uniform'

    # The uniform-sample files use the SECOND column as index (index_col=1);
    # for users the first column is a leftover positional index that shows up
    # as "Unnamed: 0" when the frame is displayed.
    # NOTE(review): assumed to be the same layout for events/subtasks - confirm.
    users = pd.read_csv(f'{DATA_DIR}/users_uniform_sample.csv', index_col=1)
    events = pd.read_csv(f'{DATA_DIR}/events_uniform_sample.csv', index_col=1)
    subtasks = pd.read_csv(f'{DATA_DIR}/subtasks_uniform_sample.csv', index_col=1)

In [8]:
# Read the DOT file and store it as a NetworkX graph.
# G is passed to create_dataframe_multi_index further down, which forwards it
# to calculate_mastery_level.
# NOTE(review): read_dot is imported from networkx.drawing.nx_agraph, which
# relies on pygraphviz being installed - confirm the environment provides it.
dot_file_path = '../milestone-04/data/04_calcularis_skill_map_dot_file.dot'
G = read_dot(dot_file_path)

In [10]:
# helper methods
from utils import get_nodes, get_skill_id_ranks, choose_ranking, find_user_subtasks, calculate_mastery_level, find_word_in_list, get_random_ids, convert_to_csv, load_from_csv

In [31]:
# Create dataframe with multiindex [user_id, game_name, week]
def create_dataframe_multi_index(G, how_many=100, verbose=False):
    """Build a mastery-level table indexed by (user_id, game_name, week).

    Reads the module-level ``users``, ``events`` and ``subtasks`` dataframes.
    For every user that has at least one subtask, one row is produced for each
    combination of (game found in the subtasks' events) x (week in which that
    user has any subtask):

    * ``mastery_level``: the value returned by ``calculate_mastery_level`` when
      the user played the game that week; otherwise the previous week's value
      for the same game, or 0.0 if there is no previous week.
    * ``mastery_level_diff``: change of ``mastery_level`` with respect to the
      previous week of the SAME game; 0.0 for the user's first week.

    Parameters
    ----------
    G : graph object
        Passed through unchanged to ``calculate_mastery_level``.
    how_many : int, default 100
        Currently ignored (all users are processed); kept so that existing
        calls keep working.
    verbose : bool, default False
        If True, show a tqdm progress bar over the users.

    Returns
    -------
    pandas.DataFrame
        Columns ``['mastery_level', 'mastery_level_diff']`` with a MultiIndex
        named ``['user_id', 'game_name', 'week']``. Empty (but with the same
        columns and index names) when no user has any subtask.
    """
    index_names = ['user_id', 'game_name', 'week']
    value_columns = ['mastery_level', 'mastery_level_diff']

    subtasks_events = subtasks.merge(events, on='event_id')

    # The set of games does not depend on the user, so look it up once instead
    # of once per user.
    game_names = events.loc[subtasks['event_id']]['game_name'].unique()

    # Only the user id is needed, so iterate over the index rather than rows.
    user_ids = tqdm.tqdm(users.index, total=len(users), colour='#C83D95') if verbose else users.index

    # Rows are collected in a plain list and turned into a dataframe once at
    # the end; concatenating inside the loop is quadratic in the number of users.
    records = []
    for user_id in user_ids:
        user_subtasks = find_user_subtasks(subtasks_events, user_id)[['week_number', 'game_name', 'skill_id', 'correct']]

        # Sort the weeks so that "previous week" really is the chronologically
        # preceding one (unique() alone returns them in order of appearance).
        weeks = np.sort(user_subtasks['week_number'].unique())
        if len(weeks) == 0:
            # User without any subtask: contributes no rows.
            continue

        for game in game_names:
            game_subtasks = user_subtasks[user_subtasks['game_name'] == game]
            previous_level = 0.0
            for position, week in enumerate(weeks):
                week_subtasks = game_subtasks[game_subtasks['week_number'] == week]

                if not week_subtasks.empty:
                    # The user played the game during the week.
                    # NOTE(review): averaging skill ids assumes they are ordinal -
                    # confirm against calculate_mastery_level.
                    mean_skill = week_subtasks['skill_id'].mean()
                    level = calculate_mastery_level(G, user_subtasks, week, game, mean_skill)
                else:
                    # Not played this week: carry the previous week's level
                    # forward (0.0 when this is the user's first week).
                    level = previous_level

                # The difference is taken within one game only, so the first
                # week of a game never sees the last week of the previous game.
                level_diff = 0.0 if position == 0 else level - previous_level

                records.append((user_id, game, week, level, level_diff))
                previous_level = level

    return pd.DataFrame.from_records(records, columns=index_names + value_columns).set_index(index_names)

# Compute the mastery table for every user with the progress bar enabled,
# then write the result to disk so later cells can reload it.
df = create_dataframe_multi_index(G, how_many=len(users), verbose=True)
convert_to_csv(df, 'dataframe_uniform.csv')

100%|[38;2;200;61;149m██████████[0m| 1000/1000 [16:07<00:00,  1.03it/s]


In [33]:
# Reload the saved table and count the distinct users it contains.
df = load_from_csv('dataframe_uniform.csv')
df.reset_index()['user_id'].nunique(dropna=False)

940

In [47]:
# Index labels of the first 150 users whose country is "CH".
users.index[users['country'] == 'CH'][:150]

Int64Index([  123,   257,   594,  1148,  1206,  1227,  1653,  2246,  2443,
             2517,
            ...
            38921, 39725, 39943, 40148, 40166, 40580, 40592, 41006, 41691,
            42880],
           dtype='int64', name='user_id', length=150)

In [43]:
# Scratch check: 600 / 4 = 150, the per-country size of the training slice
# (`index[:150]`) used in the split cell below.
# Presumably 600 training users spread over the 4 countries - confirm.
600/4

150.0

In [44]:
# Scratch check: 200 / 4 = 50, the per-country size of the validation slice
# (`index[150:200]`) used in the split cell below.
# Presumably 200 validation users spread over the 4 countries - confirm.
200/4

50.0

In [48]:
# Distinct country codes present in the users table.
users['country'].unique()

array(['NL', 'CH', 'CA', 'DE'], dtype=object)

In [49]:
# Split the users into train / validation / test sets, country by country.
# Within each country the users are taken in the order of the `users` table:
# the first N_TRAIN_PER_COUNTRY go to train, the next N_VALID_PER_COUNTRY to
# validation and every remaining one to test.
N_TRAIN_PER_COUNTRY = 150
N_VALID_PER_COUNTRY = 50

train_user_ids = []
valid_user_ids = []
test_user_ids = []
for country in users['country'].unique():
    # Select this country's users once; a boolean mask avoids building a query
    # string, which would break on values that contain a double quote.
    country_user_ids = users.index[users['country'] == country].tolist()
    valid_end = N_TRAIN_PER_COUNTRY + N_VALID_PER_COUNTRY

    train_user_ids.extend(country_user_ids[:N_TRAIN_PER_COUNTRY])
    valid_user_ids.extend(country_user_ids[N_TRAIN_PER_COUNTRY:valid_end])
    test_user_ids.extend(country_user_ids[valid_end:])

In [53]:
# Rows of the users table that belong to the training split.
users.loc[users.index.isin(train_user_ids)]

Unnamed: 0_level_0,Unnamed: 0,learning_time_ms,logged_in_time_ms,language,country,start,end
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
34,32,43987892,129830922,nl,NL,2022-07-06T06:37:22.930Z,2022-12-13T16:32:34.573Z
123,121,56796419,92098216,de,CH,2019-09-02T08:55:50.353Z,2021-06-14T08:06:06.157Z
257,255,30279516,51032678,de,CH,2022-06-08T07:51:07.097Z,2023-02-06T09:09:06.504Z
356,354,14103687,34441128,fr,CA,2020-01-13T13:26:17.857Z,2020-05-13T14:12:06.474Z
535,533,31446639,137101673,nl,NL,2021-05-27T06:48:52.545Z,2022-11-16T12:58:02.148Z
...,...,...,...,...,...,...,...
40580,40538,73988135,140322376,de,CH,2019-10-03T06:44:10.079Z,2021-05-15T14:35:13.514Z
40592,40550,15302416,40350857,de,CH,2020-06-18T08:26:44.100Z,2022-03-23T13:38:01.459Z
41006,40964,90656028,145184478,de,CH,2020-11-24T08:13:37.706Z,2022-10-05T05:06:39.827Z
41691,41648,27919422,62380822,de,CH,2019-03-16T07:24:42.321Z,2023-02-05T15:58:40.098Z
