In [8]:
# Import the tables of the data set as dataframes.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

# Directory layout used by the notebook.
BASE_DATA_DIR = "./data"
ORIGINAL_DATA_DIR = f"{BASE_DATA_DIR}/original/data"
KAGGLE_PATH = "/kaggle/input/filess"

# Mastery table; `event_date` is parsed into datetimes while loading.
mastery_with_dates = pd.read_csv(
    f"{BASE_DATA_DIR}/lernnavi/mastery_per_topic_with_dates.csv",
    parse_dates=["event_date"],
)

# Multiple-choice question/answer table.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
qna = pd.read_pickle(f"{BASE_DATA_DIR}/lernnavi/qna/MULTIPLE_CHOICE_german.pkl")

In [9]:
# Preview the first two rows of both input tables.
display(mastery_with_dates.head(2), qna.head(2))

Unnamed: 0,user_id,event_date,Rechtschreibprinzipien,Gross- und Kleinschreibung,Getrennt- und Zusammenschreibung,Rechtschreiblich schwierige Wörter,Komma bei Aufzählungen,Komma in Satzgefügen,Komma bei direkter Rede und Zitaten,Komma bei Zusätzen,...,Beziehung zwischen Satzgliedern/Attributen und Nebensätzen,Konnektoren,Wortschatz,Idiomatismen,Rhetorik,Stilistische Angemessenheit,Mikrostruktur,Textsorten/Gattungen,Makrostruktur,Schreibprozess
0,432020,2023-02-03 19:50:47.518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,431999,2023-02-03 14:38:18.669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,multiple_responses,question,choices,correct,student_answer,start_time,user_id
35299,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, False, False, False]",2021-05-21 11:16:29.867,393224
35300,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, True, False, True]",2021-05-21 11:16:54.135,393232


In [10]:
# Per-user *expanding* median of every mastery column: each row holds the median
# of that user's values from their first record up to and including that row.
# (The variable is called "rolling" for historical reasons; the window never shrinks.)
#
# An expanding statistic depends on row order, so the records are put in
# chronological order first. The sort is stable, so records sharing a timestamp
# keep their original relative order. `groupby` preserves the row order inside
# each group, and the final `sort_index` restores the original row order.
mastery_with_rolling_median = (
    mastery_with_dates
        .sort_values("event_date", kind="stable")
        .drop(columns="event_date")
        .groupby("user_id")
        .expanding()
        .median()
        # The result is indexed by (user_id, original row label); turn user_id
        # back into a regular column so only the original row label remains.
        .reset_index(level="user_id")
        .sort_index()
        # Re-attach the timestamps; alignment happens on the original row labels.
        .assign(event_date=mastery_with_dates["event_date"])
)

mastery_with_rolling_median.head(2)

Unnamed: 0,user_id,Rechtschreibprinzipien,Gross- und Kleinschreibung,Getrennt- und Zusammenschreibung,Rechtschreiblich schwierige Wörter,Komma bei Aufzählungen,Komma in Satzgefügen,Komma bei direkter Rede und Zitaten,Komma bei Zusätzen,Verben,...,Konnektoren,Wortschatz,Idiomatismen,Rhetorik,Stilistische Angemessenheit,Mikrostruktur,Textsorten/Gattungen,Makrostruktur,Schreibprozess,event_date
0,432020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-02-03 19:50:47.518
1,431999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-02-03 14:38:18.669


In [17]:
def get_mastery_for_closest_preceding_date(row):
    """Look up the latest mastery record of ``row``'s user at or before ``row``'s start time.

    Reads the module-level ``mastery_with_rolling_median`` frame.

    Parameters
    ----------
    row : pd.Series
        Must provide ``user_id`` and ``start_time``.

    Returns
    -------
    pd.Series
        The record of ``mastery_with_rolling_median`` that belongs to
        ``row["user_id"]`` and has the greatest ``event_date`` not later than
        ``row["start_time"]``, with ``event_date`` relabelled ``start_time``
        (the value is still the timestamp of the mastery record).
        If the user has no such record, a series with the same labels in which
        every mastery value is 0 and ``user_id`` / ``start_time`` are copied
        from ``row``.

    Notes
    -----
    Only the "no matching record" case falls back to zeros. Any other failure
    (for example an invalid comparison between ``event_date`` and
    ``start_time``) is raised instead of being silently turned into zeros.
    """
    same_user = mastery_with_rolling_median["user_id"] == row["user_id"]
    not_after_start = mastery_with_rolling_median["event_date"] <= row["start_time"]
    preceding_records = mastery_with_rolling_median[same_user & not_after_start]

    if preceding_records.empty:
        # No history yet for this user at that time: report zero mastery everywhere.
        no_history = pd.Series(
            index=mastery_with_rolling_median.columns.drop("event_date"),
            dtype=np.float64,
        )
        no_history.loc["user_id"] = row["user_id"]
        # Appending the timestamp label turns the series into object dtype.
        no_history.loc["start_time"] = row["start_time"]
        return no_history.fillna(0)

    return (
        preceding_records
            .rename(columns={"event_date": "start_time"})
            .sort_values("start_time", ascending=False)
            .iloc[0]
    )

        
# Walk through the questions in chronological order and fetch, for each one,
# the mastery record returned by get_mastery_for_closest_preceding_date.
# The result keeps the row labels of `qna`.
qna_by_start_time = qna.sort_values("start_time")
rolling_median_synced_with_time_of_questions = qna_by_start_time.apply(
    get_mastery_for_closest_preceding_date,
    axis="columns",
)
rolling_median_synced_with_time_of_questions.head(2)

Unnamed: 0,user_id,Rechtschreibprinzipien,Gross- und Kleinschreibung,Getrennt- und Zusammenschreibung,Rechtschreiblich schwierige Wörter,Komma bei Aufzählungen,Komma in Satzgefügen,Komma bei direkter Rede und Zitaten,Komma bei Zusätzen,Verben,...,Konnektoren,Wortschatz,Idiomatismen,Rhetorik,Stilistische Angemessenheit,Mikrostruktur,Textsorten/Gattungen,Makrostruktur,Schreibprozess,start_time
35299,393224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-05-21 11:16:29.867
35452,388363.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-05-21 11:16:46.783


In [30]:
# Both frames carry `user_id` and `start_time`; the copies coming from the
# mastery side receive this suffix in the merge and are discarded afterwards.
mastery_side_suffix = "_rolling_median"
redundant_mastery_columns = [
    "user_id" + mastery_side_suffix,
    "start_time" + mastery_side_suffix,
]

# Pair every question with its mastery record via the shared row labels.
qna_joined_with_mastery = pd.merge(
    qna,
    rolling_median_synced_with_time_of_questions,
    left_index=True,
    right_index=True,
    suffixes=("", mastery_side_suffix),
)

questions_merged_with_rolling_median = (
    qna_joined_with_mastery
        .drop(columns=redundant_mastery_columns)
        .set_index("user_id")
)
questions_merged_with_rolling_median.head(2)

Unnamed: 0_level_0,multiple_responses,question,choices,correct,student_answer,start_time,Rechtschreibprinzipien,Gross- und Kleinschreibung,Getrennt- und Zusammenschreibung,Rechtschreiblich schwierige Wörter,...,Beziehung zwischen Satzgliedern/Attributen und Nebensätzen,Konnektoren,Wortschatz,Idiomatismen,Rhetorik,Stilistische Angemessenheit,Mikrostruktur,Textsorten/Gattungen,Makrostruktur,Schreibprozess
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
393224,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, False, False, False]",2021-05-21 11:16:29.867,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
393232,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, True, False, True]",2021-05-21 11:16:54.135,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Persist the questions enriched with mastery values next to the input pickle.
questions_with_mastery_path = f"{BASE_DATA_DIR}/lernnavi/qna/MULTIPLE_CHOICE_german_with_mastery.pkl"
questions_merged_with_rolling_median.to_pickle(questions_with_mastery_path)