# Introduction

In this notebook we will explore the data and try to find some insights. We will also try to find some patterns in the data which will help us in feature engineering and model building for later stages.

# Setup

In [None]:
%pip install numpy pandas matplotlib seaborn

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Setup matplotlib
pd.plotting.register_matplotlib_converters()
%matplotlib inline

In [None]:
# Input CSV files, all relative to the notebook's working directory.
train_csv_path = "./data/train.csv"
test_csv_path = "./data/test.csv"

# Labels file that accompanies the training data.
target_labels_csv = "./data/train_labels.csv"

# Sample submission file.
sample_submission_csv = "./data/sample_submission.csv"

# Data Loading

In [None]:
# Ordered categorical dtype for `level_group`: the categories are listed in
# their intended order ("0-4" < "5-12" < "13-22") instead of relying on
# alphabetical string ordering.
level_group_order = ["0-4", "5-12", "13-22"]
level_group_cat_type = pd.CategoricalDtype(level_group_order, ordered=True)

In [None]:
# Explicit per-column dtypes for the training CSV. Narrow integer/float widths
# and categoricals are requested up front; `page` uses the nullable "Int8"
# dtype so that missing values can be represented in an integer column.
train_dtypes = {
    "session_id": "int64",
    "elapsed_time": "int32",
    "event_name": "category",
    "name": "category",
    "level": "int8",
    "page": "Int8",
    "room_coor_x": "float32",
    "room_coor_y": "float32",
    "screen_coor_x": "float32",
    "screen_coor_y": "float32",
    "hover_duration": "float32",
    "text": "str",
    "fqid": "category",
    "room_fqid": "category",
    "text_fqid": "category",
    "fullscreen": "int8",
    "hq": "int8",
    "music": "int8",
    "level_group": level_group_cat_type,
}

# Read the training data, using the CSV's `index` column as the frame index.
train_df = pd.read_csv(train_csv_path, index_col="index", dtype=train_dtypes)

In [None]:
# Read the labels CSV with pandas' default dtype inference.
target_df = pd.read_csv(filepath_or_buffer=target_labels_csv)

In [None]:
# Split the raw `session_id` strings on "_": the first piece is written back to
# `session_id` and the second piece becomes the new `question_number` column.
session_id_parts = target_df["session_id"].str.split("_", expand=True)
target_df[["session_id", "question_number"]] = session_id_parts

In [None]:
# Get the question number as int.
# The pattern is a raw string so that `\d` is not parsed as an (invalid) string
# escape sequence, and `expand=False` makes `str.extract` return a Series
# instead of a one-column DataFrame before it is assigned to a single column.
target_df["question_number"] = (
    target_df["question_number"].str.extract(r"(\d+)", expand=False).astype("int8")
)

It's not necessary, but we are going to convert the `correct` column to a boolean type.

In [None]:
# Store the `correct` column with the boolean dtype.
target_df["correct"] = target_df["correct"].astype(bool)

# EDA

In [None]:
# Count the missing values of every column and display the result.
missing_per_column = train_df.isna().sum()
missing_per_column

In [None]:
# Plot the number of missing values per column, most affected columns first.
# The counts are computed once and reused for x, y and the bar order, instead
# of scanning the whole frame with `isna().sum()` three separate times.
nan_counts = train_df.isna().sum().sort_values(ascending=False)

plt.figure(figsize=(12, 8))

sns.barplot(
    x=nan_counts.index,
    y=nan_counts.values,
    order=nan_counts.index,
    palette="mako",
)

plt.xlabel("Column")
plt.ylabel("Number of missing values")
plt.title("Missing values per column")

# Column names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)

plt.show()

As we can see, some columns like `page`, `hover_duration`, `text` and `text_fqid` have a lot of missing values. We'll see later how to deal with this and whether it is a problem or not.

In [None]:
# Count plot of the answers for each question, split by the `correct` flag.
fig, ax = plt.subplots(figsize=(12, 8))

sns.countplot(
    data=target_df,
    x="question_number",
    hue="correct",
    palette="mako",
    ax=ax,
)

ax.set_xlabel("Question number")
ax.set_ylabel("Answers")
ax.set_title("Number of correct answers per question number")

plt.show()

In [None]:
# Plot the distribution of the number of events recorded per session.
# `value_counts()` yields one row per session holding its event count.
events_per_session = train_df["session_id"].value_counts()

plt.figure(figsize=(12, 8))

# The counts are passed directly as a vector; no `data=` frame is needed.
sns.histplot(
    x=events_per_session,
    kde=True,
    color=sns.color_palette("mako")[0],
)

plt.xlabel("Number of events per session")
# Each histogram observation is one session, so the bar height counts sessions.
plt.ylabel("Number of sessions")
# Only the 500-2500 range of the x-axis is displayed.
plt.xlim(500, 2500)

plt.title("Number of events by session")
plt.show()

In [None]:
# Mean number of rows (events) recorded per `session_id`.
session_event_counts = train_df["session_id"].value_counts()
session_event_counts.mean()

In [None]:
# Summary statistics of the `elapsed_time` column.
elapsed_time_summary = train_df["elapsed_time"].describe()
elapsed_time_summary

In [None]:
# Create a new Series with the elapsed time in hours (no grouping is applied;
# every row of the frame is converted individually).
# NOTE(review): dividing by 3600 yields hours only if `elapsed_time` is stored
# in seconds; if the column is in milliseconds the divisor should be
# 3_600_000 — confirm against the dataset description.
elapsed_time_hours = train_df["elapsed_time"] / 3600

# Plot the distribution of the converted elapsed time.
plt.figure(figsize=(12, 8))

# The Series is passed directly as a vector; no `data=` frame is needed.
sns.histplot(
    x=elapsed_time_hours,
    kde=True,
    color=sns.color_palette("mako")[0],
)

plt.xlabel("Elapsed time (hours)")
plt.ylabel("Events")
plt.title("Distribution of elapsed time")

plt.show()

In [None]:
# Horizontal count plot of `event_name`, ordered by how often each value occurs.
fig, ax = plt.subplots(figsize=(12, 8))

event_order = train_df["event_name"].value_counts().index

sns.countplot(
    data=train_df,
    y="event_name",
    order=event_order,
    palette="mako",
    ax=ax,
)

plt.show()

In [None]:
# Row count for each `level_group`, arranged by index (category) order rather
# than by frequency.
level_group_frequencies = train_df["level_group"].value_counts()
counts = level_group_frequencies.sort_index()
counts

In [None]:
# Bar chart of the per-`level_group` counts held in `counts`.
fig, ax = plt.subplots(figsize=(12, 8))

sns.barplot(x=counts.index, y=counts.values, palette="mako", ax=ax)

# Put the y ticks exactly at the bar heights and label them with the counts.
ax.set_yticks(counts.values)
ax.set_yticklabels(counts.values)

plt.show()

In [None]:
# Plot number of events per level.
# The counts are taken from the `level` column; the `counts` variable holds the
# `level_group` counts and would not show a per-level breakdown.
level_counts = train_df["level"].value_counts().sort_index()

plt.figure(figsize=(12, 8))

plt.xlabel("Level")
plt.ylabel("Number of events")
plt.title("Number of events per level")

sns.barplot(
    x=level_counts.index,
    y=level_counts.values,
    palette="mako",
)