# Requirements

In [None]:
import numpy as np
import pandas as pd

# Single source of truth for the random seed, so every run is reproducible.
SEED = 42
np.random.seed(SEED)

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, log_loss

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Data Preprocessing and Cleaning

## Preprocessing

The data's first two rows are headers.
The first row contains the configuration numbers, one in every other column (each configuration spans two columns).
The second row contains the completion time and mistakes under each configuration number column.

In [None]:
# Load the raw export. pandas treats only the first line as the header, so the
# second header row described above ends up in `df` as data row 0.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

In [None]:
# Reshape the wide export into long format: one row per
# (participant, configuration) pair, with the columns
# nick, config_num, completion_time and mistakes.
# The long frame therefore has 7x as many rows as there are participants.

LONG_COLUMNS = ['nick', 'config_num', 'completion_time', 'mistakes']
NUM_CONFIGS = 7  # config_num runs 0..6 (presumably 0 is the baseline)

# read_csv consumed only the first header row, so the second header row is
# still sitting in df as row 0; participant data starts at row 1.
num_participants = df.shape[0] - 1

# Collect plain rows and build the frame once at the end. Calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration
# (quadratic) and leaves every row with the same index label 0, which makes
# label-based selection and dropping unreliable.
long_rows = []
for participant_row in range(1, num_participants + 1):
    nick = df.iloc[participant_row, 0]

    for config_num in range(NUM_CONFIGS):
        # Column 0 holds the nick. After it, each config owns two adjacent
        # columns: completion time first, mistakes second.
        completion_time_col = 2 * config_num + 1
        mistakes_col = completion_time_col + 1

        long_rows.append([
            nick,
            config_num,
            df.iloc[participant_row, completion_time_col],
            df.iloc[participant_row, mistakes_col],
        ])

# Built in one shot, the result gets a unique 0..n-1 index.
converted_df: pd.DataFrame = pd.DataFrame(long_rows, columns=LONG_COLUMNS)

converted_df.head(14)

Our data currently identifies each trial only by its configuration number, which by itself does not tell us enough. We can add three new variables (`size`, `color`, and `position`) to give us more insight into the data.
- The baseline by default has `size: regular`, `color: yellow`, and `position: top`.
- Configs 1 and 2 change the size to `small` and `large`
- Configs 3 and 4 change the color to `blue` and `black`
- Configs 5 and 6 change the position to `sticky` and `bottom`

## Cleaning

The data currently has outliers so we have to remove them from our dataset.

# Model Training