### **Import Libraries and Modules**

In [4]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as stats
from pandas.tseries.holiday import USFederalHolidayCalendar
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from zipfile import ZipFile


 ### **Loading the test and training data**

In [34]:
# Load training data from a CSV file
train = pd.read_csv("train_data.csv")

# Load testing data from a CSV file
test = pd.read_csv('test_data.csv')

# Print the count of missing values in each column of the training data
print("Missing values in training data:")
print(train.isna().sum())

print()

# Print the count of missing values in each column of the testing data
print("Missing values in testing data:")
print(test.isna().sum())

Missing values in training data:
UserID                        0
QuestionTiming                0
TimeUtc                       0
CurrentGameMode           17013
CurrentTask               17013
CurrentSessionLength          0
LastTaskCompleted         62605
LevelProgressionAmount    17018
QuestionType                  0
ResponseValue                 0
dtype: int64

Missing values in testing data:
UserID                        0
QuestionTiming                0
TimeUtc                       0
CurrentGameMode           10333
CurrentTask               10333
CurrentSessionLength          0
LastTaskCompleted         36801
LevelProgressionAmount    10335
QuestionType                  0
dtype: int64



# **Preprocessing and EDA (Exploratory Data Analysis)**


### **Removing negative values in CurrentSessionLength**

In [56]:
# Remove rows where 'CurrentSessionLength' is less than zero
# This might be necessary if negative values are errors and not valid data points
train = train.drop(train[train['CurrentSessionLength'] < 0].index)

In [55]:
train.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,User Initiated,2022-08-18 22:55:27,Career,RECREATIONGROUND_PLAYGROUND,...,0.87,0.43,-0.9,0.0,726.08
1,p1,System Initiated,2022-08-18 23:38:31,Career,RECREATIONGROUND_PLAYGROUND,...,0.97,0.43,-0.9,0.0,726.08
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,...,0.97,0.43,-0.9,0.0,726.08
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08


### **Removing outliers using IQR filtering in CurrentSessionLength**

In [58]:
# Calculate the first quartile (Q1)
Q1 = train['CurrentSessionLength'].quantile(0.25)

# Calculate the third quartile (Q3)
Q3 = train['CurrentSessionLength'].quantile(0.75)

# The Interquartile Range (IQR) is the difference between the third and first quartiles
IQR = Q3 - Q1

# Define lower and upper bounds for filtering outliers
# Outliers are defined as points below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter the data to include only values within the lower and upper bounds
train = train[(train['CurrentSessionLength'] >= lower_bound) & (train['CurrentSessionLength'] <= upper_bound)]

In [57]:
train.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,User Initiated,2022-08-18 22:55:27,Career,RECREATIONGROUND_PLAYGROUND,...,0.87,0.43,-0.9,0.0,726.08
1,p1,System Initiated,2022-08-18 23:38:31,Career,RECREATIONGROUND_PLAYGROUND,...,0.97,0.43,-0.9,0.0,726.08
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,...,0.97,0.43,-0.9,0.0,726.08
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08


### **Median Imputation for LevelProgressionAmount**

In [59]:
# Calculate the median of the 'LevelProgressionAmount' in the training data
median_value = train['LevelProgressionAmount'].median()

# Fill missing values in the 'LevelProgressionAmount' column of the training data with the calculated median
train['LevelProgressionAmount'].fillna(median_value, inplace=True)

# Calculate the median of the 'LevelProgressionAmount' for the testing data
median_value = test['LevelProgressionAmount'].median()

# Fill missing values in the 'LevelProgressionAmount' column of the testing data with its median
test['LevelProgressionAmount'].fillna(median_value, inplace=True)

In [60]:
train.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,User Initiated,2022-08-18 22:55:27,Career,RECREATIONGROUND_PLAYGROUND,...,0.87,0.43,-0.9,0.0,726.08
1,p1,System Initiated,2022-08-18 23:38:31,Career,RECREATIONGROUND_PLAYGROUND,...,0.97,0.43,-0.9,0.0,726.08
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,...,0.97,0.43,-0.9,0.0,726.08
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08


In [61]:
test.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,System Initiated,2022-08-28 15:50:22,,,...,-0.71,-0.78,0.62,0.0,726.08
1,p1,User Initiated,2022-08-28 16:05:02,Career,RECREATIONGROUND_SKATEPARK,...,-0.5,-0.78,0.62,0.0,726.08
2,p1,User Initiated,2022-09-07 03:31:50,,,...,0.71,0.97,-0.22,0.0,726.08
3,p1,System Initiated,2022-09-08 01:30:05,,,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-09-08 01:43:45,Career,RECREATIONGROUND_SKATEPARK,...,0.97,0.43,-0.9,0.0,726.08


### **Feature-Engineering TimeUtc**

In [62]:

# Convert the 'TimeUtc' column to datetime format to enable datetime operations
train['TimeUtc'] = pd.to_datetime(train['TimeUtc'])

# Extracting various time components from the 'TimeUtc' column
train['Second'] = train['TimeUtc'].dt.second  # Extract seconds from 'TimeUtc'
train['Minute'] = train['TimeUtc'].dt.minute  # Extract minutes from 'TimeUtc'
train['Hour'] = train['TimeUtc'].dt.hour      # Extract hours from 'TimeUtc'
train['Day'] = train['TimeUtc'].dt.day        # Extract day of month from 'TimeUtc'
train['DayOfWeek'] = train['TimeUtc'].dt.dayofweek  # Extract day of the week (Monday=0, Sunday=6)

# Determine if the day is a weekend (Saturday or Sunday)
train['IsWeekend'] = train['DayOfWeek'] >= 5  # Boolean value, True if day is a weekend

train['Month'] = train['TimeUtc'].dt.month    # Extract month from 'TimeUtc'
train['Quarter'] = train['TimeUtc'].dt.quarter  # Extract quarter from 'TimeUtc'

# Calculate the elapsed time in seconds from the earliest timestamp in the data
train['ElapsedTime'] = (train['TimeUtc'] - train['TimeUtc'].min()).dt.total_seconds()

# Creating cyclic features for 'Hour' to capture the cyclical nature of time
train['Hour_sin'] = np.sin(2 * np.pi * train['Hour'] / 24)  # Sine transformation for hour
train['Hour_cos'] = np.cos(2 * np.pi * train['Hour'] / 24)  # Cosine transformation for hour

# Creating cyclic features for 'DayOfWeek' to capture the cyclical nature of days in a week
train['DayOfWeek_sin'] = np.sin(2 * np.pi * train['DayOfWeek'] / 7)  # Sine transformation for day of the week
train['DayOfWeek_cos'] = np.cos(2 * np.pi * train['DayOfWeek'] / 7)  # Cosine transformation for day of the week

In [63]:
train.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,User Initiated,2022-08-18 22:55:27,Career,RECREATIONGROUND_PLAYGROUND,...,0.87,0.43,-0.9,0.0,726.08
1,p1,System Initiated,2022-08-18 23:38:31,Career,RECREATIONGROUND_PLAYGROUND,...,0.97,0.43,-0.9,0.0,726.08
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,...,0.97,0.43,-0.9,0.0,726.08
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08


In [64]:
# Convert the 'TimeUtc' column to datetime format to enable datetime operations
test['TimeUtc'] = pd.to_datetime(test['TimeUtc'])

# Extracting various time components from the 'TimeUtc' column
test['Second'] = test['TimeUtc'].dt.second  # Extract seconds from 'TimeUtc'
test['Minute'] = test['TimeUtc'].dt.minute  # Extract minutes from 'TimeUtc'
test['Hour'] = test['TimeUtc'].dt.hour      # Extract hours from 'TimeUtc'
test['Day'] = test['TimeUtc'].dt.day        # Extract day of month from 'TimeUtc'
test['DayOfWeek'] = test['TimeUtc'].dt.dayofweek  # Extract day of the week (Monday=0, Sunday=6)

# Determine if the day is a weekend (Saturday or Sunday)
test['IsWeekend'] = test['DayOfWeek'] >= 5  # Boolean value, True if day is a weekend

test['Month'] = test['TimeUtc'].dt.month    # Extract month from 'TimeUtc'
test['Quarter'] = test['TimeUtc'].dt.quarter  # Extract quarter from 'TimeUtc'

# Calculate the elapsed time in seconds from the earliest timestamp in the data
test['ElapsedTime'] = (test['TimeUtc'] - test['TimeUtc'].min()).dt.total_seconds()

# Creating cyclic features for 'Hour' to capture the cyclical nature of time
test['Hour_sin'] = np.sin(2 * np.pi * test['Hour'] / 24)  # Sine transformation for hour
test['Hour_cos'] = np.cos(2 * np.pi * test['Hour'] / 24)  # Cosine transformation for hour

# Creating cyclic features for 'DayOfWeek' to capture the cyclical nature of days in a week
test['DayOfWeek_sin'] = np.sin(2 * np.pi * test['DayOfWeek'] / 7)  # Sine transformation for day of the week
test['DayOfWeek_cos'] = np.cos(2 * np.pi * test['DayOfWeek'] / 7)  # Cosine transformation for day of the week

In [65]:
test.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,System Initiated,2022-08-28 15:50:22,,,...,-0.71,-0.78,0.62,0.0,726.08
1,p1,User Initiated,2022-08-28 16:05:02,Career,RECREATIONGROUND_SKATEPARK,...,-0.5,-0.78,0.62,0.0,726.08
2,p1,User Initiated,2022-09-07 03:31:50,,,...,0.71,0.97,-0.22,0.0,726.08
3,p1,System Initiated,2022-09-08 01:30:05,,,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-09-08 01:43:45,Career,RECREATIONGROUND_SKATEPARK,...,0.97,0.43,-0.9,0.0,726.08


### **Frequency Encoding UserID**

In [66]:
# Frequency encoding for 'UserID' based on its occurrence in the training data
# Calculate the frequency of each user ID and normalize it to create a probability distribution
frequency_encoding = train['UserID'].value_counts(normalize=True).to_dict()

# Map the frequency values to the 'UserID' in the training dataset
train['UserID_encoded'] = train['UserID'].map(frequency_encoding)

# Apply the same mapping to the 'UserID' in the testing dataset
test['UserID_encoded'] = test['UserID'].map(frequency_encoding)

# Fill missing values in the testing data 'UserID_encoded' column with 0
# Missing values might occur if some UserIDs in the test data were not present in the training data
test['UserID_encoded'].fillna(0, inplace=True)

In [68]:
train.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,User Initiated,2022-08-18 22:55:27,Career,RECREATIONGROUND_PLAYGROUND,...,0.87,0.43,-0.9,0.0,726.08
1,p1,System Initiated,2022-08-18 23:38:31,Career,RECREATIONGROUND_PLAYGROUND,...,0.97,0.43,-0.9,0.0,726.08
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,...,0.97,0.43,-0.9,0.0,726.08
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.08


In [70]:
test.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,System Initiated,2022-08-28 15:50:22,,,...,-0.71,-0.78,0.62,0.0,726.08
1,p1,User Initiated,2022-08-28 16:05:02,Career,RECREATIONGROUND_SKATEPARK,...,-0.5,-0.78,0.62,0.0,726.08
2,p1,User Initiated,2022-09-07 03:31:50,,,...,0.71,0.97,-0.22,0.0,726.08
3,p1,System Initiated,2022-09-08 01:30:05,,,...,0.97,0.43,-0.9,0.0,726.08
4,p1,System Initiated,2022-09-08 01:43:45,Career,RECREATIONGROUND_SKATEPARK,...,0.97,0.43,-0.9,0.0,726.08


 ### **Target Encoding UserID**

In [71]:
# Prepare target  for the 'UserID' based on the 'ResponseValue'
encoder = TargetEncoder(smoothing=0.7)  # Initialize the TargetEncoder with smoothing to balance categorical average vs prior

# Apply the encoder to the 'UserID' column of the training data
# The encoder learns the mean target value for each category in 'UserID'
train['UserID_targetencoded'] = encoder.fit_transform(train['UserID'], train['ResponseValue'])

# Transform the 'UserID' column of the testing data using the learned encoding
# This ensures consistency in encoding between training and testing datasets
test['UserID_targetencoded'] = encoder.transform(test['UserID'])

# Calculate the global mean of the 'ResponseValue' from the training data
# This is used to fill in missing values for any 'UserID' that appears in the test set but not in the training set
global_mean = train['ResponseValue'].mean()

# Fill missing values in the 'UserID_targetencoded' column of the testing data with the global mean
# Missing values may occur if 'UserID' in the test data wasn't present in the training data
test['UserID_targetencoded'].fillna(global_mean, inplace=True)

In [72]:
train.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,User Initiated,2022-08-18 22:55:27,Career,RECREATIONGROUND_PLAYGROUND,...,0.87,0.43,-0.9,0.0,726.05
1,p1,System Initiated,2022-08-18 23:38:31,Career,RECREATIONGROUND_PLAYGROUND,...,0.97,0.43,-0.9,0.0,726.05
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,...,0.97,0.43,-0.9,0.0,726.05
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.05
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,...,0.97,0.43,-0.9,0.0,726.05


In [73]:
test.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,Hour_cos,DayOfWeek_sin,DayOfWeek_cos,UserID_encoded,UserID_targetencoded
0,p1,System Initiated,2022-08-28 15:50:22,,,...,-0.71,-0.78,0.62,0.0,726.05
1,p1,User Initiated,2022-08-28 16:05:02,Career,RECREATIONGROUND_SKATEPARK,...,-0.5,-0.78,0.62,0.0,726.05
2,p1,User Initiated,2022-09-07 03:31:50,,,...,0.71,0.97,-0.22,0.0,726.05
3,p1,System Initiated,2022-09-08 01:30:05,,,...,0.97,0.43,-0.9,0.0,726.05
4,p1,System Initiated,2022-09-08 01:43:45,Career,RECREATIONGROUND_SKATEPARK,...,0.97,0.43,-0.9,0.0,726.05


 ### **Aggregated summary statistcs with UserID and ResponseValue**

In [74]:
# Calculate aggregated features: mean, minimum, and maximum of 'ResponseValue' for each 'UserID'
# This provides a statistical summary of the 'ResponseValue' which can be useful features for models
agg_features = train.groupby('UserID')['ResponseValue'].agg(['mean', 'min', 'max']).reset_index()

# Rename the columns for clarity
agg_features.columns = ['UserID', 'UserID_mean', 'UserID_min', 'UserID_max']

# Merge the aggregated features back into the training data
# This enriches the training data with historical summary statistics which may be relevant for prediction
train = train.merge(agg_features, on='UserID', how='left')

# Merge the same aggregated features into the testing data
# Ensures that the test data has the same feature enhancements as the training data
test = test.merge(agg_features, on='UserID', how='left')

# Fill missing values in the test data
# Missing values may occur if some 'UserID's in the test data do not have corresponding records in the training data
for col in ['UserID_mean', 'UserID_min', 'UserID_max']:
    test[col].fillna(train[col].mean(), inplace=True)  # Use the mean of the train data as a neutral imputation value


In [75]:
train.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,UserID_encoded,UserID_targetencoded,UserID_mean,UserID_min,UserID_max
0,p1,User Initiated,2022-08-18 22:55:27,Career,RECREATIONGROUND_PLAYGROUND,...,0.0,726.05,724.32,486.0,876.0
1,p1,System Initiated,2022-08-18 23:38:31,Career,RECREATIONGROUND_PLAYGROUND,...,0.0,726.05,724.32,486.0,876.0
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,...,0.0,726.05,724.32,486.0,876.0
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,...,0.0,726.05,724.32,486.0,876.0
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,...,0.0,726.05,724.32,486.0,876.0


In [76]:
test.head()

Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,...,UserID_encoded,UserID_targetencoded,UserID_mean,UserID_min,UserID_max
0,p1,System Initiated,2022-08-28 15:50:22,,,...,0.0,726.05,724.32,486.0,876.0
1,p1,User Initiated,2022-08-28 16:05:02,Career,RECREATIONGROUND_SKATEPARK,...,0.0,726.05,724.32,486.0,876.0
2,p1,User Initiated,2022-09-07 03:31:50,,,...,0.0,726.05,724.32,486.0,876.0
3,p1,System Initiated,2022-09-08 01:30:05,,,...,0.0,726.05,724.32,486.0,876.0
4,p1,System Initiated,2022-09-08 01:43:45,Career,RECREATIONGROUND_SKATEPARK,...,0.0,726.05,724.32,486.0,876.0


 ### **Dropping features with low importance**

In [79]:
# List of columns to be dropped from the dataset
columns_to_drop = [
    'TimeUtc', 'QuestionTiming', 'CurrentGameMode',
    'CurrentTask', 'LastTaskCompleted',
    'LastTaskCompleted_Aggregated', 'QuestionType', 'UserID',
]

# Removing specified columns from the training dataset
# This is done to clean up the data by eliminating irrelevant or redundant features
for col in columns_to_drop:
    if col in train.columns:
        train.drop(columns=col, inplace=True)

# Repeat the process for the testing dataset
# Ensuring that both datasets have the same structure for consistent model training and evaluation
for col in columns_to_drop:
    if col in test.columns:
        test.drop(columns=col, inplace=True)


In [80]:
train.head()

Unnamed: 0,CurrentSessionLength,LevelProgressionAmount,ResponseValue,Second,Minute,...,UserID_encoded,UserID_targetencoded,UserID_mean,UserID_min,UserID_max
0,2,0.54,509.0,27,55,...,0.0,726.05,724.32,486.0,876.0
1,0,0.54,653.0,31,38,...,0.0,726.05,724.32,486.0,876.0
2,1,1.0,705.0,24,39,...,0.0,726.05,724.32,486.0,876.0
3,6,0.17,817.0,1,45,...,0.0,726.05,724.32,486.0,876.0
4,13,0.43,810.0,22,51,...,0.0,726.05,724.32,486.0,876.0


In [81]:
test.head()

Unnamed: 0,CurrentSessionLength,LevelProgressionAmount,Second,Minute,Hour,...,UserID_encoded,UserID_targetencoded,UserID_mean,UserID_min,UserID_max
0,0,0.57,22,50,15,...,0.0,726.05,724.32,486.0,876.0
1,14,0.56,2,5,16,...,0.0,726.05,724.32,486.0,876.0
2,0,0.57,50,31,3,...,0.0,726.05,724.32,486.0,876.0
3,0,0.57,5,30,1,...,0.0,726.05,724.32,486.0,876.0
4,13,0.83,45,43,1,...,0.0,726.05,724.32,486.0,876.0


 ### **Random Forest Model Training and Validation**

In [85]:
# Setting the proportion of data to be used for training
sample_size = 0.8

# Preparing feature matrix X by dropping the target variable 'ResponseValue'
X = train.drop(columns=['ResponseValue'])

# Preparing target vector y which is the 'ResponseValue'
y = train['ResponseValue']

# Splitting the data into training and validation sets with 80% training and 20% validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=(1 - sample_size), random_state=42)

# Initialize RandomForestRegressor with manually chosen hyperparameters
rf = RandomForestRegressor(
    n_estimators=300,         # Number of trees in the forest
    max_depth=20,             # Maximum depth of each tree
    min_samples_split=10,     # Minimum number of samples required to split an internal node
    min_samples_leaf=3,       # Minimum number of samples required to be at a leaf node
    random_state=42           # Seed used by the random number generator
)

# Train the RandomForest model on the training data
rf.fit(X_train, y_train)

# Predicting the 'ResponseValue' using the validation set
val_predictions = rf.predict(X_val)

# Calculate the Mean Absolute Error (MAE) to evaluate model performance on the validation set
val_mae = mean_absolute_error(y_val, val_predictions)
print("Mean Absolute Error on Validation Set:", val_mae)

# Calculate feature importances to understand which features have the most impact on predictions
feature_importances = rf.feature_importances_
feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sorting the features by their importance in descending order to identify the most significant features
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
print(feature_importances_df.head(30))


Mean Absolute Error on Validation Set: 84.03088860309724
                   Feature  Importance
17             UserID_mean        0.72
0     CurrentSessionLength        0.04
18              UserID_min        0.04
10             ElapsedTime        0.03
3                   Minute        0.02
2                   Second        0.02
19              UserID_max        0.02
1   LevelProgressionAmount        0.02
5                      Day        0.02
15          UserID_encoded        0.01
16    UserID_targetencoded        0.01
4                     Hour        0.01
11                Hour_sin        0.01
12                Hour_cos        0.01
13           DayOfWeek_sin        0.01
6                DayOfWeek        0.01
14           DayOfWeek_cos        0.00
8                    Month        0.00
9                  Quarter        0.00
7                IsWeekend        0.00


 ### **Generating and Packaging Model Predictions**

In [86]:
# Making predictions using the RandomForest model on the test dataset
test_predictions = rf.predict(test)

# Save the predictions to a CSV file without headers or row indices
# This is often required in submission formats for machine learning competitions
pd.DataFrame(test_predictions).to_csv('predicted.csv', index=False, header=False)

# Loading the predictions to check the number of rows and ensure all data has been correctly predicted
pred = pd.read_csv('predicted.csv')
print("Number of predictions made:", pred.shape[0])

# Creating a zip file to package the predictions, commonly used for handling file submissions or storage efficiency
with ZipFile('predictions.zip', 'w') as zipf:
    zipf.write('predicted.csv', arcname='predicted.csv')


Number of predictions made: 60831
