# Exercise 1: Prediction Models

You will practice the basic steps to fit and to use a machine learning model.

In [7]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

In [8]:
# Load the feature matrices and class labels for the train/test split.
# header=None: the first row of each CSV is read as data, not as column names.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_name, header=None)
    for csv_name in (
        "ex1_train.csv",
        "ex1_test.csv",
        "ex1_class_train.csv",
        "ex1_class_test.csv",
    )
)

In [13]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1.145637,-0.283479,-0.116762,0.485669,0.534493,2.285692,-0.405029,1.332325,2.173076,2.455118,...,0.012032,0.469294,3.101961,0.917121,1.18877,0.992547,0.584527,1.404714,1.097889,0.897418
1,0.44947,1.300194,-0.031868,1.902767,-0.915609,1.004107,1.140719,1.076203,2.173076,1.142071,...,1.655192,-0.302136,0.0,1.007296,1.035132,0.985755,0.520742,0.669645,0.833358,0.807968
2,1.11178,-1.114274,-0.291543,0.550086,0.932272,0.820526,1.069423,-0.58093,2.173076,1.268855,...,0.357653,-1.254923,0.0,1.127114,0.897337,0.98637,1.547303,0.855611,1.087128,1.044221
3,0.760769,-0.043883,0.343321,0.553254,0.896668,0.553399,-1.143741,0.636584,2.173076,0.985447,...,0.913979,-1.732147,0.0,0.78277,1.435926,0.986289,1.03735,0.974693,1.17235,0.996075
4,0.702023,0.105134,-0.798689,0.499557,-0.42987,1.200513,0.640653,1.02132,2.173076,0.736661,...,0.413452,1.499211,0.0,0.849873,0.859822,0.9959,1.166605,1.07825,0.849756,0.783117


In [19]:
# (rows, columns) of each dataset, in the order: X_train, X_test, y_train, y_test.
tuple(frame.shape for frame in (X_train, X_test, y_train, y_test))

((8800, 28), (2200, 28), (8800, 1), (2200, 1))

# Part 1: Default XGBoost Classifier

**TODO: Fit the model and predict for test data in the following cell**

In [10]:
# 1) create an XGBoost classifier instance (library default hyperparameters)
xgb_clf = xgb.XGBClassifier()

# 2) fit the classifier using X_train and y_train
# fit() trains the estimator in place and returns that same estimator, so the
# result is not bound to a second name; xgb_clf itself is the fitted model.
xgb_clf.fit(X_train, y_train)

# 3) make prediction over X_test. The prediction output should be named y_pred_default
y_pred_default = xgb_clf.predict(X_test)

In [11]:
# Score the default model's test-set predictions with all four metrics.
# The metric order here must match the order of the names being unpacked.
accuracy_default, precision_default, recall_default, f1_default = (
    metric(y_test, y_pred_default)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric rounded to four decimal places, one per line.
print(
    "Default Model Performance:",
    f"Accuracy: {accuracy_default:.4f}",
    f"Precision: {precision_default:.4f}",
    f"Recall: {recall_default:.4f}",
    f"F1 Score: {f1_default:.4f}",
    sep="\n",
)

Default Model Performance:
Accuracy: 0.7032
Precision: 0.7216
Recall: 0.7186
F1 Score: 0.7201


You should achieve an F1 score > 0.65 to pass Part 1.

# Part 2: Hyperparameter Tuning with Cross-Validation

In [12]:
# Candidate hyperparameter values for the search: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
)

**TODO: Find the best hyperparameters and use them to fit an improved classifier in the following cell**

In [20]:
# 1) use GridSearchCV to find the best hyperparameters
# The search object gets its own name. Rebinding `GridSearchCV` would shadow the
# imported class, so running this cell a second time would fail with
# "TypeError: 'GridSearchCV' object is not callable".
# No `scoring` is given, so candidates are ranked by the estimator's default
# score (accuracy for classifiers) using 5-fold cross-validation.
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# 2) fit an XGBoost classifier using the best hyperparameters
# With the default refit=True, the search has already refitted the best
# parameter combination on the full training set; that model is best_estimator_.
xgb_clf_tuned = grid_search.best_estimator_

# 3) make prediction over X_test. The prediction output should be named y_pred_tuned
y_pred_tuned = xgb_clf_tuned.predict(X_test)

In [21]:
# Score the tuned model's test-set predictions with all four metrics.
# The metric order here must match the order of the names being unpacked.
accuracy_tuned, precision_tuned, recall_tuned, f1_tuned = (
    metric(y_test, y_pred_tuned)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric rounded to four decimal places, one per line.
print(
    "Tuned Model Performance:",
    f"Accuracy: {accuracy_tuned:.4f}",
    f"Precision: {precision_tuned:.4f}",
    f"Recall: {recall_tuned:.4f}",
    f"F1 Score: {f1_tuned:.4f}",
    sep="\n",
)

# Analysis: a positive difference means tuning improved on the default model's F1.
f1_gain = f1_tuned - f1_default
print(f"Improvement in F1 Score: {f1_gain:.4f}")

Tuned Model Performance:
Accuracy: 0.7186
Precision: 0.7276
Recall: 0.7519
F1 Score: 0.7396
Improvement in F1 Score: 0.0195


To pass Part 2, your new F1 score should be higher than both 0.65 and the F1 score from Part 1.