# 771948_A23_T3A - Group Work Assignment #
# Part 1 - Numerical and categorical feature classification problem #
## Assignment by Chris Mintz 202369825 and Antonia Agunbiade 202375309 ##

# Task 1 - Load and preprocess the dataset for the classification problem (handle missing data, convert categorical features to numerical features) #

In [35]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

df = pd.read_excel('dataset1.xlsx')

df.head(20)

Unnamed: 0,target,var1,var2,var3,var4,var5,var6,var7
0,0,509.18,417.681,Micronesia,138.0,393.0,no,2019-07-20 13:21:37
1,0,446.06,666.182,Dominica,81.0,352.05,yes,2019-04-04 21:30:46
2,1,235.5,398.097,Isle of Man,90.0,339.0,no,2019-03-03 02:59:37
3,0,306.02,518.163,Turkmenistan,102.0,439.25,yes,2019-03-19 08:00:58
4,0,453.08,600.156,Cameroon,105.0,422.95,no,2019-03-18 13:22:35
5,1,211.72,506.716,Liechtenstein,111.0,310.6,no,2019-03-18 13:00:12
6,0,401.42,627.294,French Guiana,78.0,390.05,no,2019-03-28 02:29:19
7,0,498.9,525.207,Barbados,129.0,408.75,yes,2019-06-07 05:41:16
8,1,257.9,651.209,Netherlands,147.0,280.2,no,2019-02-07 08:02:31
9,1,283.04,467.801,Chad,69.0,272.35,yes,2019-03-26 19:37:46


In [36]:
# have a look at the data types
df.info()

# quick look at the data
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925 entries, 0 to 924
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  925 non-null    int64  
 1   var1    925 non-null    float64
 2   var2    925 non-null    float64
 3   var3    925 non-null    object 
 4   var4    325 non-null    float64
 5   var5    925 non-null    float64
 6   var6    925 non-null    object 
 7   var7    925 non-null    object 
dtypes: float64(4), int64(1), object(3)
memory usage: 57.9+ KB


Unnamed: 0,target,var1,var2,var4,var5
count,925.0,925.0,925.0,325.0,925.0
mean,0.496216,360.116562,548.390134,108.452308,325.393946
std,0.500256,87.866662,135.22146,26.325744,78.862779
min,0.0,209.56,139.965,57.0,163.0
25%,0.0,278.64,467.373,87.0,257.9
50%,0.0,367.64,569.841,105.0,342.35
75%,1.0,437.6,652.278,126.0,393.0
max,1.0,539.92,794.848,180.0,457.15


### <font color='yellow'>About this dataset</font> ###
#### var1, var2, var3, var4, var5, var6, var7 columns are features ####
#### var1, var2, var4, var5 are numerical values ####
#### var3, var6 columns are categorical values #### 
#### var 7 is a datetime ####
#### target column is the label ####

### We have null values in the data so lets enumerate them ####

In [37]:
# convert the date to proper datetime. Using coerce because there are errors in the dates
df.isnull().sum()

target      0
var1        0
var2        0
var3        0
var4      600
var5        0
var6        0
var7        0
dtype: int64

In [38]:
# some investigation shows 5 cells with an illegal datetime in them. Specifically, 5 cells have a date of 2019-02-29 which is not a valid date
def fix_not_leap_year(bad_date):
    if '2019-02-29' in str(bad_date):
        bad_date = str(bad_date).replace('2019-02-29', '2019-02-28')
        print(bad_date)
        return bad_date
    else:
        return bad_date

df['var7'] = df['var7'].apply(fix_not_leap_year)


2019-02-28 12:31:57
2019-02-28 18:06:21
2019-02-28 11:00:06
2019-02-28 23:56:06
2019-02-28 19:26:35


In [39]:
# because the the dataset is not a linear regression problem, we'll convert the datetime to features with a custom transformer
# this will allow us to extract the day, month, year and time as separate features.
# define a custom transformer function to extract datetime features
def extract_datetime_features(dates):
    dates = pd.to_datetime(dates, infer_datetime_format=True, errors='coerce')
    return pd.DataFrame({
        'year': dates.dt.year,
        'month': dates.dt.month,
        'day': dates.dt.day,
        'dayofweek': dates.dt.dayofweek,
        'hour': dates.dt.hour
    })

In [40]:
# test the datetime transformer function
date_df = extract_datetime_features(df['var7'])
print(date_df)

     year  month  day  dayofweek  hour
0    2019      7   20          5    13
1    2019      4    4          3    21
2    2019      3    3          6     2
3    2019      3   19          1     8
4    2019      3   18          0    13
..    ...    ...  ...        ...   ...
920  2019      1   22          1    12
921  2019      2    1          4    14
922  2019      6   13          3    18
923  2019      5   27          0     6
924  2019      6   18          1    17

[925 rows x 5 columns]


  dates = pd.to_datetime(dates, infer_datetime_format=True, errors='coerce')


In [41]:
# DOCS When trying to deal with the datetime, we decided to convert the datetime to numerical features before putting it into the transform/fit process.
# This is because this is not a linear regression problem but a classification problem and we want to extract the day, month, year and time as separate features.
df_combined = pd.concat([df, date_df], axis=1)
print(df_combined)

     target    var1     var2          var3   var4    var5 var6  \
0         0  509.18  417.681    Micronesia  138.0  393.00   no   
1         0  446.06  666.182      Dominica   81.0  352.05  yes   
2         1  235.50  398.097   Isle of Man   90.0  339.00   no   
3         0  306.02  518.163  Turkmenistan  102.0  439.25  yes   
4         0  453.08  600.156      Cameroon  105.0  422.95   no   
..      ...     ...      ...           ...    ...     ...  ...   
920       0  422.34  547.259       Belarus    NaN  350.45  yes   
921       1  342.62  473.919         Japan    NaN  200.85   no   
922       1  265.10  538.170  Saint Martin    NaN  208.35  yes   
923       0  397.12  622.386          Chad    NaN  433.45  yes   
924       1  242.10  420.429       Albania    NaN  369.20  yes   

                    var7  year  month  day  dayofweek  hour  
0    2019-07-20 13:21:37  2019      7   20          5    13  
1    2019-04-04 21:30:46  2019      4    4          3    21  
2    2019-03-03 02:59

In [42]:
# There are too many NA in var4 to simply remove those rows so we will try to impute the missing values #
# We will have to watch the weighting of var4 feature as it will be heavily weighted to the interpolated values #

# TO DO: I don't like the interpolation of the var 4 feature. Hoping to use an algorithm that supports null data or find a regression pattern to better fit OR it's possible this is not used as feature data at all.

# Data normalization #
numerical_features = ['var1', 'var2', 'var4', 'var5', 'year', 'month', 'day', 'dayofweek', 'hour']
categorical_features = ['var3', 'var6']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# the result of this will be a Compressed Sparse Row (CSR) matrix which works with XGBoost and scikit-learn.
df_transformed = preprocessor.fit_transform(df_combined)

# Task 2 - Build a classifier for the classification problem using one of the specified models (logistic regression, decision trees, random forests, or artificial neural networks) #

### 
<b>Refereces</b>

DataCamp. (2023). Learn XGBoost in Python: A Step-by-Step Tutorial. Available at: https://www.datacamp.com/tutorial/xgboost-in-python [Accessed 19 Aug. 2024].

XGBoost Contributors. (2024). XGBoost Parameters — xgboost 2.1.1 documentation. Available at: https://xgboost.readthedocs.io/en/stable/parameter.html [Accessed 19 Aug. 2024].

###

In [59]:
# going to use XGBoost to train a decision tree model
# we will use the transformed data from the preprocessor as the input data
# target variable is the 'target' column from original df dataset
import xgboost as xgb
from sklearn.model_selection import train_test_split

# setup our features and labels
X = df_transformed
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# We're going to use the DMatrix data structure from XGBoost. This is an optimized data structure that works with XGBoost and is optimized for memory and speed
# Create the regression matrices in DMatrix format
dtrain_reg = xgb.DMatrix(X_train, y_train)
dtest_reg = xgb.DMatrix(X_test, y_test)


# Now set parameters for XGBoost
params = {
    'objective': 'binary:logistic',  
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'rmse',
    'tree_method': 'hist'
}

# We'll look at the ideal number of rounds in hyperparameter tuning but for now let's use 100 rounds
n = 100
model = xgb.train(
    params, 
    dtrain_reg, 
    n
    )

# Task 3 - Fine tune the selected model using appropriate techniques like hyperparameter tuning, cross-validation, etc.

In [44]:
# let's look at the model's performance and see if any hyperparameter tuning is needed
from sklearn.metrics import root_mean_squared_error

predictions = model.predict(dtest_reg)
# compare the predictions to the actual values
rmse = root_mean_squared_error(y_test, predictions)
print(f'RMSE of the base model: {rmse: .3f}')

RMSE of the base model:  0.260


In [45]:
# looking to improve on the RMSE by using validation sets
# XGBoost has a built-in cross-validation function that we can use to evaluate the model
# using early_stopping_rounds we can stop the training if the model stops improving
cv_results = xgb.cv(
    params,
    dtrain_reg,
    num_boost_round=200,
    seed=0,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)

print(cv_results)

    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0          0.359950        0.001390        0.373904       0.002923
1          0.261928        0.000928        0.293481       0.006864
2          0.194283        0.002384        0.246405       0.012310
3          0.146789        0.002882        0.220290       0.015217
4          0.113376        0.003406        0.207328       0.018781
5          0.090349        0.002964        0.200462       0.019948
6          0.073828        0.002580        0.196181       0.021589
7          0.062180        0.002946        0.193530       0.022102
8          0.052818        0.003235        0.192888       0.023142
9          0.045744        0.002899        0.192215       0.023588
10         0.040799        0.003040        0.191935       0.023863
11         0.037110        0.003348        0.191977       0.024354
12         0.034093        0.003328        0.192074       0.024766
13         0.031694        0.003509        0.191736       0.02

In [60]:
# pull out the best RMSE from the cv_results
best_rmse = cv_results['test-rmse-mean'].min()
print(f'Best tested RMSE: {best_rmse: .3f}')

Best RMSE:  0.192


In [71]:
# Now use a hyperparameter grid to test different hyperparameters and effectiveness
from sklearn.model_selection import StratifiedKFold, GridSearchCV

param_grid = {
    'learning_rate': [0.1, 0.3, 0.5, 1],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 2, 3, 4, 5],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.7, 1]
}

param_model = xgb.XGBRegressor()

param_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(param_model, param_grid, cv=param_cv, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)

# Print the best hyperparameters
print(f'Best learning rate: {grid_search.best_params_["learning_rate"]}')
print(f'Best max depth: {grid_search.best_params_["max_depth"]}')
print(f'Best min child weight: {grid_search.best_params_["min_child_weight"]}')
print(f'Best number of estimators: {grid_search.best_params_["n_estimators"]}')
print(f'Best gamma: {grid_search.best_params_["gamma"]}')
print(f'Best subsample: {grid_search.best_params_["subsample"]}')


Best learning rate: 0.1
Best max depth: 9
Best min child weight: 5
Best number of estimators: 100
Best gamma: 0.1
Best gamma: 0.5
Best subsample: 0.7


Parameters: { "col_sample_bytree" } are not used.



In [73]:
#now apply the new hyperparameters to the re-fit the model
#note, eta = learning rate
params_tuned = {
    'objective': 'binary:logistic',  
    'max_depth': 9,
    'eta': 0.3,
    'gamma': 0.3,
    'subsample': 0.7,
    'min_child_weight': 5,
    'eval_metric': 'rmse',
    'tree_method': 'hist'
}

evals = [(dtrain_reg, 'train'), (dtest_reg, 'validation')]

model = xgb.train(
    params_tuned,
    dtrain_reg,
    num_boost_round=200,
    evals=evals,
    early_stopping_rounds=10
)

[0]	train-rmse:0.38635	validation-rmse:0.39255
[1]	train-rmse:0.30833	validation-rmse:0.32819
[2]	train-rmse:0.26084	validation-rmse:0.28579
[3]	train-rmse:0.22654	validation-rmse:0.26239
[4]	train-rmse:0.20616	validation-rmse:0.24722
[5]	train-rmse:0.19337	validation-rmse:0.24172
[6]	train-rmse:0.18684	validation-rmse:0.23680
[7]	train-rmse:0.18210	validation-rmse:0.23460
[8]	train-rmse:0.17725	validation-rmse:0.23058
[9]	train-rmse:0.17319	validation-rmse:0.23193
[10]	train-rmse:0.17026	validation-rmse:0.22581
[11]	train-rmse:0.16837	validation-rmse:0.22417
[12]	train-rmse:0.16773	validation-rmse:0.21873
[13]	train-rmse:0.16750	validation-rmse:0.22066
[14]	train-rmse:0.16747	validation-rmse:0.22155
[15]	train-rmse:0.16667	validation-rmse:0.21950
[16]	train-rmse:0.16683	validation-rmse:0.22186
[17]	train-rmse:0.16640	validation-rmse:0.22166
[18]	train-rmse:0.16551	validation-rmse:0.21709
[19]	train-rmse:0.16550	validation-rmse:0.21512
[20]	train-rmse:0.16494	validation-rmse:0.21724
[2

## <b>Observation</b>
The training has early-stopped at less than 200 rounds. It is here that the training loss is low but also takes the slightly fluctuating validation loss into account which we want to watch to ensure we are not getting into overfitting with too many rounds.
## 

In [53]:
# try a very high number of rounds to see if we can get a better RMSE for validation
# if overfitting shows up we'll stick with the original model

n_b = 1000

model_b = xgb.train(
    params_tuned, 
    dtrain_reg, 
    n_b, 
    evals=evals, 
    verbose_eval=50
    )

[0]	train-rmse:0.36328	validation-rmse:0.38229
[50]	train-rmse:0.02804	validation-rmse:0.23320
[100]	train-rmse:0.01314	validation-rmse:0.23394
[150]	train-rmse:0.00644	validation-rmse:0.23437
[200]	train-rmse:0.00355	validation-rmse:0.23438
[250]	train-rmse:0.00185	validation-rmse:0.23444
[300]	train-rmse:0.00107	validation-rmse:0.23445
[350]	train-rmse:0.00068	validation-rmse:0.23449
[400]	train-rmse:0.00059	validation-rmse:0.23450
[450]	train-rmse:0.00057	validation-rmse:0.23449
[500]	train-rmse:0.00056	validation-rmse:0.23449
[550]	train-rmse:0.00055	validation-rmse:0.23450
[600]	train-rmse:0.00053	validation-rmse:0.23450
[650]	train-rmse:0.00053	validation-rmse:0.23450
[700]	train-rmse:0.00053	validation-rmse:0.23449
[750]	train-rmse:0.00053	validation-rmse:0.23449
[800]	train-rmse:0.00053	validation-rmse:0.23449
[850]	train-rmse:0.00053	validation-rmse:0.23449
[900]	train-rmse:0.00053	validation-rmse:0.23449
[950]	train-rmse:0.00053	validation-rmse:0.23449
[999]	train-rmse:0.0005

## <b>Observation</b>
Interesting attempt to improve the model but as was shown in early-stop, there is little validation loss improvement even by the time the model reaches 50 rounds.
##

# Task 4 - Visualise the dataset and the model's results, where applicable like feature importance, confusion matrix, etc

# Task 5 - Report the final performance of the selected model using the appropriate performance metrics like accuracy, F1-score, etc