Student: Arne Cools - IAI3 - 2022/2023

# Model Evaluation & Hyperparameter Tuning
## Part 2 - Model Evaluation & Hyperparameter Tuning

In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.feature_selection import f_regression, SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from IPython.core.display_functions import display
from sklearn.metrics import r2_score

In [2]:
# First we make a method to return the processed dataset
def categorize_price(price):
    if price == 0:
        return "FREE"
    elif 0 < price < 5.0:
        return "CHEAP"
    else:
        return "EXPENSIVE"

def remove_outliers(df,columns,n_std):
    for col in columns:

        mean = df[col].mean()
        sd = df[col].std()

        df = df[(df[col] <= mean+(n_std*sd))]

    return df

def return_dataset():
    apps = pd.read_csv('../googleplaystore.csv')
    apps['Reviews'] = pd.to_numeric(apps['Reviews'], errors='coerce')
    apps['Size'] = apps['Size'].apply(lambda x: float(x.replace('k','')) / 1024 if 'k' in str(x) else x)  # Convert k to M
    apps['Size'] = apps['Size'].apply(lambda x: None if x == 'Varies with device' else x) # Handles "Varies with device" values
    apps['Size'] = apps['Size'].str.replace('M', '').str.replace('k', '').astype(float)
    apps['Installs'] = apps['Installs'].str.replace('+', '').str.replace(',', '').astype('int64')
    apps['Price'] = apps['Price'].str.replace('$', '').astype(float)
    apps['Last Updated'] = pd.to_datetime(apps['Last Updated'])
    apps = apps.drop_duplicates(subset='App', keep='first')
    apps = remove_outliers(apps, ['Price'], 3)
    apps = apps.dropna(subset=['Rating'])
    apps['Size'] = apps['Size'].fillna(0)
    encoder = LabelEncoder()
    apps['Category_Encoded'] = encoder.fit_transform(apps['Category'])
    apps['Content Rating_Encoded'] = encoder.fit_transform(apps['Content Rating'])
    apps['Android Ver_Encoded'] = encoder.fit_transform(apps['Android Ver'])
    current_date = pd.to_datetime('2019-01-01')  # All data is from before 2019
    apps['Time Since Last Update'] = (current_date - apps['Last Updated']).dt.days
    apps['App_Name_Length'] = apps['App'].apply(len)
    apps['Price_Category_Labels'] = apps['Price'].apply(categorize_price)
    apps['Price_Category_Encoded'] = apps['Price_Category_Labels'].map({'FREE': 0, 'CHEAP': 1, 'EXPENSIVE': 2})
    numeric_columns = apps.select_dtypes(include=['number'])
    numeric_columns = numeric_columns.drop(['Price'], axis=1)
    apps = apps[numeric_columns.columns]
    scaler = StandardScaler()
    numeric_columns = ['Reviews', 'Size', 'Installs', 'Category_Encoded', 'Content Rating_Encoded','Android Ver_Encoded', 'Time Since Last Update', 'App_Name_Length', 'Price_Category_Encoded']
    apps[numeric_columns] = scaler.fit_transform(apps[numeric_columns])
    selector = SelectPercentile(score_func=f_regression, percentile=80)
    X = selector.fit_transform(apps.select_dtypes(include=np.number), apps.Rating)
    best_features = selector.get_support(indices=True)
    apps = apps.select_dtypes(include=np.number).iloc[:, best_features]
    return apps



In [3]:
apps = return_dataset()
x = apps.drop('Rating', axis=1)
y = apps.Rating
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
print(f'Training set: {x_train.shape}, {y_train.shape}')
print(f'Test set: {x_test.shape}, {y_test.shape}')

Training set: (5726, 7), (5726,)
Test set: (2454, 7), (2454,)


## 2.1 Model Evaluation

### Linear Regression model

In [60]:
# Initialize and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

print(f'Train Accuracy: {linear_model.score(x_train, y_train):.2f}')
print(f'Holdout Accuracy: {linear_model.score(x_test, y_test):.2f}')

# With a linear regression model I am only getting a 4% training accuracy and 3% testing accuracy

Train Accuracy: 0.04
Holdout Accuracy: 0.03


### K Nearest Neighbors

In [61]:

for i in range(1, 40):
    neighbors_model = KNeighborsRegressor(n_neighbors=i)
    neighbors_model.fit(x_train, y_train)
    # print(f'{i} neighbors - Train Accuracy: {neighbors_model.score(x_train, y_train):.2f}')
    print(f'{i} neighbors - Holdout Accuracy: {neighbors_model.score(x_test, y_test):.2f}')

# As the number of neighbors (n_neighbors) increases, the model becomes less complex and tends to generalize better. This is evident from the decreasing accuracy scores for both the training and test sets. However, the decrease in the holdout R-squared values suggests that the model might be underfitting the data. We continue experimenting with other models

# The bad results are not completely unexpected since k neigherst neighbors isn't fitted for categorical data because the distances have to mean something

1 neighbors - Holdout Accuracy: -0.80
2 neighbors - Holdout Accuracy: -0.33
3 neighbors - Holdout Accuracy: -0.20
4 neighbors - Holdout Accuracy: -0.14
5 neighbors - Holdout Accuracy: -0.11
6 neighbors - Holdout Accuracy: -0.07
7 neighbors - Holdout Accuracy: -0.04
8 neighbors - Holdout Accuracy: -0.02
9 neighbors - Holdout Accuracy: -0.01
10 neighbors - Holdout Accuracy: -0.00
11 neighbors - Holdout Accuracy: 0.01
12 neighbors - Holdout Accuracy: 0.01
13 neighbors - Holdout Accuracy: 0.02
14 neighbors - Holdout Accuracy: 0.02
15 neighbors - Holdout Accuracy: 0.02
16 neighbors - Holdout Accuracy: 0.02
17 neighbors - Holdout Accuracy: 0.03
18 neighbors - Holdout Accuracy: 0.03
19 neighbors - Holdout Accuracy: 0.03
20 neighbors - Holdout Accuracy: 0.03
21 neighbors - Holdout Accuracy: 0.03
22 neighbors - Holdout Accuracy: 0.03
23 neighbors - Holdout Accuracy: 0.04
24 neighbors - Holdout Accuracy: 0.04
25 neighbors - Holdout Accuracy: 0.04
26 neighbors - Holdout Accuracy: 0.04
27 neighbor

### Desicion Tree

In [40]:
leaf_node_sizes = np.logspace(1.0, 4.0, num=100, dtype=int)
for max_leaf_nodes in leaf_node_sizes:
    tree_model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree_model.fit(x_train, y_train)
    #print(f'{max_leaf_nodes} max nodes - Train Accuracy: {tree_model.score(X_train, y_train):.2f}')
    print(f'{max_leaf_nodes} max nodes - Holdout Accuracy: {tree_model.score(x_test, y_test):.2f}')


# With an accuracy of 6% we have a better model than the above, but it's still not good enough to work with.

10 max nodes - Holdout Accuracy: 0.09
10 max nodes - Holdout Accuracy: 0.09
11 max nodes - Holdout Accuracy: 0.09
12 max nodes - Holdout Accuracy: 0.09
13 max nodes - Holdout Accuracy: 0.10
14 max nodes - Holdout Accuracy: 0.10
15 max nodes - Holdout Accuracy: 0.09
16 max nodes - Holdout Accuracy: 0.10
17 max nodes - Holdout Accuracy: 0.10
18 max nodes - Holdout Accuracy: 0.10
20 max nodes - Holdout Accuracy: 0.10
21 max nodes - Holdout Accuracy: 0.09
23 max nodes - Holdout Accuracy: 0.08
24 max nodes - Holdout Accuracy: 0.08
26 max nodes - Holdout Accuracy: 0.09
28 max nodes - Holdout Accuracy: 0.10
30 max nodes - Holdout Accuracy: 0.09
32 max nodes - Holdout Accuracy: 0.09
35 max nodes - Holdout Accuracy: 0.09
37 max nodes - Holdout Accuracy: 0.09
40 max nodes - Holdout Accuracy: 0.07
43 max nodes - Holdout Accuracy: 0.06
46 max nodes - Holdout Accuracy: 0.06
49 max nodes - Holdout Accuracy: 0.05
53 max nodes - Holdout Accuracy: 0.04
57 max nodes - Holdout Accuracy: -0.02
61 max node

### Random Forest
Random forests can handle both categorical and numeric features well. They can capture complex interactions between features and handle nonlinearity. Random forests tend to perform well out of the box and are less prone to overfitting.

In [68]:
for t in range(1,100):
    rf_model = RandomForestRegressor(n_estimators=t)
    rf_model.fit(x_train, y_train)

    y_pred = rf_model.predict(x_test)

    print(f"{t} : {rf_model.score(x_test, y_test)}")

# with an accuracy of 12.5% on 93 trees this is the best model so far

1 : -0.6780580272053938
2 : -0.36559306813578374
3 : -0.12759913440991122
4 : -0.11542030644210177
5 : -0.02115803284364448
6 : 0.007488000008954887
7 : 1.3929689003933099e-05
8 : 0.03689938282229521
9 : 0.013832539539511246
10 : 0.030842792275967423
11 : 0.047245624504731354
12 : 0.04097104781494221
13 : 0.053236051224179515
14 : 0.06488154037867211
15 : 0.04261480891155478
16 : 0.06934529909275311
17 : 0.06487502497763786
18 : 0.05969882256159598
19 : 0.09740667520610025
20 : 0.07452636961296077
21 : 0.08895012566419613
22 : 0.09156929072416542
23 : 0.08068089454132044
24 : 0.06359326253479303
25 : 0.09653188696662096
26 : 0.08383793981628163
27 : 0.10265218989703462
28 : 0.09034149229909938
29 : 0.1002848586596019
30 : 0.09937355178341978
31 : 0.12087350374527217
32 : 0.10609874284962106
33 : 0.11020095938849472
34 : 0.10402216060281189
35 : 0.10944067187518591
36 : 0.10681013559283081
37 : 0.08866291025675255
38 : 0.09576869263888999
39 : 0.10880784465262672
40 : 0.1016456754126050

### Gradient Boosting Regression

In [73]:
gb_regressor = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)

gb_regressor.fit(x_train, y_train)

print(f"{gb_regressor.score(x_test, y_test)}")

# Another better model then before with 15%

0.15084172173541377


### Support Vector Regression

In [57]:
svr_model = SVR(kernel='linear')  # You can try different kernels ('precomputed', 'linear', 'rbf', 'poly', 'sigmoid')
svr_model.fit(x_train, y_train)

print(f"{svr_model.score(x_test, y_test)}")

# After trying several different kernels, 3% was the best accuracy I could get out of it


0.03924091345193914


## 2.1 Model Selection
We will work with the models that gave us somewhat decent results: Random Forest and Gradient Boosting

To make a further selection I will use cross validation

In [9]:
# Cross-Validation
gb_model = GradientBoostingRegressor(n_estimators=100, max_depth=3)
gb_model.fit(x_train, y_train)

rf_model = RandomForestRegressor(n_estimators=93)
rf_model.fit(x_train, y_train)

gb_scores = cross_val_score(gb_model, x, y, cv=6)
rf_scores = cross_val_score(rf_model, x, y, cv=6)
print(f'6-Fold Cross-Validation Accuracy GB Model: {gb_scores.mean():.2f} (+/- {gb_scores.std():.2f})')
print(f'6-Fold Cross-Validation Accuracy RF Model: {rf_scores.mean():.2f} (+/- {rf_scores.std():.2f})')

6-Fold Cross-Validation Accuracy GB Model: 0.13 (+/- 0.03)
6-Fold Cross-Validation Accuracy RF Model: 0.08 (+/- 0.05)


In [10]:
gb_scores = cross_validate(gb_model, x, y, cv=6, return_train_score=True)
pd.DataFrame(gb_scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,1.115609,0.004342,0.069506,0.258293
1,1.023162,0.002735,0.147638,0.253997
2,1.007517,0.003248,0.179756,0.255011
3,1.010035,0.003117,0.109422,0.280144
4,1.048788,0.002744,0.131074,0.27124
5,1.012796,0.003284,0.122572,0.268184


In [11]:
rf_scores = cross_validate(rf_model, x, y, cv=6, return_train_score=True)
pd.DataFrame(rf_scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,7.121891,0.02679,0.016377,0.879056
1,6.450706,0.022077,0.034239,0.877813
2,6.878714,0.026408,0.166155,0.877349
3,6.702561,0.047703,0.038693,0.883215
4,6.524832,0.024896,0.097238,0.878812
5,6.526302,0.026547,0.084213,0.878473


In [12]:
# K-Fold Cross-Validation
gb_model = GradientBoostingRegressor(n_estimators=100, max_depth=3)
gb_model.fit(x_train, y_train)

rf_model = RandomForestRegressor(n_estimators=93)
rf_model.fit(x_train, y_train)

kf = KFold(n_splits=6, shuffle=True)
gb_scores = cross_val_score(gb_model, x, y, cv=kf)
rf_scores = cross_val_score(rf_model, x, y, cv=kf)
print(f'6-Fold Cross-Validation Accuracy GB Model: {gb_scores.mean():.2f} (+/- {gb_scores.std():.2f})')
print(f'6-Fold Cross-Validation Accuracy RF Model: {rf_scores.mean():.2f} (+/- {rf_scores.std():.2f})')

6-Fold Cross-Validation Accuracy GB Model: 0.15 (+/- 0.03)
6-Fold Cross-Validation Accuracy RF Model: 0.13 (+/- 0.03)


In [13]:
gb_scores = cross_validate(gb_model, x, y, cv=6, return_train_score=True)
pd.DataFrame(gb_scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,1.167594,0.005146,0.069506,0.258293
1,1.110326,0.002868,0.147638,0.253997
2,1.334771,0.003204,0.179756,0.255011
3,1.105913,0.002734,0.107601,0.280144
4,1.081942,0.003272,0.131574,0.27124
5,1.050316,0.003351,0.119276,0.268184


In [14]:
rf_scores = cross_validate(rf_model, x, y, cv=6, return_train_score=True)
pd.DataFrame(rf_scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,6.914356,0.025474,0.015448,0.880333
1,6.842549,0.022966,0.022423,0.877533
2,7.039722,0.03602,0.165204,0.879229
3,7.066803,0.024077,0.048889,0.880021
4,6.648202,0.025631,0.112954,0.878893
5,6.340713,0.035141,0.080061,0.878171


Since I am looking for a model that gives me the most accurate predictions, I am mostly looking at the 'test_score' column because this shows how well the model performs on a dataset where it's not trained on.
I will therefore choose the Gradient Boosting Regressor model.

## Hyperparameter Tuning

### Preparations

In [17]:
model = Pipeline([('poly', PolynomialFeatures()),
                  ('gradboost', GradientBoostingRegressor())])
model

In [18]:
#parameters
display(pd.DataFrame(model['poly'].get_params(), index=['poly']))
display(pd.DataFrame(model['gradboost'].get_params(), index=['gradboost']))
pd.set_option('display.max_rows', None)
#hyperparameters
pd.DataFrame([model.get_params().keys(), model.get_params().values()], index=['key', 'value']).T

Unnamed: 0,degree,include_bias,interaction_only,order
poly,2,True,False,C


Unnamed: 0,alpha,ccp_alpha,criterion,init,learning_rate,loss,max_depth,max_features,max_leaf_nodes,min_impurity_decrease,...,min_samples_split,min_weight_fraction_leaf,n_estimators,n_iter_no_change,random_state,subsample,tol,validation_fraction,verbose,warm_start
gradboost,0.9,0.0,friedman_mse,,0.1,squared_error,3,,,0.0,...,2,0.0,100,,,1.0,0.0001,0.1,0,False


Unnamed: 0,key,value
0,memory,
1,steps,"[(poly, PolynomialFeatures()), (gradboost, Gra..."
2,verbose,False
3,poly,PolynomialFeatures()
4,gradboost,GradientBoostingRegressor()
5,poly__degree,2
6,poly__include_bias,True
7,poly__interaction_only,False
8,poly__order,C
9,gradboost__alpha,0.9



Hyperparameters I will tune are: "poly__degree", "gradboost__criterion", "gradboost__max_depth", "gradboost__n_estimators"

poly__degree = determines the degree of the polynomial
gradboost__max_depth = Maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree.
gradboost__n_estimators = The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.

### Random Search

In [30]:
hyperparameters = {
    # 'poly__degree': [3,5],
    'gradboost__max_depth': [1,3,5,10,15,30,50],
    'gradboost__n_estimators':[i * 10 for i in range(1, 12, 2)]
}

randomsearch = RandomizedSearchCV(model, hyperparameters, cv=5, n_iter=5, verbose=1, n_jobs= -1, )
randomsearch.fit(x, y)
display(randomsearch.best_estimator_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [33]:
r2_score(y, randomsearch.best_estimator_.predict(x))

0.31341848958472107

With a Random Search we find that we get the best model with max_depth = 5 and n_estimators = 30 with an r2 score of 31%

### Grid Search

In [45]:
hyperparameters = {
    # 'poly__degree': [3,5],
    'gradboost__max_depth': [1,3,5,10,15,30,50],
    'gradboost__n_estimators':[i * 10 for i in range(1, 12, 2)]
}

gridsearch = GridSearchCV(model, hyperparameters, cv=4, verbose=1, n_jobs=-1)
gridsearch.fit(x, y)
display(gridsearch.best_estimator_)

Fitting 2 folds for each of 42 candidates, totalling 84 fits


In [46]:
r2_score(y, gridsearch.best_estimator_.predict(x))

0.2920558451469526

With a Grid Search we find that we get the best model with max_depth = 5 and n_estimators = 110 with an r2 score of 29%