In [198]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Import data

In [199]:
df = pd.read_csv("./data/boston.csv")

Input features in order:
1) CRIM: per capita crime rate by town
2) ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3) INDUS: proportion of non-retail business acres per town
4) CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise)
5) NOX: nitric oxides concentration (parts per 10 million) [parts/10M]
6) RM: average number of rooms per dwelling
7) AGE: proportion of owner-occupied units built prior to 1940
8) DIS: weighted distances to five Boston employment centres
9) RAD: index of accessibility to radial highways
10) TAX: full-value property-tax rate per $10,000 [$/10k]
11) PTRATIO: pupil-teacher ratio by town
12) B: The result of the equation B=1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13) LSTAT: % lower status of the population

Output variable:
1) MEDV: Median value of owner-occupied homes in $1000's [k$]



In [200]:
# get familiar with the data, check the shape, the first 5 rows, df.describe(), df.info()
df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [201]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
530,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
531,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
532,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [202]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,534.0,533.0,534.0,534.0,532.0,531.0,533.0,531.0,531.0,533.0,533.0,531.0,533.0,533.0
mean,3.71089,10.9606,11.190187,0.067416,0.555249,6.276755,68.982176,3.769165,9.766478,410.801126,18.483114,356.759435,12.76728,697.230582
std,8.636097,22.852036,6.813075,0.250976,0.115924,0.703093,28.00878,2.093166,8.803199,169.406011,2.143362,91.114487,7.141796,11031.53634
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,-27.1
25%,0.082757,0.0,5.19,0.0,0.449,5.888,45.6,2.0754,4.0,280.0,17.4,375.27,7.18,16.5
50%,0.25651,0.0,9.69,0.0,0.538,6.216,77.3,3.1827,5.0,330.0,19.1,391.34,11.5,21.0
75%,3.846485,12.5,18.1,0.0,0.631,6.605,94.3,5.1167,24.0,666.0,20.2,396.235,17.11,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,190000.5


In [203]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     534 non-null    float64
 1   ZN       533 non-null    float64
 2   INDUS    534 non-null    float64
 3   CHAS     534 non-null    int64  
 4   NOX      532 non-null    float64
 5   RM       531 non-null    float64
 6   AGE      533 non-null    float64
 7   DIS      531 non-null    float64
 8   RAD      531 non-null    float64
 9   TAX      533 non-null    float64
 10  PTRATIO  533 non-null    float64
 11  B        531 non-null    float64
 12  LSTAT    533 non-null    float64
 13  MEDV     533 non-null    float64
dtypes: float64(13), int64(1)
memory usage: 58.5 KB


# Data cleaning

## Remove NaN values

In [204]:
df.dropna()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
530,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
531,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
532,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


# Handle outliers

In [205]:
# Did you notice anything weird in df.describe()? If not, check out MEDV's max value, the min value and the mean value. 
# What's going on? Is this a problem? How can you fix it?
df = df[df["MEDV"]<10000]
df = df[df["MEDV"]>0]

In [206]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
530,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
531,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
532,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [207]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,524.0,523.0,524.0,524.0,522.0,521.0,523.0,521.0,521.0,523.0,523.0,521.0,523.0,524.0
mean,3.69386,11.122371,11.087385,0.068702,0.553848,6.281161,68.627533,3.778596,9.681382,408.323136,18.471128,356.758925,12.669522,22.425191
std,8.688638,23.029182,6.786055,0.253189,0.116042,0.706679,28.109514,2.099807,8.760829,168.476213,2.143168,90.717018,7.125014,9.165006
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082155,0.0,5.175,0.0,0.448,5.888,45.05,2.0788,4.0,279.0,17.4,374.71,7.12,16.775
50%,0.252775,0.0,9.69,0.0,0.535,6.219,76.7,3.2157,5.0,330.0,19.1,391.34,11.38,21.15
75%,3.790445,12.5,18.1,0.0,0.624,6.618,94.2,5.1167,24.0,666.0,20.2,396.21,16.95,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


Vi ser at det er negativ verdi på mean medv. 
Sette en øvre og nedre grense på verdiene (mellom 0 og 100)

## Remove duplicates

In [208]:
# HINT: there is a function in pandas made just for this purpose
# HINT 2: https://google.gprivate.com/search.php?search?q=pandas+remove+duplicates
df = df.drop_duplicates()


In [209]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 0 to 533
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     497 non-null    float64
 1   ZN       496 non-null    float64
 2   INDUS    497 non-null    float64
 3   CHAS     497 non-null    int64  
 4   NOX      495 non-null    float64
 5   RM       494 non-null    float64
 6   AGE      496 non-null    float64
 7   DIS      494 non-null    float64
 8   RAD      494 non-null    float64
 9   TAX      496 non-null    float64
 10  PTRATIO  496 non-null    float64
 11  B        494 non-null    float64
 12  LSTAT    496 non-null    float64
 13  MEDV     497 non-null    float64
dtypes: float64(13), int64(1)
memory usage: 58.2 KB


# Model creation

In [210]:
# Create the XGBoost regression model. XGBoost stands for: eXtreme Gradient Boosting. 
# This is a very popular algorithm, used in machine learning competitions and in the industry. 
# We will use it for regression, but it can also be used for classification.

model = xgb.XGBRegressor()

In [211]:
# let y be the target column, and X be the rest of the df
X = df.drop(columns="MEDV")
y = df["MEDV"]

In [212]:
# Split the data into train and test sets with the function train_test_split from sklearn. Use test_size=0.2 and random_state=42
# We use train_test_split to split the data into train and test sets. We will use the train set to train the model, and the test set to evaluate the model.
# The reason we need a test set is to be able to evaluate the model. If we train the model on the whole dataset, 
# it will learn the dataset perfectly, but we will not know how it performs on unseen data. 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [213]:
X_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
60,0.15445,25.0,5.13,0,0.453,6.145,29.2,7.8148,8.0,284.0,19.7,390.68,6.86
240,0.53700,0.0,6.20,0,0.504,5.981,68.1,3.6715,8.0,307.0,17.4,378.35,11.65
296,0.01501,90.0,1.21,1,0.401,7.923,24.8,5.8850,1.0,198.0,13.6,395.52,3.16
520,0.17331,0.0,9.69,0,0.585,5.707,54.0,2.3817,6.0,391.0,19.2,396.90,12.01
233,0.61470,0.0,6.20,0,0.507,6.618,80.8,3.2721,8.0,307.0,17.4,396.90,7.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,0.12802,0.0,8.56,0,0.520,6.474,97.1,2.4329,5.0,384.0,20.9,395.24,12.27
285,0.22188,20.0,6.96,1,0.464,7.691,51.8,4.3665,3.0,223.0,18.6,390.77,6.58
367,0.07244,60.0,1.69,0,0.411,5.884,18.5,10.7103,4.0,411.0,18.3,392.33,7.79
465,22.05110,0.0,18.10,0,0.740,5.818,92.4,1.8662,24.0,666.0,20.2,391.45,22.11


In [214]:
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
519,0.11132,0.0,27.74,0,0.609,5.983,83.5,2.1099,4.0,711.0,20.1,396.90,13.35
78,0.09512,0.0,12.83,0,0.437,6.286,45.0,4.5026,5.0,398.0,18.7,383.23,8.94
244,0.44791,0.0,6.20,1,0.507,6.726,66.5,3.6519,8.0,307.0,17.4,360.20,8.05
188,0.06642,0.0,4.05,0,0.510,,74.4,2.9153,5.0,296.0,16.6,391.27,6.92
251,0.11329,30.0,4.93,0,0.428,6.897,54.3,6.3361,6.0,300.0,16.6,391.25,11.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,0.14030,22.0,5.86,0,0.431,6.487,13.0,7.3967,7.0,330.0,19.1,396.28,5.90
403,22.59710,0.0,18.10,0,0.700,5.000,89.5,1.5184,24.0,666.0,20.2,396.90,31.99
23,0.85204,0.0,8.14,0,0.538,5.965,89.2,4.0123,4.0,307.0,21.0,392.53,13.83
339,0.19186,0.0,7.38,0,0.493,6.431,14.7,5.4159,5.0,287.0,19.6,393.68,5.08


In [216]:
# use the training set (X_train, y_train) to train the model by calling the .fit() method
model.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [217]:
# Use the model to predict the target values for the test set (X_test)
preds = model.predict(X_test)

In [218]:
# find the mean squared error for the predictions (a value to see the value of the predictions, lower is better)
# find the error between the y_test and the preds
mse = mean_squared_error(y_test, preds)

In [219]:
# print the mse to see how much, on average, your model is off (squared)
mse

13.924668493742802

# Hyperparameter tuning

In [None]:
# These are some of the hyperparameters you can tune for XGBoost. 
# A hyperparameter is a parameter that is not learned by the model, but is set by the user.
# The parameters that are learned by the model are called model parameters.
# The model starts off with some default values for the hyperparameters, but you can change them to get potentially better results.
# This process is called hyperparameter tuning.

# If you want, you can adjust the hyperparameters and see if you can get a better result. You can also add more hyperparameters to the dictionary.
# List of hyperparameters: https://xgboost.readthedocs.io/en/latest/parameter.html
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    "n_estimators": [100, 200, 300, 400, 500, 900, 1100, 1500],
}

In [None]:
# Use RandomizedSearchCV to find the best hyperparameters for the model. There are other ways to do this, but random search will work for this purpose.
# Random search is a method for hyperparameter tuning that will try a given number of random combinations of hyperparameters.
# Use the training set (X_train, y_train) to instantiate the random search by calling the .fit() method with the test set
# HINT: n_iter is the number of iterations to run the random search, if this number is too high, it will take a long time to run, 
# but if it's too low, it will not find the best hyperparameters. You should try to find a happy medium.

# First, create a new, similar model, but with the default hyperparameters. Do not fit this model with the training set.
model2 = 

random_search = RandomizedSearchCV(?, param_distributions=params, n_iter=?, scoring="neg_mean_squared_error", n_jobs=-1, cv=5)

# Fit the model with x and y train sets
random_search.fit(?, ?)

In [None]:
# Retrieve the best model/estimator from the random search
model_new = ?

In [None]:
# Create new predictions with the new model
preds = 

In [None]:
# Get the new mean square error
mse_new = ?

mse_new

In [None]:
print(f"relation between better error on the new model and the old error: {(mse_new / mse)}")

# If the new model did not perform better, this means that the default hyperparameters were better, but it is highly likely that even better ones exist.
# You can try to run the random search again, but with more iterations, or you can try to use GridSearchCV instead of RandomizedSearchCV ot test _every_ combination of hyperparameters.
# You can also edit the hyperparameters in the dictionary to see if you can get better results.