In [1]:
# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Lists' manipulations
import itertools

# Gaussianity
from statsmodels.graphics.gofplots import qqplot    # Gaussianity

# Stats
from scipy.stats import skew,kurtosis,zscore

# Machine Learning - Preprocessing the Dataset
from sklearn.preprocessing import RobustScaler      # Scaling Numerical Features
from sklearn.preprocessing import OneHotEncoder     # Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder     # Encoding the Target


# Machine Learning - "Workflow"
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn import set_config; set_config(display = "diagram")

# Machine Learning - Tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import time

# Classification Metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Machine Learning Classifiers

## Classics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## Decision Trees
from sklearn.tree import DecisionTreeClassifier

## Random Forests
from sklearn.ensemble import RandomForestClassifier

## Bootstrap Aggregating
from sklearn.ensemble import BaggingClassifier

## Adaboost
from sklearn.ensemble import AdaBoostClassifier

## Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

## Extreme Gradient Tree Boosting
from xgboost import XGBClassifier

In [2]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# "magic commands" to enable autoreload of your imported packages
%load_ext autoreload
%autoreload 2

In [4]:
train = pd.read_excel('/Users/florianlanger/code/florentiino/challenge_repo/01-predict_book_price/Participants_Data/Data_Train.xlsx')
train.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93
2,Leviathan (Penguin Classics),Thomas Hobbes,"Paperback,– 25 Feb 1982",4.8 out of 5 stars,6 customer reviews,"""During the time men live without a common Pow...",International Relations,Humour,299.0
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,"Paperback,– 5 Oct 2017",4.1 out of 5 stars,13 customer reviews,A handful of grain is found in the pocket of a...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",180.0
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,"Hardcover,– 10 Oct 2006",5.0 out of 5 stars,1 customer review,"For seven decades, ""Life"" has been thrilling t...",Photography Textbooks,"Arts, Film & Photography",965.62


In [5]:
test = pd.read_excel('/Users/florianlanger/code/florentiino/challenge_repo/01-predict_book_price/Participants_Data/Data_Test.xlsx')
train.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93
2,Leviathan (Penguin Classics),Thomas Hobbes,"Paperback,– 25 Feb 1982",4.8 out of 5 stars,6 customer reviews,"""During the time men live without a common Pow...",International Relations,Humour,299.0
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,"Paperback,– 5 Oct 2017",4.1 out of 5 stars,13 customer reviews,A handful of grain is found in the pocket of a...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",180.0
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,"Hardcover,– 10 Oct 2006",5.0 out of 5 stars,1 customer review,"For seven decades, ""Life"" has been thrilling t...",Photography Textbooks,"Arts, Film & Photography",965.62


In [6]:
#having numerical values for reviews
train['Reviews'] = train['Reviews'].apply(lambda x: float(x.split()[0]))
test['Reviews'] = test['Reviews'].apply(lambda x: float(x.split()[0]))

In [7]:
# having numerical values for ratings
train['Ratings'] = train['Ratings'].apply(lambda x: (x.split()[0].replace(',',''))).astype(int)
test['Ratings'] = test['Ratings'].apply(lambda x: (x.split()[0].replace(',',''))).astype(int)

In [8]:
def lowercase(df,cols):
    for col in cols:
        df[col] = df[col].str.lower()

In [9]:
# standardize categorical features 
cols = ['Title','Author','Edition','Synopsis','Genre','BookCategory']

#train dataset
lowercase(train,cols)
#teset dataset
lowercase(test,cols)

In [10]:
# making genre more clean by removing "books" and "textbooks"
train.Genre = train.Genre.apply(lambda x: x.strip('(books)').strip('Textbooks'))
test.Genre = test.Genre.apply(lambda x: x.strip('(books)').strip('Textbooks'))

In [11]:
# 'Extracting' year of publish from the Edition column
train['Edition_Year'] = train['Edition'].apply(lambda x: x.split()[-1] if x.split()[-1].isdigit() else 'na')
test['Edition_Year'] = test['Edition'].apply(lambda x: x.split()[-1] if x.split()[-1].isdigit() else 'na')

In [12]:
train.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,Edition_Year
0,the prisoner's gold (the hunters 3),chris kuzneski,"paperback,– 10 mar 2016",4.0,8,the hunters return in their third brilliant no...,action & adventure,action & adventure,220.0,2016
1,guru dutt: a tragedy in three acts,arun khopkar,"paperback,– 7 nov 2012",3.9,14,a layered portrait of a troubled genius for wh...,cinema & broadcast,"biographies, diaries & true accounts",202.93,2012


In [13]:
# Numerical Pipeline

num_transformer = Pipeline([
    ("num_imputer", SimpleImputer(strategy = "median")),
    ("rb_scaler", RobustScaler())
])

num_transformer

In [14]:
# to check params uncomment below
#num_transformer.__dir__()

In [15]:
# Categorical Pipeline

cat_multi_transformer = Pipeline([
    ("multi_imputer", SimpleImputer(strategy = "most_frequent")),
    ("ohe", OneHotEncoder(sparse = False,
                         handle_unknown = "ignore"))
])

cat_multi_transformer

In [16]:
# cat_multi_transformer.__dir__()

In [17]:
# Binary Categorical Pipeline

cat_binary_transformer = Pipeline([
    ("binary_imputer", SimpleImputer(strategy = "most_frequent")),
    ("ohe_binary", OneHotEncoder(sparse = False,
                                drop = "if_binary",
                                handle_unknown = "ignore"))
])

cat_binary_transformer

In [18]:
# cat_binary_transformer.__dir__()

In [19]:
train.dtypes

Title            object
Author           object
Edition          object
Reviews         float64
Ratings           int64
Synopsis         object
Genre            object
BookCategory     object
Price           float64
Edition_Year     object
dtype: object

In [20]:
y = train.Price
y.value_counts()

299.00     108
399.00      85
449.00      59
295.00      49
319.00      48
          ... 
259.35       1
1138.00      1
129.65       1
294.25       1
2729.00      1
Name: Price, Length: 1614, dtype: int64

In [21]:
X = train.drop(columns = ["Price"])
X.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Edition_Year
0,the prisoner's gold (the hunters 3),chris kuzneski,"paperback,– 10 mar 2016",4.0,8,the hunters return in their third brilliant no...,action & adventure,action & adventure,2016
1,guru dutt: a tragedy in three acts,arun khopkar,"paperback,– 7 nov 2012",3.9,14,a layered portrait of a troubled genius for wh...,cinema & broadcast,"biographies, diaries & true accounts",2012
2,leviathan (penguin classics),thomas hobbes,"paperback,– 25 feb 1982",4.8,6,"""during the time men live without a common pow...",international relation,humour,1982
3,a pocket full of rye (miss marple),agatha christie,"paperback,– 5 oct 2017",4.1,13,a handful of grain is found in the pocket of a...,contemporary fiction,"crime, thriller & mystery",2017
4,life 70 years of extraordinary photography,editors of life,"hardcover,– 10 oct 2006",5.0,1,"for seven decades, ""life"" has been thrilling t...",photography,"arts, film & photography",2006


In [22]:
X_num = X.select_dtypes(exclude = ["object"])
X_num.dtypes

Reviews    float64
Ratings      int64
dtype: object

In [23]:
X_cat = X.select_dtypes(include = ["object"])
X_cat.dtypes

Title           object
Author          object
Edition         object
Synopsis        object
Genre           object
BookCategory    object
Edition_Year    object
dtype: object

In [24]:
ohe_selection = pd.DataFrame(X_cat.nunique()).reset_index()
ohe_selection.columns = ["features", "unique_values"]
ohe_selection

Unnamed: 0,features,unique_values
0,Title,5564
1,Author,3670
2,Edition,3370
3,Synopsis,5548
4,Genre,341
5,BookCategory,11
6,Edition_Year,57


In [25]:
ohe_features = ohe_selection.query("unique_values > 2").reset_index(drop=True)["features"][0]
ohe_features

'Title'

In [26]:
ohe_features = 'BookCategory'

In [27]:
ohe = OneHotEncoder(sparse = False)
ohe.fit(X_cat[[ohe_features]])

In [28]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include = ["float64", "int64"])),
    ('cat_multi_transformer', cat_multi_transformer, [ohe_features])
])

preprocessor

In [29]:
#scaling numerical features

rb_scaler = RobustScaler()
X_num_scaled = pd.DataFrame(rb_scaler.fit_transform(X_num.copy()), # copy only values, not references
                            columns = X_num.columns) 
X_num_scaled.head()

Unnamed: 0,Reviews,Ratings
0,-0.5,0.05
1,-0.625,0.35
2,0.5,-0.05
3,-0.375,0.3
4,0.75,-0.3


In [30]:
# scaled numerical features
X_num_scaled.shape

(6237, 2)

⚠️ When we use Pipeline, we lose the `get_feature_names()`... 

👉 Let's not use the SimpleImputer for the moment in our transformer:

In [31]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include = ["float64", "int64"])),
    ('cat_multi_transformer', OneHotEncoder(sparse = False,
                         handle_unknown = "ignore"), [ohe_features])
])

preprocessor

*Let's transform the dataset using these Pipelines and ColumnTransformer !*

In [32]:
X_transformed = pd.DataFrame(preprocessor.fit_transform(X))
X_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.500,0.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.625,0.35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.500,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.375,0.30,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.750,-0.30,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6232,0.750,-0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6233,-1.375,0.10,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6234,-0.750,-0.20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6235,-1.125,-0.15,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Getting the features' names...

✅ Actually, we could fit any ML with this transformed dataset and use for example `feature_permutation` to detect which column/feature is important or not...

❗️ But without the features' names, we are losing some interpretability...

🤔 So, how can we collect them ? 

> Investigating the different transformers in the ColumnTransformer...

In [33]:
preprocessor

In [34]:
preprocessor.transformers_

[('num_transformer',
  Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median')),
                  ('rb_scaler', RobustScaler())]),
  ['Reviews', 'Ratings']),
 ('cat_multi_transformer',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['BookCategory']),
 ('remainder', 'drop', [0, 1, 2, 5, 6, 8])]

In [35]:
type(preprocessor.transformers_)

list

In [36]:
len(preprocessor.transformers_)

3

In [37]:
# numerical transformed features 

preprocessor.transformers_[0]

('num_transformer',
 Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median')),
                 ('rb_scaler', RobustScaler())]),
 ['Reviews', 'Ratings'])

In [38]:
## numerical columns
num_columns = preprocessor.transformers_[0][-1]
num_columns

['Reviews', 'Ratings']

In [39]:
ohe_columns = preprocessor.transformers_[1][1].get_feature_names_out()
ohe_columns

array(['BookCategory_action & adventure',
       'BookCategory_arts, film & photography',
       'BookCategory_biographies, diaries & true accounts',
       'BookCategory_comics & mangas',
       'BookCategory_computing, internet & digital media',
       'BookCategory_crime, thriller & mystery', 'BookCategory_humour',
       'BookCategory_language, linguistics & writing',
       'BookCategory_politics', 'BookCategory_romance',
       'BookCategory_sports'], dtype=object)

In [40]:
#### Aggregating the different lists of features' names

In [41]:
num_columns, ohe_columns

(['Reviews', 'Ratings'],
 array(['BookCategory_action & adventure',
        'BookCategory_arts, film & photography',
        'BookCategory_biographies, diaries & true accounts',
        'BookCategory_comics & mangas',
        'BookCategory_computing, internet & digital media',
        'BookCategory_crime, thriller & mystery', 'BookCategory_humour',
        'BookCategory_language, linguistics & writing',
        'BookCategory_politics', 'BookCategory_romance',
        'BookCategory_sports'], dtype=object))

In [42]:
itertools.chain(num_columns, ohe_columns)

<itertools.chain at 0x12aec3ee0>

In [43]:
features_names_after = list(itertools.chain(num_columns, ohe_columns))
features_names_after

['Reviews',
 'Ratings',
 'BookCategory_action & adventure',
 'BookCategory_arts, film & photography',
 'BookCategory_biographies, diaries & true accounts',
 'BookCategory_comics & mangas',
 'BookCategory_computing, internet & digital media',
 'BookCategory_crime, thriller & mystery',
 'BookCategory_humour',
 'BookCategory_language, linguistics & writing',
 'BookCategory_politics',
 'BookCategory_romance',
 'BookCategory_sports']

In [44]:
X_transformed.columns = features_names_after

In [45]:
X_transformed.head()

Unnamed: 0,Reviews,Ratings,BookCategory_action & adventure,"BookCategory_arts, film & photography","BookCategory_biographies, diaries & true accounts",BookCategory_comics & mangas,"BookCategory_computing, internet & digital media","BookCategory_crime, thriller & mystery",BookCategory_humour,"BookCategory_language, linguistics & writing",BookCategory_politics,BookCategory_romance,BookCategory_sports
0,-0.5,0.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.625,0.35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.375,0.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.75,-0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
## `Label Encoder` the target

In [47]:
y.value_counts(normalize = False) # normalize is set to False by default

299.00     108
399.00      85
449.00      59
295.00      49
319.00      48
          ... 
259.35       1
1138.00      1
129.65       1
294.25       1
2729.00      1
Name: Price, Length: 1614, dtype: int64

In [48]:
y.value_counts(normalize = True)

299.00     0.017316
399.00     0.013628
449.00     0.009460
295.00     0.007856
319.00     0.007696
             ...   
259.35     0.000160
1138.00    0.000160
129.65     0.000160
294.25     0.000160
2729.00    0.000160
Name: Price, Length: 1614, dtype: float64

In [49]:
label_encoder = LabelEncoder()

target = pd.DataFrame(label_encoder.fit_transform(y), columns = ["target"])
target.sample(10)

Unnamed: 0,target
457,223
3145,323
6142,281
4085,166
1904,528
3950,100
125,526
2965,244
1185,1302
127,1432


In [50]:
target.value_counts(normalize = False) # normalize is set to False by default

target
297       108
457        85
526        59
291        49
326        48
         ... 
920         1
921         1
922         1
923         1
1613        1
Length: 1614, dtype: int64

In [51]:
target.value_counts(normalize = True)

target
297       0.017316
457       0.013628
526       0.009460
291       0.007856
326       0.007696
            ...   
920       0.000160
921       0.000160
922       0.000160
923       0.000160
1613      0.000160
Length: 1614, dtype: float64

In [52]:
X_transformed.head()

Unnamed: 0,Reviews,Ratings,BookCategory_action & adventure,"BookCategory_arts, film & photography","BookCategory_biographies, diaries & true accounts",BookCategory_comics & mangas,"BookCategory_computing, internet & digital media","BookCategory_crime, thriller & mystery",BookCategory_humour,"BookCategory_language, linguistics & writing",BookCategory_politics,BookCategory_romance,BookCategory_sports
0,-0.5,0.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.625,0.35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.375,0.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.75,-0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
## Modelling


In [54]:
### Reshuffling the entire dataset

In [55]:
df = train.copy()
df["price"] = target

In [56]:
df_reshuffled = df.sample(len(df))
df_reshuffled

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,Edition_Year,price
2900,"school essays, letters, paragraphs, comprehens...",madan sood,"paperback,– 1 apr 2019",3.6,5,this book encompasses a wide range of essays p...,children's language learning,"language, linguistics & writing",64.0,2019,23
4731,a summer promise,katie flynn,"paperback,– 18 jun 2015",5.0,1,from the bestselling author katie flynn. growi...,action & adventure,action & adventure,125.0,2015,78
3684,the best we could do: an illustrated memoir,thi bui,"paperback,– 17 apr 2018",5.0,4,an aba indies introduce winter/ spring 2017 se...,iographies & autobiographies,comics & mangas,967.0,2018,1061
1270,dior perfume (memoire),christine dell'amore,"hardcover,– 30 may 2013",1.0,1,the pinnacle of french haute couture and savoi...,"arts history, theory & criticism",humour,1490.0,2013,1330
3429,good night stories for rebel girls 2,francesca cavallo,"hardcover,– 10 sep 2018",4.2,30,"100 new bedtime stories, each inspired by the ...",children's reference,"biographies, diaries & true accounts",560.0,2018,683
...,...,...,...,...,...,...,...,...,...,...,...
3926,one hundred names,cecelia ahern,"paperback,– import, 18 jul 2013",4.4,17,the uplifting and captivating new novel from t...,action & adventure,action & adventure,249.0,2013,230
4429,captain underpants and the terrifying return o...,dav pilkey,"paperback,– import, 28 aug 2012",3.9,15,captain underpants returns! in their ninth epi...,comics & mangas,humour,205.0,2012,172
148,the deadly dozen: india's most notorious seria...,anirban bhattacharya,"paperback,– 10 jun 2019",5.0,8,a schoolteacher who killed multiple paramours ...,rue accounts,"crime, thriller & mystery",209.0,2019,177
78,south by java head,alistair maclean,"paperback,– special edition, 6 may 2008",4.4,7,the 50th anniversary edition of this classic w...,action & adventure,"crime, thriller & mystery",309.0,2008,312


In [57]:
## holdout

In [58]:
y = df_reshuffled["price"]
X = df_reshuffled.drop(columns = "price")

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [60]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4365, 10), (1872, 10), (4365,), (1872,))

In [61]:
round(y.value_counts(normalize = True),2)

297     0.02
457     0.01
526     0.01
291     0.01
326     0.01
        ... 
1583    0.00
1168    0.00
982     0.00
567     0.00
1406    0.00
Name: price, Length: 1614, dtype: float64

In [62]:
round(y_train.value_counts(normalize = True),2)

297     0.02
457     0.01
526     0.01
326     0.01
291     0.01
        ... 
1496    0.00
501     0.00
1329    0.00
754     0.00
1296    0.00
Name: price, Length: 1355, dtype: float64

In [63]:
round(y_test.value_counts(normalize = True),2)

297     0.02
457     0.01
164     0.01
312     0.01
526     0.01
        ... 
1498    0.00
1261    0.00
303     0.00
73      0.00
547     0.00
Name: price, Length: 854, dtype: float64

**Chaining model to pipeline**

In [64]:
from sklearn.linear_model import Ridge

# Combine preprocessor and Ridge regressor in pipeline
pipe_baseline = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression', Ridge())])
pipe_baseline

In [65]:
from sklearn.metrics import mean_squared_log_error, make_scorer

def root_mean_squared_log_error(y_true, y_pred):
    t = np.array(y_true)
    p = np.array(y_pred)
    log_error = np.log(1+t) - np.log(1+p)
    return ((log_error**2).mean())**0.5

# Minimize this one 
rmsle = make_scorer(root_mean_squared_log_error)

# Maximize this one
rmsle_neg = make_scorer(lambda y_true, y_pred: -1 * root_mean_squared_log_error(y_true, y_pred))

In [66]:
pipe_baseline.fit(X_train, np.array(y_train).ravel())

In [67]:
predictions = pipe_baseline.predict(X_test)
predictions

array([444.43714833, 865.68202134, 348.35738677, ..., 400.45524457,
       685.60976207, 457.42738003])

In [70]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

score_baseline = cross_validate(pipe_baseline, X, y, cv=5, scoring = rmsle, n_jobs = -1)["test_score"].mean()
score_baseline

nan

In [75]:
cv_results = cross_validate(pipe_baseline,
                           X_train,
                           y_train,
                           cv = 5,
                           n_jobs = -1,
                           scoring = "r2")

In [77]:
cv_results['test_score'].mean()

0.600283582323131

In [71]:
pipe_baseline.fit(X,y)
y_pred_baseline = pipe_baseline.predict(X_test)
y_pred_baseline

array([420.080638  , 861.3489948 , 347.26341234, ..., 403.54718813,
       688.56140793, 461.73116224])

> **Chosing a Model**

In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [82]:
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()
dtr = DecisionTreeRegressor()

In [83]:
competitors_names = ["rdforest",
                     "gradBoost",
                     "Tree"]

In [84]:
competitors = [rfr,gbr,dtr]

In [103]:
from sklearn.metrics import accuracy_score,mean_squared_error,r2_score

In [104]:
%%time

models = []
train_mse = []
test_mse = []
train_r2 = []
test_r2 = []
computational_times = []

for model in competitors:
    
    pipelined_model = make_pipeline(
        preprocessor,
        model
    )
    start_time = time.time()
    
    pipelined_model.fit(X_train, np.array(y_train).ravel())

    pipelined_model_predictions_train = pipelined_model.predict(X_train)
    pipelined_model_mse_score_train = mean_squared_error(np.array(y_train).ravel(),pipelined_model_predictions_train)
    pipelined_model_r2_score_train = r2_score(np.array(y_train).ravel(),pipelined_model_predictions_train)
    
    pipelined_model_predictions_test = pipelined_model.predict(X_test)
    pipelined_model_mse_score_test = mean_squared_error(np.array(y_test).ravel(),pipelined_model_predictions_test)
    pipelined_model_r2_score_test = r2_score(np.array(y_test).ravel(),pipelined_model_predictions_test)
    
    end_time = time.time()
    computational_time = end_time - start_time
    
    models.append(model)
    train_mse.append(pipelined_model_mse_score_train)
    test_mse.append(pipelined_model_mse_score_test)
    train_r2.append(pipelined_model_mse_score_train)
    test_r2.append(pipelined_model_mse_score_test)
    computational_times.append(computational_time)

CPU times: user 1.37 s, sys: 55.8 ms, total: 1.43 s
Wall time: 1.68 s


In [114]:
test_mse

[0.27140176282051137, 7.676267766426039, 0.5048076923076923]

In [122]:
pd.Series(train_mse)

0    0.039190
1    6.990613
2    0.000000
dtype: float64

In [116]:
computational_times

[1.0513601303100586, 0.5753262042999268, 0.05078721046447754]

In [132]:
summary_df = pd.concat([pd.Series(competitors_names),
                      pd.Series(test_mse),
                       pd.Series(train_mse),
                       pd.Series(test_r2),
                       pd.Series(train_r2),
                       pd.Series(computational_times)], axis =1)

summary_df.columns = ["competitors","test_mse",'train_mse','test_r2','train_r2','comp_time']

summary_df

round(summary_df.sort_values(by = "test_r2", ascending = False),2)

Unnamed: 0,competitors,test_mse,train_mse,test_r2,train_r2,comp_time
1,gradBoost,7.68,6.99,7.68,6.99,0.58
2,Tree,0.5,0.0,0.5,0.0,0.05
0,rdforest,0.27,0.04,0.27,0.04,1.05


In [97]:
pipe_gradBoost = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression', GradientBoostingRegressor())])
pipe_gradBoost

In [98]:
pipe_gradBoost.fit(X,y)
y_pred_gradBoost = pipe_gradBoost.predict(X_test)
y_pred_gradBoost

array([ 160.16836774, 1133.18757075,  202.91428414, ...,  399.38391909,
        989.84379652,  540.18511569])

In [99]:
pipe_Tree = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression',DecisionTreeRegressor())
])

In [100]:
pipe_Tree

In [101]:
pipe_Tree.fit(X,y)
y_pred_Tree = pipe_Tree.predict(X_test)
y_pred_Tree

array([ 158., 1134.,  203., ...,  399.,  990.,  547.])