In [1]:
# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Lists' manipulations
import itertools

# Gaussianity
from statsmodels.graphics.gofplots import qqplot    # Gaussianity

# Stats
from scipy.stats import skew,kurtosis,zscore

# Machine Learning - Preprocessing the Dataset
from sklearn.preprocessing import RobustScaler      # Scaling Numerical Features
from sklearn.preprocessing import OneHotEncoder     # Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder     # Encoding the Target


# Machine Learning - "Workflow"
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn import set_config; set_config(display = "diagram")

# Machine Learning - Tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import time

# Classification Metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Machine Learning Classifiers

## Classics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## Decision Trees
from sklearn.tree import DecisionTreeClassifier

## Random Forests
from sklearn.ensemble import RandomForestClassifier

## Bootstrap Aggregating
from sklearn.ensemble import BaggingClassifier

## Adaboost
from sklearn.ensemble import AdaBoostClassifier

## Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

## Extreme Gradient Tree Boosting
from xgboost import XGBClassifier

In [2]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# "magic commands" to enable autoreload of your imported packages
%load_ext autoreload
%autoreload 2

In [4]:
train = pd.read_excel('/Users/florianlanger/code/florentiino/competitions/01-predict_book_price/Participants_Data/Data_Train.xlsx')
train.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93
2,Leviathan (Penguin Classics),Thomas Hobbes,"Paperback,– 25 Feb 1982",4.8 out of 5 stars,6 customer reviews,"""During the time men live without a common Pow...",International Relations,Humour,299.0
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,"Paperback,– 5 Oct 2017",4.1 out of 5 stars,13 customer reviews,A handful of grain is found in the pocket of a...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",180.0
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,"Hardcover,– 10 Oct 2006",5.0 out of 5 stars,1 customer review,"For seven decades, ""Life"" has been thrilling t...",Photography Textbooks,"Arts, Film & Photography",965.62


In [5]:
test = pd.read_excel('/Users/florianlanger/code/florentiino/competitions/01-predict_book_price/Participants_Data/Data_Test.xlsx')
train.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93
2,Leviathan (Penguin Classics),Thomas Hobbes,"Paperback,– 25 Feb 1982",4.8 out of 5 stars,6 customer reviews,"""During the time men live without a common Pow...",International Relations,Humour,299.0
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,"Paperback,– 5 Oct 2017",4.1 out of 5 stars,13 customer reviews,A handful of grain is found in the pocket of a...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",180.0
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,"Hardcover,– 10 Oct 2006",5.0 out of 5 stars,1 customer review,"For seven decades, ""Life"" has been thrilling t...",Photography Textbooks,"Arts, Film & Photography",965.62


In [7]:
#having numerical values for reviews
train['Reviews'] = train['Reviews'].apply(lambda x: float(x.split()[0]))
test['Reviews'] = test['Reviews'].apply(lambda x: float(x.split()[0]))

In [8]:
# having numerical values for ratings
train['Ratings'] = train['Ratings'].apply(lambda x: (x.split()[0].replace(',',''))).astype(int)
test['Ratings'] = test['Ratings'].apply(lambda x: (x.split()[0].replace(',',''))).astype(int)

In [9]:
def lowercase(df,cols):
    for col in cols:
        df[col] = df[col].str.lower()

In [10]:
# standardize categorical features 
cols = ['Title','Author','Edition','Synopsis','Genre','BookCategory']

#train dataset
lowercase(train,cols)
#teset dataset
lowercase(test,cols)

In [11]:
# making genre more clean by removing "books" and "textbooks"
train.Genre = train.Genre.apply(lambda x: x.strip('(books)').strip('Textbooks'))
test.Genre = test.Genre.apply(lambda x: x.strip('(books)').strip('Textbooks'))

In [12]:
# 'Extracting' year of publish from the Edition column
train['Edition_Year'] = train['Edition'].apply(lambda x: x.split()[-1] if x.split()[-1].isdigit() else 'na')
test['Edition_Year'] = test['Edition'].apply(lambda x: x.split()[-1] if x.split()[-1].isdigit() else 'na')

In [13]:
train.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,Edition_Year
0,the prisoner's gold (the hunters 3),chris kuzneski,"paperback,– 10 mar 2016",4.0,8,the hunters return in their third brilliant no...,action & adventure,action & adventure,220.0,2016
1,guru dutt: a tragedy in three acts,arun khopkar,"paperback,– 7 nov 2012",3.9,14,a layered portrait of a troubled genius for wh...,cinema & broadcast,"biographies, diaries & true accounts",202.93,2012


In [14]:
# Numerical Pipeline

num_transformer = Pipeline([
    ("num_imputer", SimpleImputer(strategy = "median")),
    ("rb_scaler", RobustScaler())
])

num_transformer

In [16]:
# to check params uncomment below
#num_transformer.__dir__()

In [17]:
# Categorical Pipeline

cat_multi_transformer = Pipeline([
    ("multi_imputer", SimpleImputer(strategy = "most_frequent")),
    ("ohe", OneHotEncoder(sparse = False,
                         handle_unknown = "ignore"))
])

cat_multi_transformer

In [18]:
# cat_multi_transformer.__dir__()

In [19]:
# Binary Categorical Pipeline

cat_binary_transformer = Pipeline([
    ("binary_imputer", SimpleImputer(strategy = "most_frequent")),
    ("ohe_binary", OneHotEncoder(sparse = False,
                                drop = "if_binary",
                                handle_unknown = "ignore"))
])

cat_binary_transformer

In [20]:
# cat_binary_transformer.__dir__()

In [21]:
train.dtypes

Title            object
Author           object
Edition          object
Reviews         float64
Ratings           int64
Synopsis         object
Genre            object
BookCategory     object
Price           float64
Edition_Year     object
dtype: object

In [24]:
y = train.Price
y.value_counts()

299.00     108
399.00      85
449.00      59
295.00      49
319.00      48
          ... 
259.35       1
1138.00      1
129.65       1
294.25       1
2729.00      1
Name: Price, Length: 1614, dtype: int64

In [72]:
X = train.drop(columns = ["Price"])
X.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Edition_Year
0,the prisoner's gold (the hunters 3),chris kuzneski,"paperback,– 10 mar 2016",4.0,8,the hunters return in their third brilliant no...,action & adventure,action & adventure,2016
1,guru dutt: a tragedy in three acts,arun khopkar,"paperback,– 7 nov 2012",3.9,14,a layered portrait of a troubled genius for wh...,cinema & broadcast,"biographies, diaries & true accounts",2012
2,leviathan (penguin classics),thomas hobbes,"paperback,– 25 feb 1982",4.8,6,"""during the time men live without a common pow...",international relation,humour,1982
3,a pocket full of rye (miss marple),agatha christie,"paperback,– 5 oct 2017",4.1,13,a handful of grain is found in the pocket of a...,contemporary fiction,"crime, thriller & mystery",2017
4,life 70 years of extraordinary photography,editors of life,"hardcover,– 10 oct 2006",5.0,1,"for seven decades, ""life"" has been thrilling t...",photography,"arts, film & photography",2006


In [73]:
X_num = X.select_dtypes(exclude = ["object"])
X_num.dtypes

Reviews    float64
Ratings      int64
dtype: object

In [74]:
X_cat = X.select_dtypes(include = ["object"])
X_cat.dtypes

Title           object
Author          object
Edition         object
Synopsis        object
Genre           object
BookCategory    object
Edition_Year    object
dtype: object

In [106]:
ohe_selection = pd.DataFrame(X_cat.nunique()).reset_index()
ohe_selection.columns = ["features", "unique_values"]
ohe_selection

Unnamed: 0,features,unique_values
0,Title,5564
1,Author,3670
2,Edition,3370
3,Synopsis,5548
4,Genre,341
5,BookCategory,11
6,Edition_Year,57


In [108]:
ohe_features = ohe_selection.query("unique_values > 2").reset_index(drop=True)["features"][0]
ohe_features

'Title'

In [109]:
ohe_features = 'BookCategory'

In [110]:
ohe = OneHotEncoder(sparse = False)
ohe.fit(X_cat[[ohe_features]])

In [111]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include = ["float64", "int64"])),
    ('cat_multi_transformer', cat_multi_transformer, [ohe_features])
])

preprocessor

In [112]:
#scaling numerical features

rb_scaler = RobustScaler()
X_num_scaled = pd.DataFrame(rb_scaler.fit_transform(X_num.copy()), # copy only values, not references
                            columns = X_num.columns) 
X_num_scaled.head()

Unnamed: 0,Reviews,Ratings
0,-0.5,0.05
1,-0.625,0.35
2,0.5,-0.05
3,-0.375,0.3
4,0.75,-0.3


In [113]:
# scaled numerical features
X_num_scaled.shape

(6237, 2)

⚠️ When we use Pipeline, we lose the `get_feature_names()`... 

👉 Let's not use the SimpleImputer for the moment in our transformer:

In [114]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include = ["float64", "int64"])),
    ('cat_multi_transformer', OneHotEncoder(sparse = False,
                         handle_unknown = "ignore"), [ohe_features])
])

preprocessor

*Let's transform the dataset using these Pipelines and ColumnTransformer !*

In [115]:
X_transformed = pd.DataFrame(preprocessor.fit_transform(X))
X_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.500,0.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.625,0.35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.500,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.375,0.30,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.750,-0.30,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6232,0.750,-0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6233,-1.375,0.10,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6234,-0.750,-0.20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6235,-1.125,-0.15,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Getting the features' names...

✅ Actually, we could fit any ML with this transformed dataset and use for example `feature_permutation` to detect which column/feature is important or not...

❗️ But without the features' names, we are losing some interpretability...

🤔 So, how can we collect them ? 

> Investigating the different transformers in the ColumnTransformer...

In [116]:
preprocessor

In [117]:
preprocessor.transformers_

[('num_transformer',
  Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median')),
                  ('rb_scaler', RobustScaler())]),
  ['Reviews', 'Ratings']),
 ('cat_multi_transformer',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['BookCategory']),
 ('remainder', 'drop', [0, 1, 2, 5, 6, 8])]

In [118]:
type(preprocessor.transformers_)

list

In [119]:
len(preprocessor.transformers_)

3

In [120]:
# numerical transformed features 

preprocessor.transformers_[0]

('num_transformer',
 Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median')),
                 ('rb_scaler', RobustScaler())]),
 ['Reviews', 'Ratings'])

In [121]:
## numerical columns
num_columns = preprocessor.transformers_[0][-1]
num_columns

['Reviews', 'Ratings']

In [122]:
ohe_columns = preprocessor.transformers_[1][1].get_feature_names_out()
ohe_columns

array(['BookCategory_action & adventure',
       'BookCategory_arts, film & photography',
       'BookCategory_biographies, diaries & true accounts',
       'BookCategory_comics & mangas',
       'BookCategory_computing, internet & digital media',
       'BookCategory_crime, thriller & mystery', 'BookCategory_humour',
       'BookCategory_language, linguistics & writing',
       'BookCategory_politics', 'BookCategory_romance',
       'BookCategory_sports'], dtype=object)

In [133]:
#### Aggregating the different lists of features' names

In [123]:
num_columns, ohe_columns

(['Reviews', 'Ratings'],
 array(['BookCategory_action & adventure',
        'BookCategory_arts, film & photography',
        'BookCategory_biographies, diaries & true accounts',
        'BookCategory_comics & mangas',
        'BookCategory_computing, internet & digital media',
        'BookCategory_crime, thriller & mystery', 'BookCategory_humour',
        'BookCategory_language, linguistics & writing',
        'BookCategory_politics', 'BookCategory_romance',
        'BookCategory_sports'], dtype=object))

In [124]:
itertools.chain(num_columns, ohe_columns)

<itertools.chain at 0x1367cbf40>

In [125]:
features_names_after = list(itertools.chain(num_columns, ohe_columns))
features_names_after

['Reviews',
 'Ratings',
 'BookCategory_action & adventure',
 'BookCategory_arts, film & photography',
 'BookCategory_biographies, diaries & true accounts',
 'BookCategory_comics & mangas',
 'BookCategory_computing, internet & digital media',
 'BookCategory_crime, thriller & mystery',
 'BookCategory_humour',
 'BookCategory_language, linguistics & writing',
 'BookCategory_politics',
 'BookCategory_romance',
 'BookCategory_sports']

In [126]:
X_transformed.columns = features_names_after

In [127]:
X_transformed.head()

Unnamed: 0,Reviews,Ratings,BookCategory_action & adventure,"BookCategory_arts, film & photography","BookCategory_biographies, diaries & true accounts",BookCategory_comics & mangas,"BookCategory_computing, internet & digital media","BookCategory_crime, thriller & mystery",BookCategory_humour,"BookCategory_language, linguistics & writing",BookCategory_politics,BookCategory_romance,BookCategory_sports
0,-0.5,0.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.625,0.35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.375,0.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.75,-0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
## `Label Encoder` the target

In [135]:
y.value_counts(normalize = False) # normalize is set to False by default

299.00     108
399.00      85
449.00      59
295.00      49
319.00      48
          ... 
259.35       1
1138.00      1
129.65       1
294.25       1
2729.00      1
Name: Price, Length: 1614, dtype: int64

In [136]:
y.value_counts(normalize = True)

299.00     0.017316
399.00     0.013628
449.00     0.009460
295.00     0.007856
319.00     0.007696
             ...   
259.35     0.000160
1138.00    0.000160
129.65     0.000160
294.25     0.000160
2729.00    0.000160
Name: Price, Length: 1614, dtype: float64

In [137]:
label_encoder = LabelEncoder()

target = pd.DataFrame(label_encoder.fit_transform(y), columns = ["target"])
target.sample(10)

Unnamed: 0,target
1249,531
5419,521
1126,457
4991,638
3112,502
1050,865
1298,490
2710,141
1713,148
2376,251


In [138]:
target.value_counts(normalize = False) # normalize is set to False by default

target
297       108
457        85
526        59
291        49
326        48
         ... 
920         1
921         1
922         1
923         1
1613        1
Length: 1614, dtype: int64

In [139]:
target.value_counts(normalize = True)

target
297       0.017316
457       0.013628
526       0.009460
291       0.007856
326       0.007696
            ...   
920       0.000160
921       0.000160
922       0.000160
923       0.000160
1613      0.000160
Length: 1614, dtype: float64

In [142]:
X_transformed.head()

Unnamed: 0,Reviews,Ratings,BookCategory_action & adventure,"BookCategory_arts, film & photography","BookCategory_biographies, diaries & true accounts",BookCategory_comics & mangas,"BookCategory_computing, internet & digital media","BookCategory_crime, thriller & mystery",BookCategory_humour,"BookCategory_language, linguistics & writing",BookCategory_politics,BookCategory_romance,BookCategory_sports
0,-0.5,0.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.625,0.35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.375,0.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.75,-0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [143]:
## Modelling


In [144]:
### Reshuffling the entire dataset

In [145]:
df = train.copy()
df["price"] = target

In [146]:
df_reshuffled = df.sample(len(df))
df_reshuffled

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,Edition_Year,price
3179,asterix and the class act: album 32,albert uderzo,"paperback,– 18 nov 2004",3.1,6,vintage asterx!\n14 new stories including tale...,manga,comics & mangas,449.0,2004,526
3263,ronin: the deluxe edition,"frank miller, frank miler","hardcover,– 28 oct 2014",4.0,1,"ronin is the acclaimed epic by frank miller, t...",comics & mangas,comics & mangas,590.0,2014,722
3995,brazen: rebel ladies who rocked the world,pénélope bagieu,"hardcover,– 8 mar 2018",5.0,3,'pénélope bagieu … is a kind of genius. this b...,comics & mangas,humour,591.0,2018,723
3971,krishna gopeshvara: the truth of vrishnis (boo...,sanjay dixit,"paperback,– 29 may 2018",4.5,45,a never-before action packed retelling of lord...,action & adventure,action & adventure,239.0,2018,217
1548,harry potter hogwarts castle and sticker book ...,running press,"paperback,– import, 2 oct 2018",5.0,1,a one-of-a-kind miniature replica of the hogwa...,film & television,"arts, film & photography",749.0,2018,888
...,...,...,...,...,...,...,...,...,...,...,...
5565,man vs ocean: one man's journey to swim the se...,adam walker,"paperback,– import, 5 jan 2017",5.0,1,"in 2007, adam, then a toaster salesman, was in...",iographies & autobiographies,sports,713.0,2017,856
3200,my princely colleges,harkishan lal dutt,"paperback,– import, 22 may 2019",5.0,1,has your life ever been a proverbial bed of ro...,iographies & autobiographies,"biographies, diaries & true accounts",450.0,2019,531
3859,guinness world records: gamer's edition 2019,guinness world records,"paperback,– 28 aug 2018",2.0,1,the guinness world records gamer’s edition 201...,"children's games, toys & activities","computing, internet & digital media",848.0,2018,968
5879,my journey: transforming dreams into actions,a.p.j. abdul kalam,"paperback,– 27 aug 2013",4.7,615,"the book, ‘my journey: transforming dreams int...",asian history,"biographies, diaries & true accounts",167.0,2013,125


In [147]:
## holdout

In [148]:
y = df_reshuffled["price"]
X = df_reshuffled.drop(columns = "price")

In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [150]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4365, 10), (1872, 10), (4365,), (1872,))

In [151]:
round(y.value_counts(normalize = True),2)

297    0.02
457    0.01
526    0.01
291    0.01
326    0.01
       ... 
954    0.00
577    0.00
416    0.00
932    0.00
968    0.00
Name: price, Length: 1614, dtype: float64

In [152]:
round(y_train.value_counts(normalize = True),2)

297     0.02
457     0.01
197     0.01
526     0.01
326     0.01
        ... 
1518    0.00
1136    0.00
1318    0.00
1270    0.00
914     0.00
Name: price, Length: 1352, dtype: float64

In [153]:
round(y_test.value_counts(normalize = True),2)

297     0.02
457     0.01
526     0.01
609     0.01
312     0.01
        ... 
1250    0.00
1082    0.00
467     0.00
235     0.00
1018    0.00
Name: price, Length: 846, dtype: float64

**Chaining model to pipeline**

In [158]:
from sklearn.linear_model import Ridge

# Combine preprocessor and Ridge regressor in pipeline
pipe_baseline = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression', Ridge())])
pipe_baseline

In [169]:
from sklearn.metrics import mean_squared_log_error, make_scorer

def root_mean_squared_log_error(y_true, y_pred):
    t = np.array(y_true)
    p = np.array(y_pred)
    log_error = np.log(1+t) - np.log(1+p)
    return ((log_error**2).mean())**0.5

# Minimize this one 
rmsle = make_scorer(root_mean_squared_log_error)

# Maximize this one
rmsle_neg = make_scorer(lambda y_true, y_pred: -1 * root_mean_squared_log_error(y_true, y_pred))

In [178]:
pipe_baseline.fit(X_train, np.array(y_train).ravel())

In [179]:
predictions = pipe_baseline.predict(X_test)
predictions

array([961.66962951, 221.80992114, 420.49625184, ..., 457.96892993,
       370.81402847, 609.71168855])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "/Users/florianlanger/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/florianlanger/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/Users/florianlanger/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/