# Modelos

## 1. Regresión Lineal

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from scipy.stats import uniform

from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error

from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVC, SVR, LinearSVR

from sklearn.naive_bayes import GaussianNB

# Do not truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Load the dataset from a CSV file given by a path relative to the working directory.
df = pd.read_csv('Wine_reviews_climate_prediction.csv')

In [3]:
# Summarize the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70030 entries, 0 to 70029
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      70030 non-null  int64  
 1   country         70030 non-null  object 
 2   description     70030 non-null  object 
 3   points          70030 non-null  int64  
 4   price           70030 non-null  float64
 5   taster_name     70030 non-null  object 
 6   variety         70030 non-null  object 
 7   winery          70030 non-null  object 
 8   Year            70030 non-null  int64  
 9   region          70030 non-null  object 
 10  Latitude        70030 non-null  float64
 11  Longitude       70030 non-null  float64
 12  Lat_x           70030 non-null  float64
 13  Long_x          70030 non-null  float64
 14  temp_anual      70030 non-null  float64
 15  temp_max_anual  70030 non-null  float64
 16  temp_min_anual  70030 non-null  float64
 17  pre_anual       70030 non-null 

In [4]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0        0
country           0
description       0
points            0
price             0
taster_name       0
variety           0
winery            0
Year              0
region            0
Latitude          0
Longitude         0
Lat_x             0
Long_x            0
temp_anual        0
temp_max_anual    0
temp_min_anual    0
pre_anual         0
etp_anual         0
dtype: int64

In [5]:
# Drop rows whose price is missing (NaN). Note that dropna() only removes null
# entries; rows where price equals 0 are NOT removed by this call.
# Re-assigning (instead of inplace=True) keeps the cell free of hidden mutation.
df = df.dropna(subset=['price'])
# Re-check the per-column count of missing values after the drop.
df.isnull().sum()

Unnamed: 0        0
country           0
description       0
points            0
price             0
taster_name       0
variety           0
winery            0
Year              0
region            0
Latitude          0
Longitude         0
Lat_x             0
Long_x            0
temp_anual        0
temp_max_anual    0
temp_min_anual    0
pre_anual         0
etp_anual         0
dtype: int64

In [6]:
# Remove the columns that are not used as model inputs, including the
# 'Unnamed: 0' column that came in from the CSV.
# Re-assigning (instead of inplace=True) avoids mutating a frame that earlier
# cells already displayed.
df = df.drop(columns=['description', 'variety', 'winery', 'Latitude', 'Longitude', 'region', 'Unnamed: 0'])
df

Unnamed: 0,country,points,price,taster_name,Year,Lat_x,Long_x,temp_anual,temp_max_anual,temp_min_anual,pre_anual,etp_anual
0,Portugal,87,15.0,Roger Voss,2011,41.75,-5.75,13.01,19.44,6.61,388.5,1200
1,Portugal,87,15.0,Roger Voss,2011,41.75,-5.75,13.01,19.44,6.61,388.5,1200
2,Portugal,87,17.0,Roger Voss,2011,41.75,-5.75,13.01,19.44,6.61,388.5,1200
3,Portugal,91,12.0,Roger Voss,2011,41.75,-5.75,13.01,19.44,6.61,388.5,1200
4,Portugal,87,8.0,Roger Voss,2011,41.75,-5.75,13.01,19.44,6.61,388.5,1200
...,...,...,...,...,...,...,...,...,...,...,...,...
70025,USA,84,25.0,Matt Kettmann,2012,34.75,-118.25,13.96,20.23,7.75,325.0,1176
70026,Switzerland,90,21.0,Jeff Jenssen,2012,46.75,6.75,9.15,13.46,4.87,1382.5,696
70027,France,89,14.0,Roger Voss,2012,45.25,6.25,5.61,9.20,2.05,1386.2,714
70028,France,89,18.0,Roger Voss,2012,45.75,5.75,10.71,15.25,6.22,976.2,807


In [7]:
# One-hot encode the two categorical columns; every category gets its own 0/1 column.
# NOTE(review): keeping a dummy for every category makes each dummy group sum to 1,
# which is collinear with a model intercept. That may matter for the unregularized
# LinearRegression fitted later - consider drop_first=True and confirm the effect.
df = pd.get_dummies(df, columns=['country', 'taster_name'])

In [8]:
# Work on an independent copy: a plain `data = df` only creates a second name
# for the same object, so any later in-place change to df would silently
# change the modelling data as well.
data = df.copy()

In [9]:
# Inspect the frame again after one-hot encoding (column names, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70030 entries, 0 to 70029
Data columns (total 70 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   points                          70030 non-null  int64  
 1   price                           70030 non-null  float64
 2   Year                            70030 non-null  int64  
 3   Lat_x                           70030 non-null  float64
 4   Long_x                          70030 non-null  float64
 5   temp_anual                      70030 non-null  float64
 6   temp_max_anual                  70030 non-null  float64
 7   temp_min_anual                  70030 non-null  float64
 8   pre_anual                       70030 non-null  float64
 9   etp_anual                       70030 non-null  int64  
 10  country_Argentina               70030 non-null  uint8  
 11  country_Australia               70030 non-null  uint8  
 12  country_Austria                 

In [10]:
# Hold out 20% of the rows for testing. The inputs are every column except the
# target 'points'; random_state fixes the shuffle so the split is repeatable.
# train_test_split returns (X_train, X_test, y_train, y_test); the training
# parts are simply named X and y here.
X, X_test, y, y_test = train_test_split(
    data.drop(columns=['points']),
    data['points'],
    test_size=0.2,
    random_state=42,
)

In [11]:
%%time
#Pipeline: standardize the features, expand them with polynomial terms
#(include_bias=False leaves out the constant column of ones) and fit a Ridge regression.
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ridge', Ridge())
])

#Search space. Keys follow the "<step name>__<parameter name>" convention; a value is
#either a distribution to sample from or a list of candidate values.
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'poly__degree' : [1, 2]}
#Randomized search: n_iter candidate settings are sampled, each scored with 5-fold CV.
#random_state pins the sampled candidates so that a re-run reproduces the same search.
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5, random_state=42)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 4min 49s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('poly',
                                              PolynomialFeatures(include_bias=False)),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'poly__degree': [1, 2],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BF44E6F100>})

In [12]:
# Hyper-parameter setting selected by the polynomial + Ridge search.
ridge_CV.best_params_

{'poly__degree': 2, 'ridge__alpha': 99.32233104670432}

In [13]:
# Score of the selected pipeline on the training split (R^2 for a Ridge final step).
ridge_CV.score(X, y)

0.3825548667084677

In [14]:
# Score of the selected pipeline on the held-out test split (R^2 for a Ridge final step).
ridge_CV.score(X_test, y_test)

0.3787943726233015

In [15]:
# Mean absolute error on the training split.
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y, ridge_CV.predict(X))

1.9608449459539445

In [16]:
# Mean absolute error on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y_test, ridge_CV.predict(X_test))

1.9834436640957853

In [17]:
%%time
#Now we try a Tree Regressor.
#It is named tree_reg so that it does not shadow the `tree` module imported from
#sklearn at the top of the notebook; random_state makes the fitted tree repeatable.
tree_reg = DecisionTreeRegressor(random_state=42)

#Grid of candidate values. There is no pipeline here, so the keys are plain
#parameter names of the estimator.
param_grid = {'max_depth' : [10, 11, 12, 13, 14, 15], 'min_samples_leaf' : [47, 48, 49, 50, 51, 52, 53]}

#Exhaustive search: every combination in the grid is scored with cross-validation.
tree_CV = GridSearchCV(tree_reg, param_grid=param_grid)
#Now we train the model
tree_CV.fit(X, y)

Wall time: 57.3 s


GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15],
                         'min_samples_leaf': [47, 48, 49, 50, 51, 52, 53]})

In [18]:
# Report the selected hyper-parameters plus the score and MAE of the tree search
# on the training and test splits.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("The best Parameters are :", tree_CV.best_params_)
print("The best X,y score is: ", tree_CV.score(X, y))
print("The best X,y test score is: ", tree_CV.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(y, tree_CV.predict(X)))
print("The MAE test X,y is: ", mean_absolute_error(y_test, tree_CV.predict(X_test)))

The best Parameters are : {'max_depth': 11, 'min_samples_leaf': 48}
The best X,y score is:  0.4729605577155309
The best X,y test score is:  0.43559249110262077
The MAE X,y is:  1.8068281111728506
The MAE test X,y is:  1.8703458380744113


In [19]:
%%time
#Pipeline: standardize the features, project them onto principal components (PCA)
#and fit a Ridge regression on the components.
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('ridge', Ridge())
])

#Search space. Keys follow the "<step name>__<parameter name>" convention; a value is
#either a distribution to sample from or a list of candidate values.
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'pca__n_components' : [10, 20, 30, 40, 60]}
#Randomized search: n_iter candidate settings are sampled, each scored with 5-fold CV.
#random_state pins the sampled candidates so that a re-run reproduces the same search.
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5, random_state=42)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 52.1 s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'pca__n_components': [10, 20, 30, 40,
                                                              60],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BF49EC5CA0>})

In [20]:
# Hyper-parameter setting selected by the PCA + Ridge search.
ridge_CV.best_params_

{'pca__n_components': 60, 'ridge__alpha': 196.71672820104513}

In [21]:
# Score of the selected PCA + Ridge pipeline on the training split.
ridge_CV.score(X, y)

0.25488111208145026

In [22]:
# Score of the selected PCA + Ridge pipeline on the held-out test split.
ridge_CV.score(X_test, y_test)

0.27601330436253724

In [23]:
# Mean absolute error on the training split.
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y, ridge_CV.predict(X))

2.1528510221858674

In [24]:
# Mean absolute error on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y_test, ridge_CV.predict(X_test))

2.1635635778523583

In [25]:
%%time
#Pipeline: standardize the features, project them onto principal components (PCA)
#and fit a Ridge regression on the components. Same pipeline as the previous
#PCA search, with a narrower range of component counts.
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('ridge', Ridge())
])

#Search space. Keys follow the "<step name>__<parameter name>" convention; a value is
#either a distribution to sample from or a list of candidate values.
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'pca__n_components' : [60, 62, 64, 66]}
#Randomized search: n_iter candidate settings are sampled, each scored with 5-fold CV.
#random_state pins the sampled candidates so that a re-run reproduces the same search.
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5, random_state=42)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 37.9 s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'pca__n_components': [60, 62, 64, 66],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BF459469A0>})

In [26]:
# Hyper-parameter setting selected by the refined PCA + Ridge search.
ridge_CV.best_params_

{'pca__n_components': 66, 'ridge__alpha': 6.125747369012879}

In [27]:
# Score of the selected PCA + Ridge pipeline on the training split.
ridge_CV.score(X, y)

0.2556934780610557

In [28]:
# Score of the selected PCA + Ridge pipeline on the held-out test split.
ridge_CV.score(X_test, y_test)

0.2765566920062328

In [29]:
# Mean absolute error on the training split.
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y, ridge_CV.predict(X))

2.1506439757612035

In [30]:
# Mean absolute error on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y_test, ridge_CV.predict(X_test))

2.1624782942760796

In [31]:
%%time
#Pipeline: standardize the features, expand them with polynomial terms
#(include_bias=False leaves out the constant column of ones), reduce the expanded
#features with PCA and fit a Ridge regression on the components.
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('pca', PCA()),
    ('ridge', Ridge())
])

#Search space. Keys follow the "<step name>__<parameter name>" convention; a value is
#either a distribution to sample from or a list of candidate values.
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'poly__degree': [1,2], 'pca__n_components' : [30, 40, 50, 60]}
#Randomized search: n_iter candidate settings are sampled, each scored with 5-fold CV.
#random_state pins the sampled candidates so that a re-run reproduces the same search.
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5, random_state=42)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 5min 44s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('poly',
                                              PolynomialFeatures(include_bias=False)),
                                             ('pca', PCA()),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'pca__n_components': [30, 40, 50, 60],
                                        'poly__degree': [1, 2],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BF44E5AF40>})

In [32]:
# Report the selected hyper-parameters plus the score and MAE of the
# poly + PCA + Ridge search on the training and test splits.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("The best Parameters are :", ridge_CV.best_params_)
print("The best X,y score is: ", ridge_CV.score(X, y))
print("The best X,y test score is: ", ridge_CV.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(y, ridge_CV.predict(X)))
print("The MAE test X,y is: ", mean_absolute_error(y_test, ridge_CV.predict(X_test)))

The best Parameters are : {'pca__n_components': 60, 'poly__degree': 1, 'ridge__alpha': 172.52565962730432}
The best X,y score is:  0.2548817850785814
The best X,y test score is:  0.27604559239218884
The MAE X,y is:  2.152768892577137
The MAE test X,y is:  2.1634719034809025


In [33]:
%%time
#Baseline pipeline: standardize the features and fit an ordinary least-squares
#LinearRegression. There is no regularization and no hyper-parameter search here.
linear_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('linear', LinearRegression())
])

#NOTE(review): the output recorded for this model shows a train R^2 of about 0.26
#but a test R^2 of about -1.3e18. Presumably the one-hot columns are collinear and
#the unregularized fit ends up with huge coefficients - confirm before relying on it.
#Now we train the model
linear_pipe.fit(X, y)

Wall time: 191 ms


Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearRegression())])

In [34]:
# Report the score and MAE of the linear baseline on the training and test splits.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("The best X,y score is: ", linear_pipe.score(X, y))
print("The best X,y test score is: ", linear_pipe.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(y, linear_pipe.predict(X)))
print("The MAE test X,y is: ", mean_absolute_error(y_test, linear_pipe.predict(X_test)))

The best X,y score is:  0.2562773006049426
The best X,y test score is:  -1.2877764535941084e+18
The MAE X,y is:  2.1501943618043127
The MAE test X,y is:  30263775.16614799


In [35]:
%%time
# Linear support-vector regression on standardized features.
svm = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', LinearSVR(random_state=42, max_iter=10000, epsilon=1, C=1000)),
    ]
)
# Fit on the training split; the fitted pipeline is the value displayed by the cell.
svm.fit(X, y)

Wall time: 2min 17s




Pipeline(steps=[('scaler', StandardScaler()),
                ('svm',
                 LinearSVR(C=1000, epsilon=1, max_iter=10000,
                           random_state=42))])

In [36]:
# Report the score and MAE of the LinearSVR pipeline on the training and test splits.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("The best X,y score is: ", svm.score(X, y))
print("The best X,y test score is: ", svm.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(y, svm.predict(X)))
print("The MAE test X,y is: ", mean_absolute_error(y_test, svm.predict(X_test)))

The best X,y score is:  0.03367518705837347
The best X,y test score is:  0.14272707474426827
The MAE X,y is:  2.2704148752770594
The MAE test X,y is:  2.2635366707544993


In [37]:
%%time
# Support-vector regression with an RBF kernel on standardized features.
# This re-binds the name `svm`, replacing the LinearSVR pipeline fitted earlier.
svm = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', SVR(C=1000, epsilon=0.5, kernel='rbf')),
    ]
)
# Fit on the training split; the fitted pipeline is the value displayed by the cell.
svm.fit(X, y)

Wall time: 22min 10s


Pipeline(steps=[('scaler', StandardScaler()),
                ('svm', SVR(C=1000, epsilon=0.5))])

In [38]:
# Report the score and MAE of the RBF SVR pipeline on the training and test splits.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("The best X,y score is: ", svm.score(X, y))
print("The best X,y test score is: ", svm.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(y, svm.predict(X)))
print("The MAE test X,y is: ", mean_absolute_error(y_test, svm.predict(X_test)))

The best X,y score is:  0.48239334737287676
The best X,y test score is:  0.4509594337442103
The MAE X,y is:  1.7598939589719105
The MAE test X,y is:  1.825131240668002


In [39]:
%%time
#Pipeline: rescale every feature with MinMaxScaler and fit a Gaussian Naive Bayes model.
#NOTE(review): GaussianNB is a classifier, so each distinct value of y is treated as a
#class label and .score() reports accuracy rather than R^2. Confirm that this model
#belongs in a comparison with the regressors fitted in the other cells.
gnb_pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('gnb', GaussianNB())
])

#No hyper-parameter search is run for this model.
#Now we train the model
gnb_pipe.fit(X, y)

Wall time: 141 ms


Pipeline(steps=[('scaler', MinMaxScaler()), ('gnb', GaussianNB())])

In [40]:
# Report the score and MAE of the GaussianNB pipeline on the training and test splits.
# GaussianNB is a classifier, so .score() here is classification accuracy and is not
# comparable with the R^2 values printed for the regressors.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("The best X,y score is: ", gnb_pipe.score(X, y))
print("The best X,y test score is: ", gnb_pipe.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(y, gnb_pipe.predict(X)))
print("The MAE test X,y is: ", mean_absolute_error(y_test, gnb_pipe.predict(X_test)))

The best X,y score is:  0.006657860916749964
The best X,y test score is:  0.005997429672997287
The MAE X,y is:  10.130515493359988
The MAE test X,y is:  10.173639868627731


In [43]:
%%time
#Pipeline: standardize the features and fit a support-vector regressor (SVR).
#The C and epsilon given here are only starting values; the search below overrides them.
svr_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVR(C=1000, epsilon=0.5))
])

#Search space. Keys follow the "<step name>__<parameter name>" convention; each value
#is a list of candidate values.
dists = {'svm__C' : [1000, 3000, 5000], 'svm__epsilon': [0.001, 0.1, 1]}
#Randomized search: only n_iter=3 of the 9 possible combinations are tried, each
#scored with 5-fold CV. random_state pins which 3 are drawn, so this very slow
#search (the recorded wall time is about 8 hours) gives the same result when re-run.
svr_CV = RandomizedSearchCV(svr_pipe, param_distributions=dists, n_iter=3, cv=5, random_state=42)
#Now we train the model
svr_CV.fit(X, y)

Wall time: 8h 36s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('svm',
                                              SVR(C=1000, epsilon=0.5))]),
                   n_iter=3,
                   param_distributions={'svm__C': [1000, 3000, 5000],
                                        'svm__epsilon': [0.001, 0.1, 1]})

In [44]:
# Report the selected hyper-parameters plus the score and MAE of the SVR search
# on the training and test splits.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("The best Parameters are :", svr_CV.best_params_)
print("The best X,y score is: ", svr_CV.score(X, y))
print("The best X,y test score is: ", svr_CV.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(y, svr_CV.predict(X)))
print("The MAE test X,y is: ", mean_absolute_error(y_test, svr_CV.predict(X_test)))

The best Parameters are : {'svm__epsilon': 1, 'svm__C': 1000}
The best X,y score is:  0.48312998658187045
The best X,y test score is:  0.45485682437869546
The MAE X,y is:  1.7693013134400775
The MAE test X,y is:  1.8224817093740744
