# Modelos

## 1. Regresión Lineal

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from scipy.stats import uniform

from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error

from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVC, SVR, LinearSVR

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('Wine_reviews_climate_prediction.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65838 entries, 0 to 65837
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      65838 non-null  int64  
 1   country         65838 non-null  object 
 2   description     65838 non-null  object 
 3   points          65838 non-null  int64  
 4   price           60311 non-null  float64
 5   taster_name     65838 non-null  object 
 6   variety         65838 non-null  object 
 7   winery          65838 non-null  object 
 8   Year            65838 non-null  int64  
 9   region          65838 non-null  object 
 10  Latitude        65838 non-null  float64
 11  Longitude       65838 non-null  float64
 12  Lat_x           65838 non-null  float64
 13  Long_x          65838 non-null  float64
 14  temp_anual      65838 non-null  float64
 15  temp_max_anual  65838 non-null  float64
 16  temp_min_anual  65838 non-null  float64
 17  pre_anual       65838 non-null 

In [4]:
df.isnull().sum()

Unnamed: 0           0
country              0
description          0
points               0
price             5527
taster_name          0
variety              0
winery               0
Year                 0
region               0
Latitude             0
Longitude            0
Lat_x                0
Long_x               0
temp_anual           0
temp_max_anual       0
temp_min_anual       0
pre_anual            0
etp_anual            0
dtype: int64

In [5]:
#Deleteing rows with price = 0
df.dropna(subset = ['price'], inplace=True)
df.isnull().sum()

Unnamed: 0        0
country           0
description       0
points            0
price             0
taster_name       0
variety           0
winery            0
Year              0
region            0
Latitude          0
Longitude         0
Lat_x             0
Long_x            0
temp_anual        0
temp_max_anual    0
temp_min_anual    0
pre_anual         0
etp_anual         0
dtype: int64

In [6]:
df.drop(columns=['description', 'variety', 'winery', 'Latitude', 'Longitude', 'region', 'Unnamed: 0'], inplace=True)
df

Unnamed: 0,country,points,price,taster_name,Year,Lat_x,Long_x,temp_anual,temp_max_anual,temp_min_anual,pre_anual,etp_anual
0,Portugal,87,15.0,Roger Voss,2011,41.25,-5.25,13.10,19.70,6.57,298.7,1206
2,Portugal,87,15.0,Roger Voss,2011,41.25,-5.25,13.10,19.70,6.57,298.7,1206
3,Portugal,87,17.0,Roger Voss,2011,41.25,-5.25,13.10,19.70,6.57,298.7,1206
4,Portugal,91,12.0,Roger Voss,2011,41.25,-5.25,13.10,19.70,6.57,298.7,1206
5,Portugal,87,8.0,Roger Voss,2011,41.25,-5.25,13.10,19.70,6.57,298.7,1206
...,...,...,...,...,...,...,...,...,...,...,...,...
65833,France,86,10.0,Lauren Buzzeo,2009,-5.75,-35.25,26.38,30.47,22.32,1531.9,1332
65834,Israel,84,12.0,Lauren Buzzeo,2010,30.75,34.75,21.45,27.60,15.35,81.8,1779
65835,Italy,86,15.0,Kerin O’Keefe,2012,40.25,18.25,17.46,21.31,13.65,719.4,1176
65836,Switzerland,90,21.0,Jeff Jenssen,2012,46.75,6.75,9.15,13.46,4.87,1382.5,696


In [7]:
df = pd.get_dummies(df, columns=['country', 'taster_name'])

In [12]:
data = df

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60311 entries, 0 to 65837
Data columns (total 70 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   points                          60311 non-null  int64  
 1   price                           60311 non-null  float64
 2   Year                            60311 non-null  int64  
 3   Lat_x                           60311 non-null  float64
 4   Long_x                          60311 non-null  float64
 5   temp_anual                      60311 non-null  float64
 6   temp_max_anual                  60311 non-null  float64
 7   temp_min_anual                  60311 non-null  float64
 8   pre_anual                       60311 non-null  float64
 9   etp_anual                       60311 non-null  int64  
 10  country_Argentina               60311 non-null  uint8  
 11  country_Australia               60311 non-null  uint8  
 12  country_Austria                 

In [13]:
#First we separate our info, the X will be all the columns except the output variable, then the output variable, the test size 20% and the random state to have always the same output
# is like X, X_test, y, y_test = train_test_split(variables_X, variable_Y, test_size, random_state)
X, X_test, y, y_test = train_test_split(data.drop(columns='points'), data.points, test_size=0.2, random_state=42)

In [22]:
%%time
#We create the Pipeline. First step, Normalize data, include the Polynomimal 
#(include bias false to remove from the Polynomial for X[0]=1) and then use the Ridge method
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ridge', Ridge())
])

#In a dictionary, Step name _ _ and then parameter name, and then : with the type to do
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'poly__degree' : [1, 2]}
#Cross Validation (CV uses the n parts (1/5) for cv=5, n_inter is the number of models, )
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 12min 23s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('poly',
                                              PolynomialFeatures(include_bias=False)),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'poly__degree': [1, 2],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231BB2390A0>})

In [23]:
ridge_CV.best_params_

{'poly__degree': 2, 'ridge__alpha': 166.24009617277838}

In [24]:
ridge_CV.score(X, y)

0.3924853850930041

In [25]:
ridge_CV.score(X_test, y_test)

0.392756237068803

In [28]:
mean_absolute_error(ridge_CV.predict(X), y)

1.9586243163134982

In [29]:
mean_absolute_error(ridge_CV.predict(X_test), y_test)

1.9600568695446499

In [97]:
%%time
#Now we try a Tree Regressor
tree = DecisionTreeRegressor()

#In a dictionary, Step name _ _ and then parameter name, and then : with the type to do
param_grid = {'max_depth' : [10, 11, 12, 13, 14, 15], 'min_samples_leaf' : [47, 48, 49, 50, 51, 52, 53]}

#Cross Validation (CV uses the n parts (1/5) for cv=5, n_inter is the number of models, )
tree_CV = GridSearchCV(tree, param_grid=param_grid)
#Now we train the model
tree_CV.fit(X, y)

Wall time: 1min 28s


GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15],
                         'min_samples_leaf': [47, 48, 49, 50, 51, 52, 53]})

In [98]:
print("The best Parameters are :", tree_CV.best_params_)
print("The best X,y score is: ", tree_CV.score(X, y))
print("The best X,y test score is: ", tree_CV.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(tree_CV.predict(X), y))
print("The MAE test X,y is: ", mean_absolute_error(tree_CV.predict(X_test), y_test))

The best Parameters are : {'max_depth': 13, 'min_samples_leaf': 51}
The best X,y score is:  0.4835437428790962
The best X,y test score is:  0.4450123421740867
The MAE X,y is:  1.7918503363274374
The MAE test X,y is:  1.856160252006487


In [83]:
plt.figure(figsize = (10,8))
tree.plot_tree(tree_CV, filled = True, feature_names= X.columns)
plt.show()

AttributeError: 'DecisionTreeRegressor' object has no attribute 'plot_tree'

<Figure size 720x576 with 0 Axes>

In [42]:
%%time
#We create the Pipeline. First step, Normalize data, include the Polynomimal 
#(include bias false to remove from the Polynomial for X[0]=1) and then use the Ridge method
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('ridge', Ridge())
])

#In a dictionary, Step name _ _ and then parameter name, and then : with the type to do
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'pca__n_components' : [10, 20, 30, 40, 60]}
#Cross Validation (CV uses the n parts (1/5) for cv=5, n_inter is the number of models, )
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 1min 57s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'pca__n_components': [10, 20, 30, 40,
                                                              60],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231BB2385E0>})

In [43]:
ridge_CV.best_params_

{'pca__n_components': 60, 'ridge__alpha': 147.28488031431766}

In [49]:
ridge_CV.score(X, y)

0.25360981580919206

In [50]:
ridge_CV.score(X_test, y_test)

0.2897108479615661

In [51]:
mean_absolute_error(ridge_CV.predict(X), y)

2.1641752784027126

In [52]:
mean_absolute_error(ridge_CV.predict(X_test), y_test)

2.1590588921907137

In [53]:
%%time
#We create the Pipeline. First step, Normalize data, include the Polynomimal 
#(include bias false to remove from the Polynomial for X[0]=1) and then use the Ridge method
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('ridge', Ridge())
])

#In a dictionary, Step name _ _ and then parameter name, and then : with the type to do
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'pca__n_components' : [60, 62, 64, 66]}
#Cross Validation (CV uses the n parts (1/5) for cv=5, n_inter is the number of models, )
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 1min 21s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'pca__n_components': [60, 62, 64, 66],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231BB2859D0>})

In [54]:
ridge_CV.best_params_

{'pca__n_components': 66, 'ridge__alpha': 162.8536259267842}

In [55]:
ridge_CV.score(X, y)

0.25518260951638316

In [56]:
ridge_CV.score(X_test, y_test)

0.29169151231205026

In [57]:
mean_absolute_error(ridge_CV.predict(X), y)

2.161276388139255

In [58]:
mean_absolute_error(ridge_CV.predict(X_test), y_test)

2.154613546521073

In [59]:
%%time
#We create the Pipeline. First step, Normalize data, include the Polynomimal 
#(include bias false to remove from the Polynomial for X[0]=1) and then use the Ridge method
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('pca', PCA()),
    ('ridge', Ridge())
])

#In a dictionary, Step name _ _ and then parameter name, and then : with the type to do
dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'poly__degree': [1,2], 'pca__n_components' : [30, 40, 50, 60]}
#Cross Validation (CV uses the n parts (1/5) for cv=5, n_inter is the number of models, )
ridge_CV = RandomizedSearchCV(ridge_pipe, param_distributions=dists, n_iter=30, cv=5)
#Now we train the model
ridge_CV.fit(X, y)

Wall time: 10min 12s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('poly',
                                              PolynomialFeatures(include_bias=False)),
                                             ('pca', PCA()),
                                             ('ridge', Ridge())]),
                   n_iter=30,
                   param_distributions={'pca__n_components': [30, 40, 50, 60],
                                        'poly__degree': [1, 2],
                                        'ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000231BB2A5D60>})

In [62]:
print("The best Parameters are :", ridge_CV.best_params_)
print("The best X,y score is: ", ridge_CV.score(X, y))
print("The best X,y test score is: ", ridge_CV.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(ridge_CV.predict(X), y))
print("The MAE test X,y is: ", mean_absolute_error(ridge_CV.predict(X_test), y_test))

The best Parameters are : {'pca__n_components': 60, 'poly__degree': 1, 'ridge__alpha': 181.45990701324016}
The best X,y score is:  0.2536085421827321
The best X,y test score is:  0.2896586436603422
The MAE X,y is:  2.164309793843175
The MAE test X,y is:  2.1591961188551423


In [None]:
%%time
#We create the Pipeline. First step, Normalize data, include the Polynomimal 
#(include bias false to remove from the Polynomial for X[0]=1) and then use the Ridge method
linear_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('linear', LinearRegression())
])

#In a dictionary, Step name _ _ and then parameter name, and then : with the type to do
#dists = {'ridge__alpha' : uniform(loc=0, scale=200), 'poly__degree': [1,2], 'pca__n_components' : [30, 40, 50, 60]}
#Cross Validation (CV uses the n parts (1/5) for cv=5, n_inter is the number of models, )
#Now we train the model
linear_pipe.fit(X, y)

In [102]:
print("The best X,y score is: ", linear_pipe.score(X, y))
print("The best X,y test score is: ", linear_pipe.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(linear_pipe.predict(X), y))
print("The MAE test X,y is: ", mean_absolute_error(linear_pipe.predict(X_test), y_test))

The best X,y score is:  0.2552441568778654
The best X,y test score is:  0.29199847481116614
The MAE X,y is:  2.1605649713768558
The MAE test X,y is:  2.153679593342132


In [None]:
%%time
svm = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', LinearSVR(C=1000, epsilon=0.5, max_iter=1000, random_state=42))
])

svm.fit(X, y)

In [None]:
print("The best X,y score is: ", svm.score(X, y))
print("The best X,y test score is: ", svm.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(svm.predict(X), y))
print("The MAE test X,y is: ", mean_absolute_error(svm.predict(X_test), y_test))

In [None]:
%%time
svm = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVR(kernel='rbf', C=1000, epsilon=0.5))
])

svm.fit(X, y)

In [None]:
print("The best X,y score is: ", svm.score(X, y))
print("The best X,y test score is: ", svm.score(X_test, y_test))
print("The MAE X,y is: ", mean_absolute_error(svm.predict(X), y))
print("The MAE test X,y is: ", mean_absolute_error(svm.predict(X_test), y_test))