## Enfoque analítico: 

Se aplicará una regresión lineal multivariable con el CSV de vinos.

In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the wine CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv("winemag-data-130k-v2.csv")

In [3]:
# Preview the first three rows to check how the columns were parsed.
df.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [4]:
# (rows, columns) of the raw frame.
df.shape

(129971, 14)

In [5]:
# Column labels of the raw frame.
df.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')

In [6]:
# Per-column dtypes of the raw frame.
df.dtypes

Unnamed: 0                 int64
country                   object
description               object
designation               object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object

**Filtro de valores atípicos de 'price' por rango intercuartílico (IQR):**

In [7]:
# First and third quartiles of the price column and their spread (IQR).
price = df['price']
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1

# Print each statistic with its label.
for label, value in (("1° Cuartil: ", Q1),
                     ("3° Cuartil: ", Q3),
                     ("Rango intercuartílico: ", IQR)):
    print(label, value)

1° Cuartil:  17.0
3° Cuartil:  42.0
Rango intercuartílico:  25.0


In [8]:
# Keep only rows whose price lies strictly inside the 1.5 * IQR fences.
# Rows with a missing price fail both comparisons and are dropped as well.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
inside_fences = (df['price'] > lower_fence) & (df['price'] < upper_fence)
df_filtered = df[inside_fences]

In [None]:
# Distribution of the IQR-filtered prices. `sns.distplot` is deprecated since
# seaborn 0.11; `histplot` with a KDE overlay on a density scale is the
# documented replacement and draws the same kind of figure.
ax = sns.histplot(df_filtered['price'], kde=True, stat='density')
ax.set(title='Price distribution after IQR filtering', xlabel='price');

### Proceso la característica 'country':

In [None]:
# List the distinct values found in the 'country' column after the price filter.
df_filtered['country'].unique()

**Ahora establezco la transformación para clasificar los países en los continentes correspondientes, y lo almaceno en el nuevo DF:**

In [None]:
# Continent code -> countries assigned to that code. The codes are arbitrary
# integer labels; code 4 is not used.
# NOTE(review): the continent names in the comments below are inferred from
# the member countries — confirm the intended grouping.
continents = {
    # Asia
    0: ('Georgia', 'China', 'India'),
    # Africa
    1: ('South Africa', 'Morocco', 'Egypt'),
    # North America
    2: ('US', 'Mexico', 'Canada'),
    # South America
    3: ('Argentina', 'Chile', 'Uruguay', 'Brazil', 'Peru'),
    # Europe (this group also holds Israel, Turkey, Lebanon, Cyprus and Armenia)
    5: (
        'Italy', 'Portugal', 'Spain', 'France', 'Germany', 'Austria',
        'Israel', 'Hungary', 'Greece', 'Romania', 'Turkey', 'Czech Republic',
        'Slovenia', 'Luxembourg', 'Croatia', 'England', 'Lebanon', 'Serbia',
        'Moldova', 'Bulgaria', 'Cyprus', 'Armenia', 'Switzerland',
        'Bosnia and Herzegovina', 'Ukraine', 'Slovakia', 'Macedonia',
    ),
    # Oceania
    6: ('Australia', 'New Zealand'),
}

In [None]:
def get_continent(country):
    """Return the continent code whose country tuple contains `country`.

    The module-level `continents` mapping is scanned in its own order and the
    first matching key is returned. If no group lists the value, -1 is
    returned instead.
    """
    matching_codes = (code for code, members in continents.items()
                      if country in members)
    return next(matching_codes, -1)

In [None]:
# Tag every row with its continent code (-1 when the country is not in the mapping).
continent_codes = df_filtered['country'].apply(get_continent)
df_filtered = df_filtered.assign(continents=continent_codes)

In [None]:
# 'continents' holds discrete integer codes, so draw one bar per code instead
# of a histogram with a KDE curve (the `sns.distplot` call used before is
# deprecated as well).
ax = sns.countplot(x='continents', data=df_filtered)
ax.set(title='Rows per continent code', xlabel='continent code', ylabel='rows');

Aplico OneHotEncoder a la columna 'continents' para poder usarla más adelante en una regresión lineal.

In [None]:
# One-hot encode the continent codes: fit the encoder on the single-column
# frame, turn the encoded result into a dense array and wrap it in a DataFrame.
continent_column = df_filtered[['continents']]
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(continent_column)
data = enc.transform(continent_column).toarray()
a = pd.DataFrame(data)

In [None]:
# Name each one-hot column after the category it actually encodes. The encoder
# emits one column per category in sorted order (`enc.categories_[0]`); that
# list contains -1 whenever some country is unmapped and never contains the
# unused code 4, so a hard-coded continents_0..continents_6 list would attach
# the wrong label to most columns (or fail on a length mismatch).
a.columns = ['continents_{}'.format(int(code)) for code in enc.categories_[0]]

In [None]:
# Display the one-hot encoded frame.
a

In [None]:
# Give both frames a fresh 0..n-1 index so their rows carry the same labels.
a = a.reset_index(drop=True)
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
# Display the one-hot encoded frame again, after the index reset.
a

In [None]:
# Join the one-hot columns to the filtered frame side by side. `pd.concat`
# defaults to axis=0, which stacks the two frames vertically and fills the
# columns they do not share with NaN; axis=1 aligns them on the (already
# reset) index instead. The result is stored in a new variable so that the
# joined frame is not discarded and re-running the cell stays idempotent.
df_encoded = pd.concat([df_filtered, a], axis=1)
df_encoded.head()

### A partir de acá comienzo las tareas de Machine Learning (Regresión Lineal):

**Realizo Split 70/30:**

In [None]:
# Boolean mask that selects roughly 70% of the rows for training. A fixed
# seed makes the split, and every metric computed from it, reproducible after
# a kernel restart.
SPLIT_SEED = 42
msk = np.random.RandomState(SPLIT_SEED).rand(len(df_filtered)) < 0.7

In [None]:
# Rows where the mask is True go to train; the complement goes to test.
train, test = df_filtered[msk], df_filtered[~msk]

In [None]:
# Report the (rows, columns) of each partition, train first.
for partition in (train, test):
    print(partition.shape)

In [None]:
# Create the linear regression estimator
regr = linear_model.LinearRegression()

**Ajusto la regresión lineal usando 'price' como variable predictora y 'points' como variable objetivo**

In [None]:
# Fit on the training rows: 'price' is the single feature and 'points' the target.
# Both are passed as one-column DataFrames (double brackets).
regr.fit(train[['price']], train[['points']])

Y ahora calculo las predicciones para los datos de prueba:

In [None]:
# Predict 'points' from the 'price' column of the test rows.
points_y_pred = regr.predict(test[['price']])

A continuación, muestro los coeficientes de la recta generada y las métricas:

- **error cuadrático medio (MSE)**
- **R2**

In [None]:
# Report the fitted line (slope and intercept) and the test-set metrics.
# R2 is printed next to the MSE: `r2_score` is imported and the notebook text
# announces both metrics, but only the MSE was being computed.
print('Coeficientes: \n', regr.coef_, regr.intercept_)
print("MSE: %.2f"
      % mean_squared_error(test['points'], points_y_pred))
print("R2: %.2f"
      % r2_score(test['points'], points_y_pred))

Graficación:

In [None]:
# Test observations (grey) with the fitted regression line (blue). Ticks are
# kept and the axes are labelled so the price and points scales can be read
# off the figure; the explicit fig/ax interface avoids pyplot's global state.
fig, ax = plt.subplots()
ax.scatter(test['price'], test['points'], color='grey', label='test data')
ax.plot(test['price'], points_y_pred, color='blue', linewidth=1, label='linear fit')
ax.set(xlabel='price', ylabel='points', title='points vs price (test set)')
ax.legend()

plt.show()

### Ahora analizo la relación 'points' vs 'continents'

In [None]:
# Scatter of the integer continent code against 'points' on the training rows.
plt.scatter(train['continents'], train['points'],  color='grey')

In [None]:
# Refit the same estimator object, now with the continent code as the only
# feature; this replaces the price-based fit made earlier.
# NOTE(review): 'continents' is an arbitrary integer label, so the model treats
# it as an ordered numeric value — confirm whether the one-hot columns were meant here.
regr.fit(train[['continents']], train[['points']])