In [1]:
#import dependencies
import numpy as np
import pandas as pd
import requests
import os 

from io import StringIO

import matplotlib.pyplot as plt

In [2]:
wine_df_2017 = pd.read_csv('clean_2017.csv')
wine_df_2017

Unnamed: 0,country,designation,points,price,province,variety,winery
0,US,Martha's Vineyard,96,235.0,California,Cabernet Sauvignon,Heitz
1,Spain,Carodorum SelecciÃ³n Especial Reserva,96,110.0,Northern Spain,Tinta de Toro,Bodega Carmen RodrÃ­guez
2,US,Special Selected Late Harvest,96,90.0,California,Sauvignon Blanc,Macauley
3,US,Reserve,96,65.0,Oregon,Pinot Noir,Ponzi
4,France,La BrÃ»lade,95,66.0,Provence,Provence red blend,Domaine de la BÃ©gude
...,...,...,...,...,...,...,...
59745,France,Andlau,84,21.0,Alsace,Riesling,Marc Kreydenweiss
59746,US,Five-O,84,25.0,New York,Red Blend,Martha Clara
59747,US,Crimson Creek,84,27.0,California,Merlot,Pine Ridge
59748,Portugal,40-year old tawny,84,130.0,Port,Port,PoÃ§as


In [3]:
# Drop columns that are not needed for the ML piece
wine_ml_2017 = wine_df_2017.drop(['designation', 'province', 'winery', 'variety'], axis=1)
wine_ml_2017.head()

Unnamed: 0,country,points,price
0,US,96,235.0
1,Spain,96,110.0
2,US,96,90.0
3,US,96,65.0
4,France,95,66.0


In [4]:
wine_ml_2017.isnull().sum()

country    0
points     0
price      0
dtype: int64

In [5]:
# Convert country values that are not US to Other Country
wine_ml_2017['country'] = wine_ml_2017['country'].replace({'Spain':'Other', 'Italy':'Other', 'France':'Other', 'Chile':'Other', 'Not US': 'Other', 'Portugal':'Other', 'Argentina':'Other', 'Australia':'Other', 'Germany':'Other', 'New Zealand':'Other', 'South Africa':'Other', 'Greece':'Other', 'Israel':'Other', 'Canada':'Other', 'Hungary':'Other', 'Romania':'Other', 'Croatia':'Other', 'Slovenia':'Other', 'Bulgaria':'Other', 'Mexico':'Other', 'Moldova':'Other', 'Uruguay':'Other', 'Turkey':'Other', 'Georgia':'Other', 'Lebanon':'Other', 'Cyprus':'Other','Brazil':'Other', 'England':'Other', 'Serbia':'Other', 'Morocco':'Other', 'Ukraine':'Other', 'Lithuania':'Other', 'Macedonia':'Other', 'Switzerland':'Other', 'Luxembourg':'Other', 'India':'Other', 'South Korea':'Other', 'China':'Other', 'Bosnia and Herzegovina':'Other', 'Japan':'Other', 'Austria':'Other'})
wine_ml_2017.head()

Unnamed: 0,country,points,price
0,US,96,235.0
1,Other,96,110.0
2,US,96,90.0
3,US,96,65.0
4,Other,95,66.0


In [6]:
# shows the counts by country to see spread
# US is roughly half of the dataset
wine_ml_2017.groupby(['country']).count()

Unnamed: 0_level_0,points,price
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,34499,34499
US,25251,25251


In [7]:
# Convert the TARGET COLUMN values to good_wine and great_wine
wine_ml_2017.loc[wine_ml_2017.points >= 90, "ratings_desc"] = 1 #great_wine
wine_ml_2017.loc[wine_ml_2017.points < 90, "ratings_desc"] = 0 #good_value

# validate the ratings_desc column is working
wine_ml_2017.sort_values(['points'], ascending=True)

Unnamed: 0,country,points,price,ratings_desc
48598,US,80,20.0,0.0
16517,Other,80,10.0,0.0
33541,Other,80,40.0,0.0
38510,Other,80,20.0,0.0
38509,Other,80,9.0,0.0
...,...,...,...,...
46177,US,100,215.0,1.0
17630,Other,100,195.0,1.0
51821,Other,100,210.0,1.0
14905,Other,100,460.0,1.0


In [14]:
# Convert the COUNTRY COLUMN values to good_wine and great_wine
US_df = wine_ml_2017[wine_ml_2017.country == 'US']
US_df = US_df[['price', 'ratings_desc']]

US_df

Unnamed: 0,price,ratings_desc
0,235.0,1.0
2,90.0,1.0
3,65.0,1.0
8,65.0,1.0
9,60.0,1.0
...,...,...
59735,14.0,0.0
59738,20.0,0.0
59746,25.0,0.0
59747,27.0,0.0


In [15]:
Other_df = wine_ml_2017[wine_ml_2017.country == 'Other']
Other_df = Other_df[['price', 'ratings_desc']]
Other_df

Unnamed: 0,price,ratings_desc
1,110.0,1.0
4,66.0,1.0
5,73.0,1.0
6,65.0,1.0
7,110.0,1.0
...,...,...
59742,13.0,0.0
59743,13.0,0.0
59744,50.0,0.0
59745,21.0,0.0


### US Dataset

## 1. Split the Data into Training and Testing

In [16]:
# Create our features
X = US_df.drop('ratings_desc', axis =1 )

# Create our target
y = US_df['ratings_desc']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

## 2. Create a Logistic Regression Model

In [17]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

 ## 3. Fit (train) or model using the training data

In [18]:
# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

## 4. Make Predictions

In [19]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
23183,0.0,0.0
45765,0.0,0.0
10945,0.0,1.0
48122,0.0,0.0
46436,0.0,0.0
...,...,...
14311,1.0,0.0
45182,0.0,0.0
54501,0.0,0.0
1107,1.0,1.0


## 5. Validate the model using the test data

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.7001425629653096

In [21]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(predictions,y_test)
cm

array([[3315, 1421],
       [ 472, 1105]])

In [22]:
# Format for easier understanding
cm_df = pd.DataFrame(cm)
cm_df.columns = ['Predicted good_wine','Predicted great_wine']
cm_df = cm_df.rename(index={0: 'Actual good_wine',1:'Actual great_wine'})
cm_df

Unnamed: 0,Predicted good_wine,Predicted great_wine
Actual good_wine,3315,1421
Actual great_wine,472,1105


In [23]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test,predictions))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.70      0.88      0.44      0.78      0.62      0.40      3787
        1.0       0.70      0.44      0.88      0.54      0.62      0.37      2526

avg / total       0.70      0.70      0.61      0.68      0.62      0.39      6313



### Other Country DF

In [24]:
# Create our features
X = Other_df.drop('ratings_desc', axis =1 )

# Create our target
y = Other_df['ratings_desc']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

In [25]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [26]:
# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [27]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
58775,0.0,0.0
1797,1.0,1.0
32946,0.0,0.0
53467,0.0,0.0
17434,0.0,0.0
...,...,...
13341,1.0,1.0
16700,0.0,1.0
54533,0.0,0.0
10674,0.0,0.0


In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.7742608695652174

In [29]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(predictions,y_test)
cm

array([[5359, 1545],
       [ 402, 1319]])

In [30]:
# Format for easier understanding
cm_df = pd.DataFrame(cm)
cm_df.columns = ['Predicted good_wine','Predicted great_wine']
cm_df = cm_df.rename(index={0: 'Actual good_wine',1:'Actual great_wine'})
cm_df

Unnamed: 0,Predicted good_wine,Predicted great_wine
Actual good_wine,5359,1545
Actual great_wine,402,1319


In [31]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test,predictions))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.78      0.93      0.46      0.85      0.65      0.45      5761
        1.0       0.77      0.46      0.93      0.58      0.65      0.41      2864

avg / total       0.77      0.77      0.62      0.76      0.65      0.44      8625

