In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree

# Import Raw Data

In [3]:
# Location of the raw wine-review dataset.
# NOTE: this is a machine-specific absolute path; change RAW_DATA_PATH (and only
# RAW_DATA_PATH) to point at your own copy of the file before running the notebook.
RAW_DATA_PATH = 'C:/Users/Eason/Downloads/winemag-data_first150k.json/winemag-data_first150k.json'
df = pd.read_json(RAW_DATA_PATH)

In [4]:
# Preview only the first rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude
5,Spain,"Deep, dense and pure from the opening bell, th...",Numanthia,95,73.0,Northern Spain,Toro,,Tinta de Toro,Numanthia
6,Spain,Slightly gritty black-fruit aromas include a s...,San Román,95,65.0,Northern Spain,Toro,,Tinta de Toro,Maurodos
7,Spain,Lush cedary black-fruit aromas are luxe and of...,Carodorum Único Crianza,95,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
8,US,This re-named vineyard was formerly bottled as...,Silice,95,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström
9,US,The producer sources from two blocks of the vi...,Gap's Crown Vineyard,95,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm


# Clean the Data

In [5]:
# --- 1. Keep only countries that have more than 500 reviews -----------------
countries = df['country'].unique()
# Count every country in a single pass (value_counts) instead of scanning the
# whole frame once per country. Missing countries are absent from the tally and
# therefore get a count of 0, exactly as the `==` comparison against NaN did.
country_tally = df['country'].value_counts().to_dict()
country_count = [country_tally.get(c, 0) for c in countries]
countries_kept = [c for c, n in zip(countries, country_count) if n > 500]
df = df[df['country'].isin(countries_kept)]

# --- 2. Drop rows whose price or points is missing / non-finite --------------
df = df[np.isfinite(df['price'])]
df = df[np.isfinite(df['points'])]

# --- 3. Keep only grape varieties that have more than `Tol` reviews ----------
varietal = df['variety'].unique()
amount_of_varieties = len(varietal)
Tol = 74
variety_tally = df['variety'].value_counts().to_dict()
variety_count = [variety_tally.get(v, 0) for v in varietal]
# `varietals` and `amount_of_each_variety` stay aligned and keep the
# first-appearance order of `varietal`.
varietals = [v for v, n in zip(varietal, variety_count) if n > Tol]
amount_of_each_variety = [n for n in variety_count if n > Tol]
df = df[df['variety'].isin(varietals)]

# Useful Functions

In [6]:
def R2_score(y_pred,y_true):
    """Coefficient of determination (R^2) of predictions against true targets.

    Note the argument order: predictions first, true values second.
    Computed as 1 - SS_res / SS_tot, where SS_res is the sum of squared
    prediction errors and SS_tot is the sum of squared deviations of the
    true values from their mean.
    """
    errors = y_true - y_pred
    deviations = y_true - y_true.mean()
    ss_res = (errors ** 2).sum()
    ss_tot = (deviations ** 2).sum()
    return 1 - ss_res / ss_tot

In [7]:
def Transform_df_to_X(df):
    """Build the numeric design matrix used by the regression models.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'points', 'country' and 'variety'.

    Returns
    -------
    numpy.ndarray of float, one row per row of ``df``, with columns in this order:
        1. the 'points' value,
        2. one 0/1 indicator per country, in order of first appearance in ``df``,
        3. one 0/1 indicator per variety, in order of first appearance in ``df``,
        4. a constant column of ones (bias / intercept term).
    """
    def _one_hot(column):
        # Indicator columns ordered by first appearance of each level.
        # Missing values are not a level: such rows get all-zero indicators.
        levels = df[column].dropna().unique()
        return pd.get_dummies(df[column])[list(levels)].values

    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    points = df[['points']].values
    bias = np.ones((len(df), 1))

    # Assemble the matrix once rather than re-copying it for every added column.
    return np.hstack([points, _one_hot('country'), _one_hot('variety'), bias]).astype(float)

In [8]:
def Wine_Decision_Tree_Regression_QF(Xtrain,Xtest,Ytrain,**tree_params):
    """Fit a decision-tree regressor on the training data and predict Xtest.

    Parameters
    ----------
    Xtrain, Ytrain : training features and targets passed to ``fit``.
    Xtest : features to predict for.
    **tree_params : optional keyword arguments forwarded unchanged to
        ``tree.DecisionTreeRegressor`` (e.g. min_samples_leaf=10, max_depth=3,
        max_leaf_nodes=100, random_state=0). With none given, the estimator's
        defaults are used, as before.

    Returns
    -------
    The prediction vector for Xtest.
    """
    regressor = tree.DecisionTreeRegressor(**tree_params)
    regressor.fit(Xtrain, Ytrain)
    return regressor.predict(Xtest)

In [54]:
def Wine_LinLstSq_Regression_QF(Xtrain,Xtest,Ytrain):
    """Fit ordinary least squares on the training data and predict Xtest.

    Solves min_w ||Xtrain @ w - Ytrain||^2 with numpy.linalg.lstsq and returns
    the prediction vector Xtest @ w, squeezed of length-1 axes.
    """
    # rcond=None selects NumPy's recommended singular-value cutoff explicitly;
    # calling lstsq without it emits a FutureWarning on NumPy 1.14-1.x and gives
    # a version-dependent cutoff.
    w, _residuals, _rank, _singular_vals = np.linalg.lstsq(Xtrain, Ytrain, rcond=None)
    # Plain ndarray product instead of the discouraged np.matrix type.
    return np.squeeze(np.asarray(np.dot(Xtest, w)))

# Train/Test Set Split

In [10]:
# Feature matrix and regression target (price) as NumPy arrays.
X = Transform_df_to_X(df)
data = X
# Series.as_matrix() was removed in pandas 1.0; `.values` works on old and new versions.
target = df['price'].values

In [11]:
# Hold out 20% of the examples as a test set; the seed is fixed so the split is repeatable.
# (total examples after data cleaning: 129964)
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=0,
)

# Example on Split Data

In [12]:
# R^2 of the decision-tree regressor on the held-out test set
# (R2_score takes the predictions first, then the true values).
R2_score(
    Wine_Decision_Tree_Regression_QF(X_train, X_test, y_train),
    y_test,
)

0.53318894011386031

In [55]:
# R^2 of the linear least-squares regressor on the held-out test set
# (R2_score takes the predictions first, then the true values).
R2_score(
    Wine_LinLstSq_Regression_QF(X_train, X_test, y_train),
    y_test,
)

0.27612856045794587

# Cross Validation 

The cross-validation code in this section still needs some attention, but it should be a quick fix. An example of one run is shown below:

In [13]:
# Ten random 90/10 splits of the training set (seeds 0-9); on each one the
# decision tree is fit on the 90% part and scored with R^2 on the 10% part.
scores = []
for k in range(0, 10):
    X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.1, random_state=k)
    y_pred = Wine_Decision_Tree_Regression_QF(X_tr, X_te, y_tr)
    # Score once and reuse it for both the printout and the running list.
    split_score = R2_score(y_pred, y_te)
    print(split_score)
    scores.append(split_score)
np.mean(scores)

0.210893944844
0.523850666848
0.177200610047
0.527898104625
0.50002328685
0.579740022631
0.460139076228
0.438472575606
0.535115523446
0.490217834946


0.44435516460687785

In [14]:
# input: model (function called as model(train_data, test_data) with two DataFrames,
#        returning predictions for test_data)
# output: mean absolute error averaged over 10 random 90/10 splits of the training set
def cross_validate(model):
    """Average the model's mean absolute error over ten random hold-out splits.

    Uses the module-level ``X_train`` / ``y_train`` arrays and the module-level
    ``features`` list of column names.
    NOTE(review): ``features`` is not defined in the visible part of this
    notebook -- confirm it exists (and matches X_train's columns) before calling.

    For each seed k in 0..9 the training set is split 90/10; ``model`` receives
    a training DataFrame (feature columns plus a 'price' column) and a test
    DataFrame (feature columns only) and must return predictions for the latter.
    """
    # A plain list is used because np.append returns a new array rather than
    # modifying its argument; discarding that result left `scores` empty and
    # made the function always return NaN.
    scores = []
    for k in range(0, 10):
        X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.1, random_state=k)
        train_data = pd.DataFrame(X_tr, columns=features)
        train_data = train_data.assign(price = y_tr)
        test_data = pd.DataFrame(X_te, columns=features)
        y_pred = model(train_data, test_data)
        scores.append(metrics.mean_absolute_error(y_te, y_pred))
    return np.mean(scores)

# Test Set Performance

In [15]:
# input: model (function called as model(train_data, test_data) with two DataFrames,
#        returning predictions for test_data)
# output: mean absolute error on the held-out test set
def test_score(model):
    """Train on the full training set and return the test-set mean absolute error.

    Uses the module-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``
    arrays and the module-level ``features`` list of column names.
    NOTE(review): ``features`` is not defined in the visible part of this
    notebook -- confirm it exists (and matches X_train's columns) before calling.
    """
    train_data = pd.DataFrame(X_train, columns=features)
    # The training frame must carry the *training* targets; y_test has a
    # different length and holds the labels the model must not see.
    train_data = train_data.assign(price = y_train)
    test_data = pd.DataFrame(X_test, columns=features)
    y_pred = model(train_data, test_data)
    score = metrics.mean_absolute_error(y_test, y_pred)
    # Return the computed loss (the bare `return` previously discarded it).
    return score