In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.linear_model import lasso_path
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
%matplotlib inline

In [4]:
def pad_ones_column(X):
    """Add columns of ones to a data matrix"""
    return np.c_[ np.ones(len(ORIGINAL_DF)), X ]

def estimate_beta(X, y):
    """Compte estimates for beta given X and y
    
    Beta = (X^T X)^-1 X^T y
    """
    t1 = np.linalg.inv(np.matmul(X.T, X))
    t2 = np.matmul(X.T, y)
    return np.matmul(t1, t2)

def run_complete_linear_regression(X, y, beta_hat):
    """Run linear regression and produce error statistics"""
    # Make predictions based on testing data
    predicted_y = np.matmul(X, beta_hat)
    
    # Compute the error from the actual data
    errors = np.square(y - predicted_y)
    
    # Compute mean squared error
    MSE = np.mean(errors)
    
    # Compute sum squared error
    SSE = np.sum(errors)

    return MSE, SSE

def run_k_fold(X, y, n_splits=10, print_stuff=False):
    """Run k-fold cross validation"""
    kf = KFold(n_splits=n_splits)
    storeResultsMSE = []
    storeResultsSSE = []

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        beta_hat = estimate_beta(X_train, y_train)

        test_MSE, test_SSE = run_complete_linear_regression(X_test, y_test, beta_hat)

        storeResultsMSE.append(test_MSE)
        storeResultsSSE.append(test_SSE)
        
        if print_stuff:
            print("Iteration:", i, "MSE: ", test_MSE, "SSE:", test_SSE)  
    if print_stuff:
        print("Average MSE:", np.mean(storeResultsMSE), "Average SSE:", np.mean(storeResultsSSE))
    
    return np.mean(storeResultsMSE)

In [5]:
data = pd.read_csv("final_data.csv", index_col=0).dropna()

In [6]:
X_d = data.drop(['review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste', 'avg_palate', 'avg_aroma', 'avg_overall', 'avg_appear', 'avg_taste'], axis=1)
X = np.matrix(X_d)
y = data['review/overall'].values

In [7]:
# Fit Lasso Model for review overall
X = np.matrix(data.drop(['review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste'], axis=1))
y_overall = data['review/overall'].values
model_overa = LassoCV(cv=10, fit_intercept=True).fit(X,y_overall)

# Fit Lasso Model for review appearance
y_appear = data['review/appearance'].values
model_appear = LassoCV(cv=10, fit_intercept=True).fit(X,y_appear)

# Fit Lasso Model for review aroma
y_aroma = data['review/aroma'].values
model_aroma = LassoCV(cv=10, fit_intercept=True).fit(X,y_aroma)

# Fit Lasso Model for review palate
y_palate = data['review/palate'].values
model_palate = LassoCV(cv=10, fit_intercept=True).fit(X,y_palate)

# Fit Lasso Model for review taste
y_taste = data['review/taste'].values
model_taste = LassoCV(cv=10, fit_intercept=True).fit(X,y_taste)

In [8]:
# Select feature columns for each dataset
ind_overall = [i for i, x in enumerate(model_overa.coef_) if x > 0]
ind_appear = [i for i, x in enumerate(model_appear.coef_) if x > 0]
ind_aroma = [i for i, x in enumerate(model_aroma.coef_) if x > 0]
ind_palate = [i for i, x in enumerate(model_palate.coef_) if x > 0]
ind_taste = [i for i, x in enumerate(model_taste.coef_) if x > 0]

In [22]:
ind_overall == ind_appear == ind_aroma == ind_palate == ind_taste

False

In [23]:
# Select the columns for each variable
X_overall = X[: , ind_overall]
X_appear = X[: , ind_appear]
X_aroma = X[: , ind_aroma]
X_palate = X[: , ind_palate]
X_taste = X[: , ind_taste]

In [24]:
# Overall get MSE
run_k_fold(X_overall, np.matrix(y_overall).T)

0.37931720352214943

In [25]:
# Appearance get MSE
run_k_fold(X_appear, np.matrix(y_appear).T)

0.25193336607437378

In [26]:
# Aroma get MSE
run_k_fold(X_aroma, np.matrix(y_aroma).T)

0.29041628410206644

In [27]:
# Palate get MSE
run_k_fold(X_palate, np.matrix(y_palate).T)

0.30537309387326644

In [28]:
# Taste get MSE
run_k_fold(X_taste, np.matrix(y_taste).T)

0.33206299158102531

In [47]:
# Look at features for Overall
X_d.iloc[:, ind_overall].columns

Index(['5 Day IPA', 'Aecht Schlenkerla Eiche', 'Aecht Schlenkerla Fastenbier',
       'Aecht Schlenkerla Helles Lagerbier', 'American Pale Ale',
       'B.O.R.I.S. The Crusher Oatmeal-Imperial Stout',
       'Barrel Aged B.O.R.I.S. Oatmeal Imperial Stout',
       'Black Sheep Ale (Special)', 'Caldera IPA',
       'D.O.R.I.S. The Destroyer Double Imperial Stout',
       'Founders Breakfast Stout', 'Founders CBS Imperial Stout',
       'Founders Centennial IPA', 'Founders Double Trouble',
       'Founders Harvest Ale', 'Founders Imperial Stout',
       'Founders KBS (Kentucky Breakfast Stout)', 'Founders Porter',
       'Founders Red's Rye PA', 'Founders RÃ¼bÃ¦us', 'Hefeweizen',
       'Jefferson's Reserve Bourbon Barrel Stout', 'O.K. Beer',
       'Okocim Porter', 'Pike Kilt Lifter Scotch Style Ale',
       'Pike Street XXXXX Stout', 'Pilsner Urquell', 'Smooth Hoperator',
       'Stoudt's Gold Lager', 'Stoudts American Pale Ale', 'Stoudts Pils',
       'Trafalgar Celebration Ale', 'Welt

In [30]:
# Features for appear
X_d.iloc[:, ind_appear].shape

(37484, 56)

In [31]:
# Features for aroma
X_d.iloc[:, ind_aroma].shape

(37484, 48)

In [32]:
# Features for palate
X_d.iloc[:, ind_palate].shape

(37484, 51)

In [33]:
# Features for taste
X_d.iloc[:, ind_taste].shape

(37484, 51)

In [63]:
set(X_d.iloc[:, ind_aroma].columns) - set(X_d.iloc[:, ind_appear].columns).union(set(X_d.iloc[:, ind_overall].columns),
                                           set(X_d.iloc[:, ind_palate].columns),
                                           set(X_d.iloc[:, ind_taste].columns))

{'brwid__3268'}

In [77]:
list(set(X_d.iloc[:, ind_overall].columns) - set(X_d.iloc[:, ind_appear].columns).union(
                                           set(X_d.iloc[:, ind_aroma].columns),
                                           set(X_d.iloc[:, ind_palate].columns),
                                           set(X_d.iloc[:, ind_taste].columns)))

['Stoudts Pils',
 'mday__22.0',
 'Wheach',
 'Black Sheep Ale (Special)',
 'brwid__3282',
 'Aecht Schlenkerla Eiche',
 'yr__2005.0',
 'yr__2008.0',
 'Trafalgar Celebration Ale',
 'O.K. Beer',
 'brwid__2446',
 'Aecht Schlenkerla Helles Lagerbier',
 'Stoudts American Pale Ale',
 'hr__13.0',
 'Pike Kilt Lifter Scotch Style Ale']

In [67]:
set(X_d.iloc[:, ind_appear].columns) - set(X_d.iloc[:, ind_overall].columns).union(
                                            set(X_d.iloc[:, ind_palate].columns),
                                           set(X_d.iloc[:, ind_aroma].columns),
                                           set(X_d.iloc[:, ind_taste].columns))

{'Founders Black Rye', 'mday__25.0', 'mday__28.0', 'mon__12.0'}

In [70]:
set(X_d.iloc[:, ind_palate].columns) - set(X_d.iloc[:, ind_overall].columns).union(
                                           set(X_d.iloc[:, ind_appear].columns),
                                           set(X_d.iloc[:, ind_aroma].columns),
                                           set(X_d.iloc[:, ind_taste].columns))

{'yr__2006.0'}

In [74]:
set(X_d.iloc[:, ind_taste].columns) - set(X_d.iloc[:, ind_overall].columns).union(
                                            set(X_d.iloc[:, ind_appear].columns),
                                           set(X_d.iloc[:, ind_aroma].columns),
                                           set(X_d.iloc[:, ind_palate].columns),)

{'mday__4.0', 'mon__1.0', 'yr__2009.0'}

In [76]:
set(X_d.iloc[:, ind_appear].columns) - set(X_d.iloc[:, ind_overall].columns).union(
                                           set(X_d.iloc[:, ind_aroma].columns),
                                           set(X_d.iloc[:, ind_palate].columns),
                                           set(X_d.iloc[:, ind_taste].columns))

{'Founders Black Rye', 'mday__25.0', 'mday__28.0', 'mon__12.0'}

In [39]:
set(X_d.iloc[:, ind_appear].columns).intersection(set(X_d.iloc[:, ind_overall].columns),
                                           set(X_d.iloc[:, ind_aroma].columns),
                                           set(X_d.iloc[:, ind_palate].columns),
                                           set(X_d.iloc[:, ind_taste].columns))

{'0',
 '1',
 '3',
 '4',
 '5 Day IPA',
 'Caldera IPA',
 'Founders Breakfast Stout',
 'Founders KBS (Kentucky Breakfast Stout)',
 'Founders Porter',
 "Founders Red's Rye PA",
 'Founders RÃ¼bÃ¦us',
 'Okocim Porter',
 'brwid__1075',
 'brwid__1199',
 'brwid__14',
 'brwid__14879',
 'brwid__16386',
 'brwid__263',
 'brwid__394',
 'mday__20.0'}

In [40]:
len(set(X_d.iloc[:, ind_taste].columns).intersection(set(X_d.iloc[:, ind_palate].columns)))

37

In [48]:
len(set(X_d.iloc[:, ind_overall].columns).intersection(set(X_d.iloc[:, ind_palate].columns)))

48

In [49]:
len(set(X_d.iloc[:, ind_overall].columns).intersection(set(X_d.iloc[:, ind_taste].columns)))

44

In [50]:
len(set(X_d.iloc[:, ind_aroma].columns).intersection(set(X_d.iloc[:, ind_taste].columns)))

31

In [54]:
len(set(X_d.iloc[:, ind_appear].columns).intersection(set(X_d.iloc[:, ind_aroma].columns)))

33

In [1]:
X_overall = X[: , ind_overall]

NameError: name 'X' is not defined

In [11]:
# Select feature columns for each dataset
val_overall = [x for i, x in enumerate(model_overa.coef_) if x > 0]
val_appear = [x for i, x in enumerate(model_appear.coef_) if x > 0]
val_aroma = [x for i, x in enumerate(model_aroma.coef_) if x > 0]
val_palate = [x for i, x in enumerate(model_palate.coef_) if x > 0]
val_taste = [x for i, x in enumerate(model_taste.coef_) if x > 0]

In [13]:
# Select the columns for each variable
c_overall = X_d.columns[ind_overall]
c_appear = X_d.columns[ind_appear]
c_aroma = X_d.columns[ind_aroma]
c_palate = X_d.columns[ind_palate]
c_taste = X_d.columns[ind_taste]

In [21]:
s_palate = [[a, b] for a,b in zip(c_palate, val_palate)]
sorted(s_palate, key = lambda x: x[1], reverse=True)

[['4', 0.20135295592135088],
 ['0', 0.1829140470448018],
 ['B.O.R.I.S. The Crusher Oatmeal-Imperial Stout', 0.13550272469162847],
 ['brwid__14', 0.13162676095766418],
 ['brwid__1454', 0.1243205088187188],
 ['brwid__263', 0.12410224477741939],
 ['2', 0.11046050708099424],
 ['1', 0.1100914976733267],
 ['3', 0.10662886709842426],
 ['Founders Breakfast Stout', 0.10385133447832097],
 ['Founders Imperial Stout', 0.10260235982677376],
 ['brwid__1199', 0.1013157337890763],
 ['Barrel Aged B.O.R.I.S. Oatmeal Imperial Stout', 0.097218268499220536],
 ['Pilsner Urquell', 0.093368229101479405],
 ['Okocim Porter', 0.08379299933667568],
 ['Founders RÃ¼bÃ¦us', 0.074137019363073708],
 ['Founders KBS (Kentucky Breakfast Stout)', 0.073699862645055486],
 ['D.O.R.I.S. The Destroyer Double Imperial Stout', 0.073093047035445904],
 ['Founders CBS Imperial Stout', 0.065309870920728999],
 ['brwid__394', 0.062650854421503338],
 ['5 Day IPA', 0.054425318882469119],
 ['brwid__14879', 0.051313155377314908],
 ['Found