In [6]:
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
%matplotlib inline

In [7]:
data = pd.read_csv('data/arabica_data_cleaned.csv')

In [8]:
data.columns

Index(['Unnamed: 0', 'Species', 'Owner', 'Country.of.Origin', 'Farm.Name',
       'Lot.Number', 'Mill', 'ICO.Number', 'Company', 'Altitude', 'Region',
       'Producer', 'Number.of.Bags', 'Bag.Weight', 'In.Country.Partner',
       'Harvest.Year', 'Grading.Date', 'Owner.1', 'Variety',
       'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
       'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
       'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
       'Color', 'Category.Two.Defects', 'Expiration', 'Certification.Body',
       'Certification.Address', 'Certification.Contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

In [9]:
data.corr()

Unnamed: 0.1,Unnamed: 0,Number.of.Bags,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Cupper.Points,Total.Cup.Points,Moisture,Category.One.Defects,Quakers,Category.Two.Defects,altitude_low_meters,altitude_high_meters,altitude_mean_meters
Unnamed: 0,1.0,-0.069971,-0.628312,-0.731901,-0.728914,-0.649118,-0.592405,-0.686378,-0.330497,-0.308601,-0.202125,-0.708756,-0.691876,0.171479,0.103586,-0.015794,0.218386,0.042046,0.040379,0.041215
Number.of.Bags,-0.069971,1.0,-0.012999,-0.008464,-0.012116,0.021173,0.014003,0.01931,0.021568,0.028308,0.007161,-0.013167,0.011137,-0.054036,-0.072411,0.136818,0.002088,-0.02636,-0.026617,-0.02649
Aroma,-0.628312,-0.012999,1.0,0.814304,0.777373,0.724834,0.696161,0.716963,0.36582,0.334816,0.327503,0.693358,0.797364,-0.132734,-0.074917,0.010748,-0.146716,-0.015838,-0.014909,-0.015374
Flavor,-0.731901,-0.008464,0.814304,1.0,0.895253,0.816705,0.761683,0.802548,0.408781,0.395862,0.361538,0.797055,0.877988,-0.145263,-0.05415,0.006416,-0.190085,-0.011229,-0.009952,-0.010591
Aftertaste,-0.728914,-0.012116,0.777373,0.895253,1.0,0.792382,0.7607,0.823344,0.399422,0.385777,0.342819,0.78782,0.866111,-0.178816,-0.074873,0.002818,-0.207997,-0.022559,-0.021025,-0.021793
Acidity,-0.649118,0.021173,0.724834,0.816705,0.792382,1.0,0.732584,0.742307,0.373236,0.299461,0.330911,0.700338,0.801064,-0.126572,-0.061101,-0.012366,-0.133217,0.001543,0.002811,0.002177
Body,-0.592405,0.014003,0.696161,0.761683,0.7607,0.732584,1.0,0.767547,0.338724,0.286748,0.326758,0.671092,0.776533,-0.173301,-0.01547,-0.004506,-0.090751,-0.014716,-0.013075,-0.013897
Balance,-0.686378,0.01931,0.716963,0.802548,0.823344,0.742307,0.767547,1.0,0.40454,0.374242,0.342861,0.741491,0.83645,-0.211059,-0.062517,0.008117,-0.172854,-0.014086,-0.012174,-0.013131
Uniformity,-0.330497,0.021568,0.36582,0.408781,0.399422,0.373236,0.338724,0.40454,1.0,0.525415,0.537956,0.358547,0.658138,0.014526,-0.117382,0.02807,-0.144358,-0.010102,-0.009346,-0.009725
Clean.Cup,-0.308601,0.028308,0.334816,0.395862,0.385777,0.299461,0.286748,0.374242,0.525415,1.0,0.525851,0.35704,0.661387,-0.013631,-0.13946,0.025954,-0.231669,-0.004238,-0.004154,-0.004196


In [10]:
my_lasso = LassoCV()
y = data["Moisture"]
X = data[["Aroma", "Flavor", "Aftertaste", "Acidity", "Body", "Balance", "Uniformity", "Sweetness"]]
sc = StandardScaler()

x_tr, x_te, y_tr, y_te = train_test_split(X,y, random_state=42)

sc.fit(x_tr)
x_tr = sc.transform(x_tr)
x_te = sc.transform(x_te)

In [11]:
my_lasso.fit(x_tr, y_tr)


LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [12]:
np.sqrt(np.mean((y_te - my_lasso.predict(x_te))**2))

0.048006110625965555

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

x_t_reg, x_test, y_t_reg, y_test = train_test_split(X,y, random_state=42)
reg = RandomForestRegressor(random_state=88)

params = {"max_depth":[2,3,4,5],  "min_impurity_decrease":[1e-7,1e-6,1e-5],  "n_estimators":[100,150,200,300]}

gs = GridSearchCV(reg,params, n_jobs=-1, scoring = "neg_mean_squared_error")
gs.fit(x_t_reg,y_t_reg)

sc.fit(x_t_reg)
x_t_reg = sc.transform(x_t_reg)
x_test = sc.transform(x_test)
reg.fit(x_t_reg,y_t_reg)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=88, verbose=0, warm_start=False)

In [20]:
np.sqrt(np.mean((y_test - reg.predict(x_test))**2))

0.05097439465124597

In [27]:
np.sqrt(abs(gs.best_score_))


0.045833712825992814

In [28]:
pd.DataFrame(gs.cv_results_).sort_values("rank_test_score")



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_min_impurity_decrease,param_n_estimators,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
26,0.287764,0.011363,-0.002101,-0.001729,4,1e-07,200,"{'max_depth': 4, 'min_impurity_decrease': 1e-0...",1,-0.002001,-0.001769,-0.002041,-0.001732,-0.002261,-0.001685,0.005904,0.000946,0.000114,3.4e-05
30,0.28977,0.0117,-0.002101,-0.001729,4,1e-06,200,"{'max_depth': 4, 'min_impurity_decrease': 1e-0...",2,-0.002001,-0.001769,-0.002041,-0.001732,-0.002261,-0.001685,0.005369,0.001703,0.000114,3.4e-05
34,0.288433,0.011698,-0.002102,-0.001735,4,1e-05,200,"{'max_depth': 4, 'min_impurity_decrease': 1e-0...",3,-0.002003,-0.001773,-0.002043,-0.001738,-0.00226,-0.001693,0.003692,0.000473,0.000113,3.3e-05
15,0.392711,0.016377,-0.002102,-0.001881,3,1e-07,300,"{'max_depth': 3, 'min_impurity_decrease': 1e-0...",4,-0.001993,-0.001935,-0.002044,-0.001891,-0.00227,-0.001818,0.008282,0.001251,0.000121,4.8e-05
19,0.399062,0.015707,-0.002102,-0.001881,3,1e-06,300,"{'max_depth': 3, 'min_impurity_decrease': 1e-0...",5,-0.001993,-0.001935,-0.002044,-0.001891,-0.002271,-0.001818,0.006703,0.000474,0.000121,4.8e-05
23,0.406415,0.015708,-0.002102,-0.001882,3,1e-05,300,"{'max_depth': 3, 'min_impurity_decrease': 1e-0...",6,-0.001992,-0.001935,-0.002045,-0.001892,-0.00227,-0.001819,0.003308,0.001251,0.00012,4.8e-05
22,0.276068,0.01337,-0.002102,-0.001882,3,1e-05,200,"{'max_depth': 3, 'min_impurity_decrease': 1e-0...",7,-0.001994,-0.001936,-0.002043,-0.001892,-0.00227,-0.001818,0.007607,0.002364,0.00012,4.9e-05
14,0.285426,0.013036,-0.002103,-0.001881,3,1e-07,200,"{'max_depth': 3, 'min_impurity_decrease': 1e-0...",8,-0.001995,-0.001936,-0.002043,-0.001891,-0.002271,-0.001817,0.004121,0.00082,0.00012,4.9e-05
18,0.275065,0.011029,-0.002103,-0.001881,3,1e-06,200,"{'max_depth': 3, 'min_impurity_decrease': 1e-0...",9,-0.001995,-0.001936,-0.002043,-0.001891,-0.002271,-0.001817,0.006668,0.001418,0.00012,4.9e-05
25,0.220587,0.008035,-0.002103,-0.001726,4,1e-07,150,"{'max_depth': 4, 'min_impurity_decrease': 1e-0...",10,-0.002001,-0.00177,-0.002045,-0.001727,-0.002263,-0.001682,0.005369,2e-05,0.000115,3.6e-05
