In [12]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, roc_curve, auc, mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
%config IPCompleter.greedy=True
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, RidgeCV, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, RandomForestRegressor
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
import scipy
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize': (12, 6)})
import warnings
warnings.filterwarnings('ignore')

In [4]:
from functools import reduce

texts = [['i', 'have', 'a', 'cat'],
         ['he', 'have', 'a', 'dog'],
         ['he', 'and', 'i', 'have', 'a', 'cat', 'and', 'a', 'dog']]

dictionary = list(enumerate(set(reduce(lambda x, y: x + y, texts))))
print(dictionary)
def vectorize(text):
    vector = np.zeros(len(dictionary))
    for i, word in dictionary:
        num = 0
        for w in text:
            if w == word:
                num += 1
        if num:
            vector[i] = num
    return vector

for t in texts:
    print(vectorize(t))

[(0, 'cat'), (1, 'dog'), (2, 'have'), (3, 'and'), (4, 'i'), (5, 'he'), (6, 'a')]
[1. 0. 1. 0. 1. 0. 1.]
[0. 1. 1. 0. 0. 1. 1.]
[1. 1. 1. 2. 1. 1. 2.]


In [5]:
##########################################################################################################

In [6]:
df = pd.read_csv("../files/winequality-white.csv")

In [7]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [11]:
df.quality.unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [13]:
X, y = df.drop("quality", axis=1), df["quality"]

X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=.7, random_state=17)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)

In [26]:
linreg = LinearRegression()
lasso = Lasso(alpha=0.01, random_state=17)
linreg.fit(X_train_scaled, y_train)

LinearRegression()

In [25]:
Lasso?

In [28]:
print("Mean squared error (train): %s" %mean_squared_error(y_train, linreg.predict(X_train_scaled)))
print("Mean squared error (test): %s" %mean_squared_error(y_holdout, linreg.predict(X_holdout_scaled)))

Mean squared error (train): 0.5321779540981326
Mean squared error (test): 0.5835198458691795


In [21]:
pd.DataFrame(linreg.coef_, df.columns[:-1], columns=["coef"]).sort_values(by="coef",
                                                                          ascending=False, key=abs)

Unnamed: 0,coef
density,-0.674179
residual sugar,0.560499
volatile acidity,-0.192491
pH,0.151993
alcohol,0.140476
fixed acidity,0.100363
sulphates,0.065973
total sulfur dioxide,0.033827
citric acid,0.013249
chlorides,0.011563


In [22]:
def train_validate_report(model, X_train_scaled, y_train, X_valid_scaled,
                          y_valid, feature_names, forest=False):
    model.fit(X_train_scaled, y_train)
    print(f"MSE = {mean_squared_error(y_valid, model.predict(X_valid_scaled))}")
    print("Model coefficients:")
    coef = model.feature_importances_ if forest else model.coef_
    coef_name = "Importance" if forest else "Coef"
    print(pd.DataFrame(coef, feature_names, columns=[coef_name])
          .sort_values(by=coef_name, ascending=False))

In [27]:
train_validate_report(lasso, X_train_scaled, y_train, X_holdout_scaled,
                          y_holdout, X.columns, forest=False)

MSE = 0.5833770351564992
Model coefficients:
                          Coef
alcohol               0.343034
residual sugar        0.251695
pH                    0.063068
sulphates             0.032678
free sulfur dioxide   0.001119
fixed acidity        -0.000000
citric acid           0.000000
chlorides            -0.000000
total sulfur dioxide  0.000000
volatile acidity     -0.189060
density              -0.202199


In [31]:
lasso_cv = LassoCV(alphas=np.logspace(-6, 2, 200))
lasso_cv.fit(X_train_scaled, y_train)

LassoCV(alphas=array([1.00000000e-06, 1.09698580e-06, 1.20337784e-06, 1.32008840e-06,
       1.44811823e-06, 1.58856513e-06, 1.74263339e-06, 1.91164408e-06,
       2.09704640e-06, 2.30043012e-06, 2.52353917e-06, 2.76828663e-06,
       3.03677112e-06, 3.33129479e-06, 3.65438307e-06, 4.00880633e-06,
       4.39760361e-06, 4.82410870e-06, 5.29197874e-06, 5.80522552e-06,
       6.36824994e-06, 6.98587975e-0...
       1.18953407e+01, 1.30490198e+01, 1.43145894e+01, 1.57029012e+01,
       1.72258597e+01, 1.88965234e+01, 2.07292178e+01, 2.27396575e+01,
       2.49450814e+01, 2.73644000e+01, 3.00183581e+01, 3.29297126e+01,
       3.61234270e+01, 3.96268864e+01, 4.34701316e+01, 4.76861170e+01,
       5.23109931e+01, 5.73844165e+01, 6.29498899e+01, 6.90551352e+01,
       7.57525026e+01, 8.30994195e+01, 9.11588830e+01, 1.00000000e+02]))

In [33]:
lasso_cv.alpha_

0.0014992684327860455

In [34]:
pd.DataFrame(lasso_cv.coef_, df.columns[:-1], columns=["coef"]).sort_values(by="coef",
                                                                          ascending=False, key=abs)

Unnamed: 0,coef
density,-0.576951
residual sugar,0.497112
volatile acidity,-0.191091
alcohol,0.181289
pH,0.133112
fixed acidity,0.077289
sulphates,0.059832
total sulfur dioxide,0.023578
citric acid,0.011606
chlorides,0.005437


In [36]:
print("Mean squared error (train): %s" %mean_squared_error(y_train, lasso_cv.predict(X_train_scaled)))
print("Mean squared error (test): %s" %mean_squared_error(y_holdout, lasso_cv.predict(X_holdout_scaled)))

Mean squared error (train): 0.5324734177725579
Mean squared error (test): 0.581263236441152


In [52]:
forest = RandomForestRegressor(random_state=17)
forest.fit(X_train_scaled, y_train)

RandomForestRegressor(random_state=17)

In [66]:
X_train_scaled.shape

(1469, 11)

In [74]:
print("Mean squared error (train): %s" %mean_squared_error(y_train, forest.predict(X_train_scaled)))
print("Mean squared error (cv): %s" %np.mean(cross_val_score(forest,
                                                     pd.DataFrame(X_train_scaled), y_train,
                                                       scoring="neg_mean_squared_error")))
print("Mean squared error (test): %s" %mean_squared_error(y_holdout, forest.predict(X_holdout_scaled)))

Mean squared error (train): 0.0618717494894486
Mean squared error (cv): -0.4518899380093334
Mean squared error (test): 0.4884576844561097


In [75]:
forest_params = {'max_depth': list(range(10, 25)), 
                 'min_samples_leaf': list(range(1, 8)),
                 'max_features': list(range(6,12))}

locally_best_forest = GridSearchCV(forest, forest_params, n_jobs=-1)
locally_best_forest.fit(X_train_scaled, y_train)

GridSearchCV(estimator=RandomForestRegressor(random_state=17), n_jobs=-1,
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                       20, 21, 22, 23, 24],
                         'max_features': [6, 7, 8, 9, 10, 11],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7]})

In [76]:
locally_best_forest.best_params_, locally_best_forest.best_score_

({'max_depth': 18, 'max_features': 6, 'min_samples_leaf': 1},
 0.41657886443444375)

In [78]:
best_task_forest = RandomForestRegressor(max_depth=19, max_features=7, min_samples_leaf=1)
best_task_forest.fit(X_train_scaled, y_train)

RandomForestRegressor(max_depth=19, max_features=7)

In [79]:
print("Mean squared error (train): %s" %mean_squared_error(y_train,
                                                           best_task_forest.predict(X_train_scaled)))
print("Mean squared error (cv): %s" %np.mean(cross_val_score(best_task_forest,
                                                     pd.DataFrame(X_train_scaled), y_train,
                                                       scoring="neg_mean_squared_error")))
print("Mean squared error (test): %s" %mean_squared_error(y_holdout,
                                                          best_task_forest.predict(X_holdout_scaled)))

Mean squared error (train): 0.062346627537603345
Mean squared error (cv): -0.45286932497789056
Mean squared error (test): 0.47391037234613004


In [81]:
pd.DataFrame(best_task_forest.feature_importances_, df.columns[:-1],
             columns=["coef"]).sort_values(by="coef",
                                            ascending=False, key=abs)

Unnamed: 0,coef
alcohol,0.217793
volatile acidity,0.112733
free sulfur dioxide,0.100427
density,0.093324
total sulfur dioxide,0.079202
pH,0.073412
residual sugar,0.073345
chlorides,0.06925
fixed acidity,0.062779
sulphates,0.06059
