In [60]:
import pandas as pd
from collections import Counter
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier

In [61]:
wdata = pd.read_csv('data/Dataset.csv')

In [62]:
Counter(wdata['style'])

Counter({'red': 1599, 'white': 4898})

In [63]:
reds = wdata[wdata['style'] == 'red']
print(f'{len(reds)} samples red')
whites = wdata[wdata['style'] == 'white']
print(f'{len(whites)} samples whites')

1599 samples red
4898 samples whites


In [64]:
X = reds.drop(['style','quality'], axis=1)
y = reds['quality']

In [65]:
X.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [66]:
RANDOM_STATE = 35
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)

In [80]:
pipe = make_pipeline(Normalizer(), PCA(), RandomForestClassifier(n_estimators=100))

parameters = { 'randomforestclassifier__n_estimators':[50, 200, 400]}

clf = GridSearchCV(pipe, parameters)
clf.fit(X_train, y_train)

yh = clf.predict(X_test)

In [81]:
print(classification_report(y_test, yh))

             precision    recall  f1-score   support

          3       0.00      0.00      0.00         2
          4       0.00      0.00      0.00        17
          5       0.76      0.71      0.73       202
          6       0.61      0.77      0.68       189
          7       0.60      0.48      0.54        60
          8       1.00      0.10      0.18        10

avg / total       0.66      0.66      0.65       480



  'precision', 'predicted', average, warn_for)


In [82]:
ranked_features = sorted([(f, im) for f, im in zip(X_train.columns, 
                 clf.best_estimator_.named_steps['randomforestclassifier'].feature_importances_)],
       key=lambda x: x[1], reverse=True)

print(pd.DataFrame.from_records(ranked_features, columns=['feature', 'weight']))

                 feature    weight
0    free_sulfur_dioxide  0.170735
1          fixed_acidity  0.107551
2                density  0.088763
3            citric_acid  0.086219
4                     pH  0.082792
5       volatile_acidity  0.082006
6                alcohol  0.078099
7              sulphates  0.078004
8              chlorides  0.076266
9   total_sulfur_dioxide  0.075614
10        residual_sugar  0.073952
