In [21]:
import pandas as pd
from collections import Counter
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier

In [22]:
wdata = pd.read_csv('data/Dataset.csv')

In [32]:
def simplified_quality(q):
    
    if q <= 4:
        return 0
    elif q <= 6:
        return 1
    else:
        return 2

for style in 'red white'.split():
    
    X = wdata[wdata['style'] == style].drop(['style','quality'], axis=1)
    y = wdata[wdata['style'] == style]['quality']
    
    print(f'\n{style.upper()} wines: {len(X)} samples\n')
    
    print(f'ratings: {Counter(y).most_common()}')
    
    y = y.apply(simplified_quality)
    
    print(f'simplified ratings: {Counter(y).most_common()}')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=471)

    pipe = make_pipeline(Normalizer(), PCA(), RandomForestClassifier(n_estimators=100, 
                                                                     class_weight='balanced'))

    parameters = {'randomforestclassifier__n_estimators': [50, 200, 400],
                 'randomforestclassifier__max_depth': [None, 2, 3, 4]}

    clf = GridSearchCV(pipe, parameters)
    clf.fit(X_train, y_train)

    yh = clf.predict(X_test)

    print(classification_report(y_test, yh))

    ranked_features = sorted([(f, im) for f, im in zip(X_train.columns, 
                 clf.best_estimator_.named_steps['randomforestclassifier'].feature_importances_)],
       key=lambda x: x[1], reverse=True)

    print(pd.DataFrame.from_records(ranked_features, columns=['feature', 'weight']))


RED wines: 1599 samples

ratings: [(5, 681), (6, 638), (7, 199), (4, 53), (8, 18), (3, 10)]
simplified ratings: [(1, 1319), (2, 217), (0, 63)]


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.90      0.97      0.93       338
          2       0.69      0.52      0.59        46

avg / total       0.84      0.88      0.85       400

                 feature    weight
0    free_sulfur_dioxide  0.249560
1   total_sulfur_dioxide  0.095539
2                density  0.079756
3            citric_acid  0.078828
4                     pH  0.078718
5          fixed_acidity  0.075543
6                alcohol  0.073263
7       volatile_acidity  0.071252
8              sulphates  0.067746
9         residual_sugar  0.066736
10             chlorides  0.063059

WHITE wines: 4898 samples

ratings: [(6, 2198), (5, 1457), (7, 880), (8, 175), (4, 163), (3, 20), (9, 5)]
simplified ratings: [(1, 3655), (2, 1060), (0, 183)]
             precision    recall  f1-score   support

          0       1.00      0.14      0.24        36
          1       0.85      0.97      0.90   