In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import matplotlib.pyplot as plt

# Fix NumPy's global RNG so the random train/test mask drawn later is repeatable.
np.random.seed(1001)

# The data folder sits one level above the current working directory.
datadir = Path('.').resolve().parents[0].joinpath("data")
datadir



PosixPath('/home/mddevine/projects/ml-wine/data')

In [2]:
# Read the interim wine CSV, report its row count, then preview the first rows.
df = pd.read_csv(datadir.joinpath("interim", "wine_df_nice_cols.csv"))
print(f"{len(df)} observations.")
df.head()

178 observations.


Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_over_od315,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [17]:
# Flag roughly 75% of the rows as training data.
# The mask is drawn from a generator seeded inside this cell rather than from
# NumPy's global RNG state, so re-running the cell (or running cells out of
# order) always reproduces the same split instead of a new one each time.
# The seed matches the notebook-level `np.random.seed(1001)` call, so a fresh
# top-to-bottom run produces the same mask as the global-RNG version did.
split_rng = np.random.RandomState(1001)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= .75
df['is_train'].value_counts()

True     142
False     36
Name: is_train, dtype: int64

In [18]:
# Partition the rows on the boolean `is_train` flag; each side is copied so that
# columns added to one partition later do not write back into `df`.
train = df.loc[df['is_train']].copy()
test = df.loc[~df['is_train']].copy()
print(f"train = {len(train)}, test = {len(test)}")

train = 142, test = 36


In [19]:
# features == all columns except the class (our target), and `is_train`.
# Select them by name rather than by position so the feature set stays correct
# if the column order changes or more columns are added to `df`; a missing
# label raises KeyError instead of silently selecting the wrong columns.
features = df.columns.drop(['class', 'is_train'])
features

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue', 'od280_over_od315',
       'proline'],
      dtype='object')

In [20]:
# Training target: the `class` column of the training rows.
y = train.loc[:, 'class']
print(f"dtype = {y.dtype}")
y.value_counts()

dtype = int64


2    55
1    48
3    39
Name: class, dtype: int64

In [21]:
# Build a random forest with a fixed seed (so the fit is repeatable) that may
# use two parallel jobs, then train it on the feature columns of the training rows.
clf = RandomForestClassifier(
    random_state=1003,
    n_jobs=2,
)
clf.fit(train.loc[:, features], y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=1003, verbose=0,
                       warm_start=False)

In [22]:
# Hard class predictions for every held-out row.
predictions = clf.predict(test.loc[:, features])

# Per-class probabilities for the first 10 held-out rows.
clf.predict_proba(test.loc[:, features])[:10]

array([[0.99, 0.01, 0.  ],
       [0.99, 0.01, 0.  ],
       [0.94, 0.02, 0.04],
       [0.98, 0.01, 0.01],
       [0.96, 0.03, 0.01],
       [0.98, 0.01, 0.01],
       [0.89, 0.07, 0.04],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [0.99, 0.01, 0.  ]])

In [23]:
# Confusion matrix: actual classes as rows, predicted classes as columns.
pd.crosstab(
    index=test['class'],
    columns=predictions,
    rownames=['Actual Wine'],
    colnames=['Predicted Wine'],
)

Predicted Wine,1,2,3
Actual Wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11,0,0
2,0,15,1
3,0,0,9


In [24]:
def check_correct(x):
    """Return 1 when the row's `prediction` equals its `class`, otherwise 0.

    `x` is any mapping-like row (for example a DataFrame row) that can be
    indexed by the keys 'prediction' and 'class'.
    """
    return 1 if x['prediction'] == x['class'] else 0


# Attach the predictions and score each row with a vectorized comparison
# (1 = correct, 0 = wrong). This yields the same values as applying
# `check_correct` row by row, but avoids a Python-level call per row and
# also works when `test` has no rows.
test['prediction'] = predictions
test['correct'] = (test['prediction'] == test['class']).astype('int64')
print(f"accuracy = {round(test['correct'].mean(), 2)}")
test.head()

accuracy = 0.97


Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_over_od315,proline,is_train,prediction,correct
7,1,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295,False,1,1
14,1,14.38,1.87,2.38,12.0,102,3.3,3.64,0.29,2.96,7.5,1.2,3.0,1547,False,1,1
17,1,13.83,1.57,2.62,20.0,115,2.95,3.4,0.4,1.72,6.6,1.13,2.57,1130,False,1,1
19,1,13.64,3.1,2.56,15.2,116,2.7,3.03,0.17,1.66,5.1,0.96,3.36,845,False,1,1
20,1,14.06,1.63,2.28,16.0,126,3.0,3.17,0.24,2.1,5.65,1.09,3.71,780,False,1,1


In [25]:
# Pair each feature name with the importance score the fitted forest assigned to it.
list(zip(features, clf.feature_importances_))

[('alcohol', 0.11573753929328291),
 ('malic_acid', 0.0344193496981277),
 ('ash', 0.020000397181156124),
 ('alcalinity_of_ash', 0.021893152983244388),
 ('magnesium', 0.034272450406577976),
 ('total_phenols', 0.059774410638180064),
 ('flavanoids', 0.1341931480517083),
 ('nonflavanoid_phenols', 0.01265683585127705),
 ('proanthocyanins', 0.018369219806036875),
 ('color_intensity', 0.16281628872584222),
 ('hue', 0.06673558936324626),
 ('od280_over_od315', 0.1382852067027089),
 ('proline', 0.18084641129861118)]