In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
WHITE_WINE="white-wine.txt"
RED_WINE="red-wine.txt"

# Load both white and red whine datasets

In [3]:
white_df = pd.read_csv(WHITE_WINE, sep=";")

In [4]:
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
red_df = pd.read_csv(RED_WINE, sep=";")

In [6]:
red_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
white_df.shape

(4898, 12)

In [8]:
red_df.shape

(1599, 12)

In [9]:
white_df.rename(columns={'sulphates':'sulphites'}, inplace=True)
red_df.rename(columns={'sulphates':'sulphites'}, inplace=True)

# Add an extra column to both datasets

In [10]:
white_df["is white"] = 1

In [11]:
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphites,alcohol,quality,is white
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,1
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,1
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1


In [12]:
red_df["is white"] = 0

In [13]:
red_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphites,alcohol,quality,is white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


# Rename columns on both datasets

In [14]:
white_df.columns = white_df.columns.str.strip().str.replace(' ', '_')

In [15]:
red_df.columns = red_df.columns.str.strip().str.replace(' ', '_')

# Concatenate vertically (i.e., stack vertically) the two datasets

In [16]:
whine_df = pd.concat([white_df, red_df], axis=0)

In [17]:
whine_df.shape

(6497, 13)

# Re-arrange columns

In [18]:
cols = whine_df.columns.tolist()

In [19]:
cols

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphites',
 'alcohol',
 'quality',
 'is_white']

In [20]:
cols = cols[:-2] + cols[-1:] + cols[-2:-1]

In [21]:
cols

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphites',
 'alcohol',
 'is_white',
 'quality']

In [22]:
whine_df = whine_df[cols]

In [23]:
whine_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphites,alcohol,is_white,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,1,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1,6


In [24]:
whine_df.shape

(6497, 13)

# Save the whole dataset to a CSV file

In [25]:
whine_df.to_csv("dataset.csv", sep=",", index=False)

# Reload the just created CSV dataset file

In [26]:
whine_df = pd.read_csv("dataset.csv", sep=",")

In [27]:
whine_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphites,alcohol,is_white,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,1,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1,6


# Extract the feature matrix ($X$) and the target vector ($y$)

In [28]:
X = whine_df.iloc[:, :-1].values
y = whine_df.iloc[:, -1].values

In [29]:
X.shape

(6497, 12)

In [30]:
y.shape

(6497,)

# Randomly split the whole dataset into training and test sets (80÷20)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [32]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5197, 12)
(5197,)
(1300, 12)
(1300,)


# Re-create the dataframe from the training set

In [33]:
train_df = pd.DataFrame(data=np.column_stack((X_train, y_train)), columns=whine_df.columns)

In [34]:
train_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphites,alcohol,is_white,quality
0,8.5,0.655,0.49,6.1,0.122,34.0,151.0,1.001,3.31,1.14,9.3,0.0,5.0
1,6.7,0.47,0.29,4.75,0.034,29.0,134.0,0.99056,3.29,0.46,13.0,1.0,7.0
2,5.8,0.19,0.24,1.3,0.044,38.0,128.0,0.99362,3.77,0.6,10.6,1.0,5.0
3,6.5,0.2,0.5,18.1,0.054,50.0,221.0,0.99941,2.94,0.64,8.8,1.0,6.0
4,5.6,0.605,0.05,2.4,0.073,19.0,25.0,0.99258,3.56,0.55,12.9,0.0,5.0


# Re-create the dataframe from the test set

In [35]:
test_df = pd.DataFrame(data=np.column_stack((X_test, y_test)), columns=whine_df.columns)

In [36]:
test_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphites,alcohol,is_white,quality
0,7.2,0.37,0.32,2.0,0.062,15.0,28.0,0.9947,3.23,0.73,11.3,0.0,7.0
1,7.3,0.36,0.34,14.8,0.057,46.0,173.0,0.99751,3.14,0.57,10.2,1.0,5.0
2,6.8,0.21,0.27,18.15,0.042,41.0,146.0,1.0001,3.3,0.36,8.7,1.0,5.0
3,5.7,0.46,0.46,1.4,0.04,31.0,169.0,0.9932,3.13,0.47,8.8,1.0,5.0
4,6.1,0.37,0.46,12.0,0.042,61.0,210.0,0.997,3.17,0.59,9.7,1.0,6.0


# Save both the training set and the test set

In [37]:
train_df.to_csv("train.csv", sep=",", index=False)

In [38]:
test_df.to_csv("test.csv", sep=",", index=False)

# Load training set

In [39]:
train = pd.read_csv("train.csv", sep=",")

# Load test set

In [40]:
test = pd.read_csv("test.csv", sep=",")

In [63]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [65]:
clf = RandomForestClassifier(n_estimators=5000, min_samples_leaf=10, verbose=1)
regr = RandomForestRegressor(n_estimators=5000, min_samples_leaf=10, verbose=1)

In [66]:
X_train = train.iloc[:, :-1].values
y_train = train.iloc[:, -1].values
X_test = test.iloc[:, :-1].values
y_test = test.iloc[:, -1].values

In [67]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:   21.5s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [68]:
regr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:  1.1min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=1,
           oob_score=False, random_state=None, verbose=1, warm_start=False)

In [74]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, r2_score, mean_squared_error

In [75]:
print("Accuracy (train): {:.5f}".format(accuracy_score(y_train, clf.predict(X_train))))
print("Accuracy (test): {:.5f}".format(accuracy_score(y_test, clf.predict(X_test))))
print("MCC (train): {:.5f}".format(matthews_corrcoef(y_train, clf.predict(X_train))))
print("MCC (test): {:.5f}".format(matthews_corrcoef(y_test, clf.predict(X_test))))

[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    2.9s finished


Accuracy (train): 0.74466


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    0.9s finished


Accuracy (test): 0.61308


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    2.8s finished


MCC (train): 0.60618
MCC (test): 0.39117


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    1.1s finished


In [77]:
print("R2 (train): {:.5f}".format(r2_score(y_train, regr.predict(X_train))))
print("R2 (test): {:.5f}".format(r2_score(y_test, regr.predict(X_test))))
print("neg-MSE (train): {:.5f}".format(1 - mean_squared_error(y_train, regr.predict(X_train))))
print("neg-MSE (test): {:.5f}".format(1 - mean_squared_error(y_test, regr.predict(X_test))))

[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    2.0s finished


R2 (train): 0.64753


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    0.6s finished


R2 (test): 0.45035


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    1.9s finished


neg-MSE (train): 0.73135
neg-MSE (test): 0.58035


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    0.6s finished


In [78]:
regr.feature_importances_

array([ 0.03871376,  0.16031849,  0.04118087,  0.06200637,  0.04570053,
        0.07637864,  0.0546349 ,  0.04279291,  0.04460858,  0.07128434,
        0.36116448,  0.00121612])

In [81]:
from operator import itemgetter

ranked_features = sorted([(i,x) for i,x in enumerate(regr.feature_importances_)], key=itemgetter(1), reverse=True)

In [84]:
top_5_features = [train.columns.tolist()[i] for i,x in ranked_features[:5]]

In [85]:
top_5_features

['alcohol',
 'volatile_acidity',
 'free_sulfur_dioxide',
 'sulphites',
 'residual_sugar']