# Machine Learning Code for Wine Quality Project

## Load in appropriate libraries

In [15]:
# load in Pandas library for data manipulation and conversion of CSV data into dataframe
import pandas as pd


## Original CSV dataset was obtained from Kaggle.  For the purposes of this project, the dataset was reduced from its original size to fit the training portion of the program. No further modifications were made to this initial dataset

In [16]:
# Path to the training data CSV
file_path = 'winequality_training_data2.csv'

# Parse the CSV into the training DataFrame
wine_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick check that the file loaded properly
wine_df.head(n=5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.2,0.45,0.26,4.4,0.063,63.0,206.0,0.994,3.27,0.52,9.8,4
1,white,9.8,0.36,0.46,10.5,,4.0,83.0,0.9956,2.89,0.3,10.1,4
2,white,5.5,0.485,0.0,1.5,0.065,8.0,103.0,0.994,3.63,0.4,9.7,4
3,white,6.4,0.595,0.14,5.2,0.058,15.0,97.0,0.9951,3.38,0.36,9.0,4
4,white,7.6,0.48,0.37,0.8,0.037,4.0,100.0,0.9902,3.03,0.39,11.4,4


## Clean up csv file using Python commands.  When project is fully in production, this portion of the data cleanup will actually be completed in the database portion of the project

In [17]:
# Drop columns that are entirely null first. If an all-null column were still
# present when the rows are filtered, every row would contain a null and the
# row-wise dropna below would remove the whole dataset.
wine_df = wine_df.dropna(axis='columns', how='all')

# Then drop any remaining row that has a null in at least one column
wine_df = wine_df.dropna()

## Import the train_test_split function to split the data into training and testing sets
## Create features to be used by model

In [18]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Build the feature matrix: one-hot encode the categorical 'type' column and
# remove the target column so it is not used as a feature.
# get_dummies returns a new DataFrame, so wine_df itself is left unchanged.
X = pd.get_dummies(wine_df, columns=['type']).drop(columns='quality')

# Target: the quality score of each wine
y = wine_df['quality']

# Split into training and testing sets. stratify=y keeps the proportion of
# each quality score the same in both sets; the fixed random_state makes the
# split reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

## Multiple potential models were attempted in order to identify the one that provided the most accurate results and could produce the results in a way that fit with the plan for visualization and the ultimate goal of being able to predict wine quality based on certain categories.  Ultimately, RandomForestClassifier produced the most desirable results.

In [19]:
#import RandomForestClassifier to create training model
from sklearn.ensemble import RandomForestClassifier

# Create and fit the model. The fixed random_state seeds the randomness used
# while building the forest, so repeated runs on the same training data give
# the same model, feature importances and predictions.
rf_model = RandomForestClassifier(random_state=42)
rf_model = rf_model.fit(X_train, y_train)

## Determine the importance of features for inclusion in visualization and for use in future quality determinations/modifications.  Create output file of importances for use in data visualization

In [20]:
# Importance score of each feature, in the same order as the columns of X
importances = rf_model.feature_importances_

# Pair each score with its column name and list the pairs from highest to lowest score
ranked_importances = list(zip(importances, X.columns))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.12064402752418592, 'alcohol'),
 (0.10417632595704797, 'sulphates'),
 (0.09896278413748126, 'volatile acidity'),
 (0.09580986375607455, 'fixed acidity'),
 (0.09156154933449392, 'density'),
 (0.08956027567415381, 'pH'),
 (0.08531586276268083, 'free sulfur dioxide'),
 (0.08218526982721545, 'chlorides'),
 (0.07991758711871004, 'residual sugar'),
 (0.07656047769093484, 'total sulfur dioxide'),
 (0.060949860614774726, 'citric acid'),
 (0.009417867271826414, 'type_white'),
 (0.004938248330420328, 'type_red')]

In [41]:
# Build a single-column DataFrame holding the importance scores
importances_df = pd.DataFrame({'Importance_Score': importances})

In [42]:
# Keep the feature names (the column labels of X) so they can be paired with the importance scores
categories = X.columns

In [43]:
# Add the feature names as the first column of the importances DataFrame
importances_df.insert(0, 'Categories', categories)

In [44]:
# Sort by importance score, highest first. The sorted frame is assigned back
# to importances_df so that the preview below and the later CSV export both
# use the sorted order.
importances_df = importances_df.sort_values(by='Importance_Score', ascending=False)
importances_df.head()

Unnamed: 0,Categories,Importance_Score
10,alcohol,0.120644
9,sulphates,0.104176
1,volatile acidity,0.098963
0,fixed acidity,0.09581
7,density,0.091562


In [45]:
# Export the importances for data visualization.
# index=False leaves the DataFrame index out of the file, so the CSV holds only
# the 'Categories' and 'Importance_Score' columns.
importances_df.to_csv('importance_weights.csv', index=False)

## Test model against dataset that does not have quality scores already calculated

In [7]:
# Path to the sample set of wines that have no quality score yet
file_path = 'winequality-sample-set.csv'

# Load the sample set into its own DataFrame
test_wine_df = pd.read_csv(filepath_or_buffer=file_path)

In [8]:
# Work on a copy so test_wine_df keeps the data exactly as it was loaded
X_sample_set = test_wine_df.copy(deep=True)

# Preview the first five rows of the sample set
X_sample_set.head(n=5)

Unnamed: 0,wine_id,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,W-001,white,7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8
1,W-002,white,6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5
2,W-003,white,8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1
3,W-004,white,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9
4,W-005,white,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9


In [9]:
# Keep the wine_id values for identification later; the identifier is not
# used by the model.
# NOTE(review): this rebinds `y`, which held the training target ('quality')
# earlier in the file.
y = X_sample_set['wine_id']

# Remove the identifier column from the sample features
X_sample_set = X_sample_set.drop(columns=['wine_id'])

In [10]:
# One-hot encode 'type' the same way the training features were encoded
X_sample_set = pd.get_dummies(X_sample_set, columns=['type'])

# get_dummies only creates a column for each wine type present in this file,
# so a sample set containing a single type would lack one of the 'type_*'
# columns that the training features (X) contain. Add any missing indicator
# column filled with zeros, meaning "this wine is not of that type".
for column in X.columns:
    if column.startswith('type_') and column not in X_sample_set.columns:
        X_sample_set[column] = 0

# Select the columns in the same order as the training features. A KeyError
# here means the sample file is missing one of the feature columns.
X_sample_set = X_sample_set[X.columns]


In [11]:
#calculate predicted wine quality for sample set
# rf_model is the random forest fitted on the training split earlier in the
# file; the result is added below as the 'quality' column of the final set.
predictions = rf_model.predict(X_sample_set)

In [12]:
# Build the final table on a copy of the sample features. Binding final_set
# directly to X_sample_set would add the columns below to the model's input
# frame as well, so re-running the prediction cell would pass the extra
# 'quality' and 'wine_id' columns to the model.
final_set = X_sample_set.copy()

# Add the predicted quality as the last column
final_set['quality'] = predictions

# Put the wine_id values saved earlier (in `y`) back as the first column
final_set.insert(0, 'wine_id', y)

# Show the first rows of the final set
final_set.head()

Unnamed: 0,wine_id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type_red,type_white,quality
0,W-001,7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,0,1,6
1,W-002,6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,0,1,4
2,W-003,8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,0,1,5
3,W-004,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,0,1,5
4,W-005,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,0,1,5


In [14]:
# Export the final set (wine_id, features and predicted quality) to CSV;
# index=False leaves the DataFrame index out of the file.
final_set.to_csv('wine_outcome.csv', index=False)