Tutorial from

https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn#step-2

#### Import the relevant libraries

In [None]:
#efficient numerical computation
import numpy as np
#easy dataframe handling
import pandas as pd

#preprocessing: scaling, transforming and wrangling
from sklearn import preprocessing

#sampling helper and model performance evaluator
from sklearn.model_selection import train_test_split, GridSearchCV

#cross-validation help
from sklearn.pipeline import Pipeline, make_pipeline

#metrics for evaluating model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#To save our model for future use
from sklearn.externals import joblib #like pickle but more efficient for larger np arrays

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# The Data

http://archive.ics.uci.edu/ml/datasets/Wine

to see how we can maximise red wine quality

In [None]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')
data.head()

#### Our task is to predict wine quality using the following attributes: 
'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates' and 'alcohol'

# Exploratory Data analysis

In [None]:
y = data.quality     #targets column
X = data.drop('quality', axis=1)  #features 

print('\nOur data has %d observations and %d features' %(X.shape[0], X.shape[1]))
display(data.head())

print('\nThere are', y.nunique(), 'Unique values for quality, namely:', sorted(y.unique())) 


In [None]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')
data.head()

y = data.quality     #targets column
X = data.drop('quality', axis=1)  #features 

print('\nOur data has %d observations and %d features' %(X.shape[0], X.shape[1]))
display(data.head())

print('\nThere are', y.nunique(), 'Unique values for quality, namely:', sorted(y.unique())) 


### Let's visualise our data

Let's say wines with quality $>= 7$ are good quality, 

$5 \leq$ quality $< 6 \implies$ average quality 

and quality $< 5$ is bad quality 

In [None]:
vgq = data[y>6]  #top tier quality
aq = data[(y>=5) & (y<=6)]  #average quality
bq = data[y<5]   #bad quality

print('%.2f %% of the wines are of top tier quality' %(100*len(vgq)/len(y)))
print('%.2f %% of the wines are of average quality' %(100*len(aq)/len(y)))
print('%.2f %% of the wines are below average quality' %(100*len(bq)/len(y)))

In [None]:
sns.distplot(y, norm_hist=False, kde=False)
plt.title('Wine quality feature distribution')
plt.ylabel('Number of observations')
plt.show()

Most wines are of average quality, less than half are above average while even fewer are below average

Let's separate the wines of quality $\geq 7$ from the rest of the wines. 
We will be building classifiers for the very good wine quality only.

## Correlations between the features

In [None]:
#pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (40,40), diagonal = 'kde');



In [None]:
correlation = data.corr()
plt.figure(figsize=(14,12))
plt.title('Correlation')
#sns.heatmap(correlation, annot=True, linewidths=0, vmin=-1, cmap="RdBu_r")
#plt.show()

Red squares: positive correlation, increase in one $\implies$ increase in the other

Blue squares: negative correlation, increase in one $\implies$ decrease in the other

# Preprocessing
## Split data into training and test sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                   random_state=0,
                                                   stratify=y)
# 5: Declare data processing steps

#columns with missing data
print(data.isnull().any())
#cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
#No missing data

#Categorical values?
cat = [cname for cname in X.columns if X[cname].dtype == 'object']
print('categorical columns\n', cat)
#no categorical variables

#numerical data?
num = [cname for cname in X.columns if X[cname].dtype in ['int64','float64']]
print(num == X.columns)
#All columns contain numerical data

#preprocessing will entail standardizing

In [None]:
y = data.quality     #targets column
X = data.drop('quality', axis=1)  #features 

print('\nOur data has %d observations and %d features' %(X.shape[0], X.shape[1]))
display(data.head())

print('\nThere are', y.nunique(), 'Unique values for quality, namely:', sorted(y.unique())) 

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                   random_state=0,
                                                   stratify=y)
# 5: Declare data processing steps

#columns with missing data
print(data.isnull().any())
#cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
#No missing data

#Categorical values?
cat = [cname for cname in X.columns if X[cname].dtype == 'object']
print('categorical columns\n', cat)
#no categorical variables

#numerical data?
num = [cname for cname in X.columns if X[cname].dtype in ['int64','float64']]
print(num == X.columns)
#All columns contain numerical data

#preprocessing will entail standardizing

* All the features are numeric (convenient!)

* They have different scales (We should standardize them later)

# Models

### Create RandomForestRegressor pipeline

In [None]:
y = data.quality     #targets column
X = data.drop('quality', axis=1)  #features 

print('\nOur data has %d observations and %d features' %(X.shape[0], X.shape[1]))
display(data.head())

print('\nThere are', y.nunique(), 'Unique values for quality, namely:', sorted(y.unique())) 

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                   random_state=0,
                                                   stratify=y)
# 5: Declare data processing steps

#columns with missing data
print(data.isnull().any())
#cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
#No missing data

#Categorical values?
cat = [cname for cname in X.columns if X[cname].dtype == 'object']
print('categorical columns\n', cat)
#no categorical variables

#numerical data?
num = [cname for cname in X.columns if X[cname].dtype in ['int64','float64']]
print(num == X.columns)
#All columns contain numerical data

#preprocessing will entail standardizing

### Let's try tpot

ref: https://github.com/srivatsan88/TPOT/blob/master/TPOT.ipynb

In [2]:
#efficient numerical computation
import numpy as np
#easy dataframe handling
import pandas as pd

#preprocessing: scaling, transforming and wrangling
from sklearn import preprocessing

#sampling helper and model performance evaluator
from sklearn.model_selection import train_test_split, GridSearchCV

#cross-validation help
from sklearn.pipeline import Pipeline, make_pipeline

#metrics for evaluating model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#To save our model for future use
from sklearn.externals import joblib #like pickle but more efficient for larger np arrays

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')
data.head()
y = data.quality     #targets column
X = data.drop('quality', axis=1)  #features 

print('\nOur data has %d observations and %d features' %(X.shape[0], X.shape[1]))
display(data.head())

print('\nThere are', y.nunique(), 'Unique values for quality, namely:', sorted(y.unique())) 

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                   random_state=0,
                                                   stratify=y)
# 5: Declare data processing steps

#columns with missing data
print(data.isnull().any())
#cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
#No missing data

#Categorical values?
cat = [cname for cname in X.columns if X[cname].dtype == 'object']
print('categorical columns\n', cat)
#no categorical variables

#numerical data?
num = [cname for cname in X.columns if X[cname].dtype in ['int64','float64']]
print(num == X.columns)
#All columns contain numerical data

#preprocessing will entail standardizing


Our data has 1599 observations and 11 features


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5



There are 6 Unique values for quality, namely: [3, 4, 5, 6, 7, 8]
fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool
categorical columns
 []
[ True  True  True  True  True  True  True  True  True  True  True]


# References:
https://www.freecodecamp.org/news/using-data-science-to-understand-what-makes-wine-taste-good-669b496c67ee/

