In [None]:
import pandas as pd
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [None]:
# Load the wine-quality CSV and preview only the first rows; echoing the whole
# frame bloats the saved notebook without telling the reader anything extra.
data = pd.read_csv("winequality_red.csv")
data.head()

In [None]:
# Summary statistics for the columns of the loaded frame.
data.describe()

In [None]:
# Target is the 'quality' column; every remaining column is a predictor.
y = data.loc[:, 'quality']
X = data.drop(labels='quality', axis='columns')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=355,
)

In [None]:
# Baseline: fit a decision tree on the raw features, without any pre-processing.
# random_state is pinned because the tree permutes candidate features at each
# split, so an unseeded fit can score differently from one run to the next.
clf = DecisionTreeClassifier(random_state=355)
clf.fit(x_train, y_train)

In [None]:
# Labels for drawing the tree: one name per predictor column, and the class
# labels as strings in ascending order. export_graphviz expects class names as
# strings in ascending class order; Series.unique() returns them in
# first-appearance order, which would mislabel the nodes.
feature_name = list(X.columns)
class_name = [str(label) for label in sorted(y_train.unique())]
feature_name

In [None]:
# Accuracy on the training split (the same rows the tree was fitted on).
clf.score(x_train,y_train)

In [None]:
# Predicted labels for the held-out rows.
# NOTE(review): `py_pred` is not read by any later cell visible here — possibly a typo for `y_pred`; confirm.
py_pred = clf.predict(x_test)

In [None]:
# Accuracy of the classification tree on the held-out test split
clf.score(x_test,y_test)

In [None]:
# Standardise every predictor column.
# NOTE(review): the scaler is fitted on all of X, before the train/test split
# in the next cell, so statistics from the test rows influence the scaling.
# Consider fitting on the training rows only.
scalar = StandardScaler()
scalar.fit(X)
x_transform = scalar.transform(X)

In [None]:
# Same 70/30 split and seed as before, now on the standardised features.
x_train, x_test, y_train, y_test = train_test_split(
    x_transform,
    y,
    test_size=0.30,
    random_state=355,
)

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Fit a full PCA on the standardised features to see how much variance the
# leading components retain.
pca = PCA()
principalComponents = pca.fit_transform(x_transform)

# Plot against 1..k so the x value really is the number of components kept
# (a bare index would start at 0), and label the y axis as the cumulative
# ratio that is plotted rather than a per-component percentage.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.figure()
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance (ratio)')
plt.title('Explained Variance')
plt.show()

In [None]:
# Keep the first 8 principal components of the standardised features.
# NOTE(review): the PCA is fitted on every row of x_transform, before the
# train/test split below — consider fitting on the training rows only.
pca = PCA(n_components=8)
new_data = pca.fit_transform(x_transform)

# Derive the column labels from the number of components actually returned, so
# changing n_components does not leave a stale hand-written list behind.
principal_x = pd.DataFrame(
    new_data,
    columns=['PC-{}'.format(i) for i in range(1, new_data.shape[1] + 1)],
)

In [None]:
# Preview the PCA-reduced frame; the first rows are enough to check its layout.
principal_x.head()

In [None]:

# Re-split using the PCA-reduced features (same 70/30 split and seed as before).
x_train, x_test, y_train, y_test = train_test_split(
    principal_x, y, test_size=0.30, random_state=355
)

# Seed the tree so this score, and the grid search that later clones this
# estimator, give the same result on every run.
clf = DecisionTreeClassifier(random_state=355)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [None]:

# Hyper-parameter grid explored for the decision tree.
# Upper bounds of the ranges are exclusive.
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 32),
    'min_samples_leaf': range(1, 10),
    'min_samples_split': range(2, 10),
    'splitter': ['best', 'random'],
}

In [None]:
# Exhaustive search over grid_param with 5-fold cross-validation;
# n_jobs=-1 lets the fits run in parallel on all available cores.
grid_search = GridSearchCV(
    clf,
    grid_param,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search on the training split only.
grid_search.fit(x_train,y_train)

In [None]:
# Parameter combination that the search selected as best.
best_parameters = grid_search.best_params_
print(best_parameters)

In [None]:
# Mean cross-validated score achieved by the best parameter combination.
grid_search.best_score_

In [None]:
# Refit a tree with the hyper-parameters the grid search actually selected.
# Reading them from grid_search.best_params_ (instead of copying values by
# hand) keeps this cell in step with the search whenever the grid or the data
# changes. random_state is pinned because the grid includes splitter='random'.
clf = DecisionTreeClassifier(random_state=355, **grid_search.best_params_)
clf.fit(x_train, y_train)

In [None]:
# Test-split accuracy of the tuned tree.
clf.score(x_test,y_test)

In [None]:
# Label the plot with what this tree was really trained on: the columns of
# x_train (the PCA frame, not the original wine columns in X) and the classes
# in the order the fitted estimator stores them.
feature_name = list(x_train.columns)
class_name = [str(label) for label in clf.classes_]
# Create the DOT source that describes the tree structure
dot_data = export_graphviz(clf, feature_names=feature_name, class_names=class_name,
                           rounded=True, filled=True)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
#graph.write_png("tree.png")
# Show graph
Image(graph.create_png())

### Ensemble Learning

#### Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast-cancer dataset for the bagging example.
dataset = load_breast_cancer()

In [None]:
# Feature matrix and target from the loaded dataset.
# Note: this rebinds X and y, which previously held the wine data.
X, y = dataset.data, dataset.target

In [None]:
from sklearn.model_selection import train_test_split

# Split with the library's default proportions; only the seed is fixed.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=3)

In [None]:
# Bag 500 decision trees, each fitted on a bootstrap sample of the training
# rows; oob_score=True also scores the ensemble on the rows each sample left out.
# random_state is pinned so the bootstrap draws, and therefore the OOB and test
# accuracies below, are the same on every run.
bgg_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=3,
)

In [None]:
# Fit the bagged ensemble on the training split.
bgg_clf.fit(X_train, y_train)

In [None]:
# Out-of-bag score of the fitted ensemble (available because oob_score=True was set).
bgg_clf.oob_score_

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the bagged ensemble on the held-out rows.
y_pred = bgg_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

### Random Forest

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [None]:
# Reload the wine-quality CSV for the random-forest section and preview only
# the first rows rather than echoing the whole frame.
data = pd.read_csv("winequality_red.csv")
data.head()

In [None]:
# Target is the 'quality' column; every remaining column is a predictor.
y = data.loc[:, 'quality']
X = data.drop(labels='quality', axis='columns')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=355,
)

In [None]:
# Random forest with library-default hyper-parameters and a fixed seed.
rand_clf = RandomForestClassifier(random_state=6)

In [None]:
# Fit the default forest on the training split.
rand_clf.fit(x_train,y_train)

In [None]:
# Test-split accuracy of the default forest.
rand_clf.score(x_test,y_test)

In [None]:
# Hyper-parameter grid for the random forest.
# 'sqrt' replaces 'auto' for max_features: for classifiers 'auto' meant the
# same thing, but it was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it makes every fit in the search fail.
grid_param = {
    "n_estimators": [90, 100, 115, 130],
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 20, 1),
    'min_samples_leaf': range(1, 10, 1),
    'min_samples_split': range(2, 10, 1),
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# 5-fold cross-validated search over grid_param for the forest, run in
# parallel on all cores with verbose progress output.
grid_search = GridSearchCV(
    rand_clf,
    grid_param,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Search left disabled (presumably because of its run time — confirm); uncomment to re-run:
# grid_search.fit(x_train,y_train)

In [None]:
# Only available after the search above has been run; uncomment together with it:
# grid_search.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest with hand-picked hyper-parameters (the grid search above is disabled).
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
rand_clf = RandomForestClassifier(
    criterion='entropy',
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=4,
    n_estimators=115,
    random_state=6,
)

In [None]:
# Fit the configured forest on the training split.
rand_clf.fit(x_train,y_train)

In [None]:
# Test-split accuracy of the configured forest.
rand_clf.score(x_test,y_test)