# Training a Random Forest on the dataset with scikit-learn

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [11]:
# Load the red-wine data and move the target into a trailing "label" column.
dataset = pd.read_csv("winequality-red.csv")
dataset = dataset.assign(label=dataset["quality"]).drop(columns="quality")

# Replace spaces in the column headers with underscores.
column_names = [header.replace(" ", "_") for header in dataset.columns]
dataset.columns = column_names
dataset.head()


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,label
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [12]:
def transform_label(value):
    """Map a numeric quality score to a binary class name.

    Args:
        value: Quality score to classify.

    Returns:
        "bad" when ``value <= 5``, otherwise "good".
    """
    return "bad" if value <= 5 else "good"

# Replace each numeric score in the label column with its "bad"/"good" class.
dataset["label"] = dataset["label"].apply(transform_label)

In [28]:
# Divide the frame into features (first 11 columns) and labels (column 11).
X = dataset.iloc[:, 0:11].values
y = dataset.iloc[:, 11].values

# Divide the dataset into training and test data.
# No random.seed(0) here: `random` is not imported before this cell (it would
# raise NameError on a fresh kernel), and random_state=0 below already fixes
# both the split and the forest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature normalization: fit the scaler on the training data only and reuse
# the fitted scaler for the test data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Initialize the classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
# Training
classifier.fit(X_train, y_train)
# Testing
y_pred = classifier.predict(X_test)

In [29]:
# Score the scikit-learn forest's test predictions against the true labels.
sklearn_accuracy = accuracy_score(y_test, y_pred)
print("SKLearn Random Forest Accuracy = {}".format(sklearn_accuracy))

SKLearn Random Forest Accuracy = 0.821875


# Training with our own Random Forest implementation

In [3]:
import numpy as np
import pandas as pd
%matplotlib inline
import random
from pprint import pprint

from decision_tree_functions import decision_tree_algorithm, decision_tree_predictions
from helper_functions import train_test_split, calculate_accuracy

In [5]:
# Load the red-wine data and move the target into a trailing "label" column.
dataset = pd.read_csv("winequality-red.csv")
dataset = dataset.assign(label=dataset["quality"]).drop(columns="quality")

# Replace spaces in the column headers with underscores.
column_names = [header.replace(" ", "_") for header in dataset.columns]
dataset.columns = column_names

dataset.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,label
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
def transform_label(value):
    """Map a numeric quality score to a binary class name.

    Args:
        value: Quality score to classify.

    Returns:
        "bad" when ``value <= 5``, otherwise "good".
    """
    return "bad" if value <= 5 else "good"

# Replace each numeric score in the label column with its "bad"/"good" class.
dataset["label"] = dataset["label"].apply(transform_label)

In [8]:
# Seed both random generators so the run is reproducible. bootstrapping()
# draws from NumPy's global generator, which random.seed() alone does not
# touch. The stdlib seed is kept for the imported helpers (presumably they
# draw from `random` -- confirm in helper_functions / decision_tree_functions).
random.seed(0)
np.random.seed(0)
# Split the data
train_df, test_df = train_test_split(dataset, test_size=0.2)

def bootstrapping(train_df, n_bootstrap):
    """Draw a bootstrap sample of rows from ``train_df``.

    Row positions are drawn independently (so repeats are possible) from
    NumPy's global random generator.

    Args:
        train_df: DataFrame to sample rows from.
        n_bootstrap: Number of rows to draw.

    Returns:
        DataFrame holding the drawn rows, in the order they were drawn.
    """
    row_count = len(train_df)
    sampled_positions = np.random.randint(low=0, high=row_count, size=n_bootstrap)
    return train_df.iloc[sampled_positions]

def random_forest_algorithm(train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
    """Build a forest of decision trees, each grown on its own bootstrap sample.

    Args:
        train_df: Training DataFrame passed to ``bootstrapping``.
        n_trees: Number of trees to grow.
        n_bootstrap: Number of rows drawn for each tree's bootstrap sample.
        n_features: Forwarded to ``decision_tree_algorithm`` as ``random_subspace``.
        dt_max_depth: Forwarded to ``decision_tree_algorithm`` as ``max_depth``.

    Returns:
        List with one tree per iteration, in the order they were grown.
    """
    return [
        decision_tree_algorithm(
            bootstrapping(train_df, n_bootstrap),
            max_depth=dt_max_depth,
            random_subspace=n_features,
        )
        for _ in range(n_trees)
    ]

def random_forest_predictions(test_df, forest):
    """Predict with every tree in ``forest`` and combine the results row by row.

    Args:
        test_df: DataFrame handed to ``decision_tree_predictions`` for each tree.
        forest: Sequence of trees, e.g. as returned by ``random_forest_algorithm``.

    Returns:
        Column 0 of the row-wise mode over the per-tree predictions, i.e. the
        most frequent prediction for each row.
    """
    # One column per tree, keyed "tree_0", "tree_1", ...
    per_tree = {
        "tree_{}".format(position): decision_tree_predictions(test_df, tree=tree)
        for position, tree in enumerate(forest)
    }
    votes = pd.DataFrame(per_tree)

    # mode(axis=1) can return several columns; column 0 is the one kept.
    return votes.mode(axis=1)[0]

In [30]:
# Grow the forest on the training split, then score it on the held-out split.
forest = random_forest_algorithm(
    train_df,
    n_trees=20,
    n_bootstrap=800,
    n_features=2,
    dt_max_depth=4,
)
predictions = random_forest_predictions(test_df, forest)
accuracy = calculate_accuracy(predictions, test_df["label"])

print("Our Random Forest Accuracy = {}".format(accuracy))

Our Random Forest Accuracy = 0.740625
