In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Data Generation by Rules

In [2]:
# Data description:
# 4 attributes (k = 4), number of samples M = 1200
#
# Absolutely 'right' Rules (checked in this order; the first match wins):
# 1. If 0 <= time <= 5, then class = No
# 2. If outlook == Rainy, then class = No
# 3. If temperature < 9 or temperature > 32, then class = No
# 4. If temperature > 30 and humidity > 0.85, then class = No
# 5. If temperature < 12 and outlook == Windy, then class = No
# 6. Otherwise class = Yes
#
# Side rule: every Rainy sample has its humidity forced to 1 before it is written.
import os


def _hanging_out_label(outlook_i, humidity_i, temperature_i, time_i):
    """Return "No" if any of rules 1-5 applies to a single sample, else "Yes"."""
    stays_in = (
        0 <= time_i <= 5
        or outlook_i == 'Rainy'
        or temperature_i < 9 or temperature_i > 32
        or (temperature_i > 30 and humidity_i > 0.85)
        or (temperature_i < 12 and outlook_i == 'Windy')
    )
    return "No" if stays_in else "Yes"


M = 1200
np.random.seed(0)
demo_list = ['Rainy','Sunny','Cloudy','Windy','Overcast']
# The order of the four draws below is kept fixed: with a seeded global RNG,
# reordering them would change the generated data set.
outlook = np.random.choice(demo_list, size=M, p=[0.15,0.35,0.3,0.15,0.05])
humidity = np.random.rand(M)
temperature = np.random.randint(5, 36, size=M)
time = np.random.randint(0, 24, size=M)
# Rainy samples always get humidity 1 (applied before writing and labelling).
humidity[outlook == 'Rainy'] = 1

# Create the target directory so the cell also works on a fresh checkout.
os.makedirs("./Data", exist_ok=True)
with open("./Data/data.csv", 'w', encoding = 'UTF-8') as f:
    f.write("outlook,humidity,temperature,time,hangingOut\n")
    for i in range(M):
        label = _hanging_out_label(outlook[i], humidity[i], temperature[i], time[i])
        f.write("%s,%.4f,%d,%d,%s\n" % (outlook[i], humidity[i], temperature[i], time[i], label))
# No explicit close: the `with` block closes the file on exit.

In [3]:
# Read the generated samples back from disk into a DataFrame.
csv_path = './Data/data.csv'
dataset = pd.read_csv(csv_path)

# Data Preprocessing

In [4]:
# Replace the two string columns by integer codes, each with its own encoder.
# LabelEncoder numbers the labels in sorted order.
for column in ('outlook', 'hangingOut'):
    dataset[column] = LabelEncoder().fit_transform(dataset[column])

# Target is the encoded 'hangingOut' column; every other column is a feature.
y = dataset['hangingOut']
X = dataset.drop('hangingOut', axis = 1)

# Hold out 25% of the samples for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25,
    random_state = 0,
)

# Decision Tree

In [5]:
# Decision-tree classifier
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree with a fixed seed; fit() returns the estimator itself.
DTC = DecisionTreeClassifier(max_depth=6, random_state = 0).fit(X_train, y_train)
y_pred = DTC.predict(X_test)

# Per-class metrics followed by the confusion matrix, one item per line.
print(
    "Result:",
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Result:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       169
           1       0.98      1.00      0.99       131

   micro avg       0.99      0.99      0.99       300
   macro avg       0.99      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300

[[167   2]
 [  0 131]]


In [6]:
# Decision tree relationship graph
# Renders the fitted tree DTC with Graphviz, shows it inline and saves it as tree.png.
from io import StringIO  # stdlib replacement for sklearn.externals.six.StringIO
from IPython.display import Image, display
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
export_graphviz(DTC, out_file=dot_data,
                # class_names must follow ascending class order; the target was
                # label-encoded earlier, which puts 'No' at 0 and 'Yes' at 1.
                class_names=['No','Yes'],
                # Take the names from the feature DataFrame X, which stays a
                # DataFrame even after X_train is replaced by a scaled array.
                feature_names=list(X.columns),
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# display() is needed because the image is not the last expression of the cell.
display(Image(graph.create_png()))
graph.write_png("tree.png")

True

In [7]:
# Standardize the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
# The integer columns are cast to float up front so StandardScaler does not have
# to convert dtypes implicitly (older scikit-learn releases warn about that).
# X_train / X_test are reassigned because the cells below read these names.
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(float))
X_test = sc.transform(X_test.astype(float))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


# SVC

In [8]:
# Kernel SVC
from sklearn.svm import SVC

# The fitted model gets its own name so the imported SVC class is not rebound
# to an instance.
svc_clf = SVC(kernel = 'rbf', random_state = 0)
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)

# Per-class metrics followed by the confusion matrix on the test split.
print("Kernel SVC:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Kernel SVC:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       169
           1       0.93      0.91      0.92       131

   micro avg       0.93      0.93      0.93       300
   macro avg       0.93      0.93      0.93       300
weighted avg       0.93      0.93      0.93       300

[[160   9]
 [ 12 119]]


# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# 200 entropy-criterion trees with a fixed seed; fit() returns the estimator itself.
RFC = RandomForestClassifier(
    n_estimators = 200,
    criterion = 'entropy',
    random_state = 0,
).fit(X_train, y_train)
y_pred = RFC.predict(X_test)

# Per-class metrics followed by the confusion matrix, one item per line.
print(
    "Random Forest:",
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       169
           1       0.99      1.00      1.00       131

   micro avg       1.00      1.00      1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

[[168   1]
 [  0 131]]


# Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
NB = GaussianNB().fit(X_train, y_train)
y_pred = NB.predict(X_test)

# Per-class metrics followed by the confusion matrix, one item per line.
print(
    "Naive Bayes:",
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Naive Bayes:
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       169
           1       0.77      0.78      0.77       131

   micro avg       0.80      0.80      0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300

[[138  31]
 [ 29 102]]
