<a href="https://colab.research.google.com/github/chitinglow/Classification-Machine-learning/blob/master/supervised_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

data = load_wine()

In [4]:
X = pd.DataFrame(data.data)
Y = pd.DataFrame(data.target)
print(X.shape, Y.shape)

(178, 13) (178, 1)


In [5]:
X, X_test, Y, Y_test = train_test_split(X, Y, test_size = 0.2)
print(X.shape, X_test.shape, Y.shape, Y_test.shape)

(142, 13) (36, 13) (142, 1) (36, 1)


In [8]:
dev_size = 36/142
X_train, X_dev, Y_train, Y_dev = train_test_split(X, Y, test_size = dev_size)

In [11]:
# Cross validation 
from sklearn.model_selection import KFold

X = pd.DataFrame(data.data)
Y = pd.DataFrame(data.target)
print(X.shape, Y.shape)

X, X_test, Y, Y_test = train_test_split(X, Y, test_size = 0.10)
kf = KFold(n_splits = 10)
splits = kf.split(X)

(178, 13) (178, 1)


In [12]:
for train_index, test_index in splits:
  X_train, X_dev = X.iloc[train_index,:], X.iloc[test_index,:]
  Y_train, Y_dev = Y.iloc[train_index,:], Y.iloc[test_index,:]

print(X_train.shape, Y_train.shape, X_dev.shape, Y_dev.shape, X_test.shape, Y_test.shape)

(144, 13) (144, 1) (16, 13) (16, 1) (18, 13) (18, 1)


In [14]:
# Evaluation metrics
from sklearn.datasets import load_breast_cancer
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

data = load_breast_cancer()
X = pd.DataFrame(data.data)
Y = pd.DataFrame(data.target)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

In [17]:
model = tree.DecisionTreeClassifier(random_state=0)
model = model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [19]:
confusion_matrix(Y_test, Y_pred)

array([[20,  3],
       [ 0, 34]])

In [20]:
accuracy = accuracy_score(Y_test, Y_pred)
print(accuracy)

0.9473684210526315


In [21]:
precision = precision_score(Y_test, Y_pred)
print(precision)

0.918918918918919


In [23]:
recall = recall_score(Y_test, Y_pred)
print(recall)

1.0


In [24]:
# Evaluation for regression tasks
# MAE (the average absolute difference between a prediction and the ground truth)
# Root Mean squared error (quadratic mteric measures the average magnitude of error between the ground truth and predicition)

from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import numpy as np

data = load_boston()
X = pd.DataFrame(data.data)
Y = pd.DataFrame(data.target)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

In [25]:
model = linear_model.LinearRegression()
model = model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [26]:
MAE = mean_absolute_error(Y_test, Y_pred)
print(MAE)

3.932810579004414


In [28]:
RMSE = np.sqrt(mean_squared_error(Y_test, Y_pred))
print(RMSE)

4.980256324362114


In [29]:
# Error activity
data = load_breast_cancer()
X = pd.DataFrame(data.data)
Y = pd.DataFrame(data.target)

X_new, X_test, Y_new, Y_test = train_test_split(X, Y, test_size=0.1, random_state=101)
test_size = X_test.shape[0] / X_new.shape[0]
X_train, X_dev, Y_train, Y_dev = train_test_split(X_new, Y_new, test_size=test_size, random_state=101)

print(X_train.shape, Y_train.shape, X_dev.shape, Y_dev.shape, X_test.shape, Y_test.shape)

(455, 30) (455, 1) (57, 30) (57, 1) (57, 30) (57, 1)


In [30]:
np.random.seed(101)

indices_train = np.random.randint(0,len(X_train),25)
indices_dev = np.random.randint(0,len(X_dev),25)

X_train_dev = pd.concat([X_train.iloc[indices_train,:],X_dev.iloc[indices_dev,:]])
Y_train_dev = pd.concat([Y_train.iloc[indices_train,:],Y_dev.iloc[indices_dev,:]])

print(X_train_dev.shape, Y_train_dev.shape)

(50, 30) (50, 1)


In [31]:
model = tree.DecisionTreeClassifier(random_state=101)
model = model.fit(X_train, Y_train)

In [32]:

sets = ["Training", "Train/dev", "Validation", "Testing"]
X_sets = [X_train, X_train_dev, X_dev, X_test]
Y_sets = [Y_train, Y_train_dev, Y_dev, Y_test]

scores = {}
for i in range(0,len(X_sets)):
    pred = model.predict(X_sets[i])
    score = recall_score(Y_sets[i],pred)
    scores[sets[i]] = score

print(scores)

{'Training': 1.0, 'Train/dev': 0.9705882352941176, 'Validation': 0.9333333333333333, 'Testing': 0.9714285714285714}
