# 1. Training and Visualizing a Decision Tree

In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [3]:
# Load the iris data and hold out 30% of the rows for evaluation.
iris = load_iris()
train_X, test_X, train_y, test_y = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

# Fit one tree per split criterion on the same training rows: clf1 keeps the
# library default (reported below as Gini) and clf2 uses entropy.
clf1 = DecisionTreeClassifier(random_state=2).fit(train_X, train_y)
clf2 = DecisionTreeClassifier(criterion="entropy", random_state=2).fit(train_X, train_y)

# Put predictions and true labels side by side. Both frames have a single
# column named 0, so join() needs suffixes to tell them apart: the left frame
# holds the predictions and the right frame holds the true labels.
predict_test = pd.DataFrame(clf1.predict(test_X))
test_y = pd.DataFrame(test_y)
pd_all = predict_test.join(test_y, lsuffix='_predict_test', rsuffix='_test_y')

# Count the rows where the prediction equals the true label.
correct_amount_km = pd_all[pd_all["0_predict_test"] == pd_all["0_test_y"]].shape[0]
print(correct_amount_km)
print("The Decision Trees Accuracy when criterion = Gini is {0} % ".format(correct_amount_km / pd_all.shape[0] * 100))

45
The Decision Trees Accuracy when criterion = Gini is 100.0 % 


In [4]:
# Evaluate the entropy-criterion tree on the held-out rows.
predict_test = pd.DataFrame(clf2.predict(test_X))
# pd.DataFrame() accepts test_y whether it is still an array or already a frame.
test_y = pd.DataFrame(test_y)
# Left frame = predictions, right frame = true labels; the suffixes follow
# that order so each column name says what it contains.
pd_all = predict_test.join(test_y, lsuffix='_predict_test', rsuffix='_test_y')

# Count the rows where the prediction equals the true label.
correct_amount_km = pd_all[pd_all["0_predict_test"] == pd_all["0_test_y"]].shape[0]
print(correct_amount_km)
print("The Decision Trees Accuracy when criterion = Entropy is {0} % ".format(correct_amount_km / pd_all.shape[0] * 100))

45
The Decision Trees Accuracy when criterion = Entropy is 100.0 % 


# 2. Decision Trees parameters

In [6]:
'''
We find that the accuracy is the same whether the criterion is entropy or Gini, and we choose Gini because it does not require computing logarithms, which lowers the computational cost and makes the calculation a bit faster.
'''

'\nWe find that the accuracy is the same whether the criterion is entropy or Gini, and we choose Gini because it does not require computing logarithms, which lowers the computational cost and makes the calculation a bit faster.\n'

# 3. Visualize and save the Tree

In [8]:
# StringIO comes from the standard library; no third-party shim is needed.
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn import tree
import pydot

#Gini Visualization
# Write the DOT description of the fitted tree into an in-memory text buffer.
dot_data = StringIO()
export_graphviz(clf1, out_file=dot_data)

# Parse the DOT text and render the first parsed graph to a PDF file.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("Gini_iris.pdf")



In [9]:
# StringIO comes from the standard library; no third-party shim is needed.
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn import tree
import pydot

#Entropy Visualization
# Write the DOT description of the fitted tree into an in-memory text buffer.
dot_data = StringIO()
export_graphviz(clf2, out_file=dot_data)

# Parse the DOT text and render the first parsed graph to a PDF file.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("Entropy_iris.pdf")


# 4. Estimating Class Probabilities

In [11]:
# Class-probability estimates from clf1 for the held-out samples; the printed
# array has one row per sample and one column per class.
class_probabilities = clf1.predict_proba(test_X)
print(class_probabilities)

[[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


# 5. Regression

In [13]:
#Decision trees
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hold out 30% of the diabetes data for evaluation. This rebinds
# train_X / test_X / train_y / test_y to the regression dataset.
diabetes = load_diabetes()
train_X, test_X, train_y, test_y = train_test_split(diabetes.data, diabetes.target, test_size=0.3, random_state=42)

# Fit the tree regressor and predict the held-out targets.
regressor1 = DecisionTreeRegressor(random_state=0).fit(train_X, train_y)
result = regressor1.predict(test_X)

# mean_squared_error expects the true values first and the predictions second.
print(f"MSE：{mean_squared_error(test_y, result)}")

MSE：5838.571428571428


In [14]:
#Linear Regression
from sklearn.linear_model import LinearRegression

# Fit on the same diabetes split used for the tree regressor so the two
# errors are directly comparable.
regressor2 = LinearRegression().fit(train_X, train_y)
result = regressor2.predict(test_X)

# mean_squared_error expects the true values first and the predictions second.
print(f"MSE：{mean_squared_error(test_y, result)}")
print("The smaller the result, the better,So linear Regression is better")

MSE：2821.7385595843766
The smaller the result, the better,So linear Regression is better


# 7. Support Vector Machine

In [16]:
#without StandardScaler
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

#split
# Hold out 40% of the raw (unscaled) iris data for evaluation.
train_X, test_X, train_y, test_y = train_test_split(iris.data, iris.target, test_size=0.4, random_state=42)

#model
svm1 = SVC().fit(train_X, train_y)

#predict
# Both frames have a single column named 0, so join() needs suffixes: the
# left frame holds the predictions and the right frame holds the true labels.
predict_test = pd.DataFrame(svm1.predict(test_X))
test_y = pd.DataFrame(test_y)
pd_all = predict_test.join(test_y, lsuffix='_predict_test', rsuffix='_test_y')
correct_amount_km = pd_all[pd_all["0_predict_test"] == pd_all["0_test_y"]].shape[0]

#accuracy
print(correct_amount_km)
print("The SVM without StandardScaler Accuracy is {0} % ".format(correct_amount_km / pd_all.shape[0] * 100))

60
The SVM without StandardScaler Accuracy is 100.0 % 


In [17]:
#with StandardScaler

#split
# Split the raw data first so the held-out rows play no part in computing
# the scaling statistics (fitting the scaler on all rows leaks test
# information into preprocessing).
train_X, test_X, train_y, test_y = train_test_split(iris.data, iris.target, test_size=0.4, random_state=42)

#scale
# Fit the scaler on the training rows only, then apply that same
# transformation to both subsets.
scaler = StandardScaler().fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)
# Scaled copy of the full dataset, kept because this cell has always defined it.
data_scale = scaler.transform(iris.data)

#model
svm1 = SVC().fit(train_X, train_y)

#predict
# Both frames have a single column named 0, so join() needs suffixes: the
# left frame holds the predictions and the right frame holds the true labels.
predict_test = pd.DataFrame(svm1.predict(test_X))
test_y = pd.DataFrame(test_y)
pd_all = predict_test.join(test_y, lsuffix='_predict_test', rsuffix='_test_y')
correct_amount_km = pd_all[pd_all["0_predict_test"] == pd_all["0_test_y"]].shape[0]

#accuracy
print(correct_amount_km)
print("The SVM with StandardScaler Accuracy is {0} % ".format(correct_amount_km / pd_all.shape[0] * 100))

59
The SVM with StandardScaler Accuracy is 98.33333333333333 % 


In [18]:
'''
Features need to be normalised to remove the effect of unit and scale differences between them, so that every feature dimension is treated equally.
However, we found that the accuracy became slightly lower after normalisation, which I suspect is due to over-fitting.
The original data had different units and large gaps between features, which makes the data more scattered.
After normalisation, the gaps between features become smaller, so over-fitting can occur more easily.
'''

'\nFeatures need to be normalised to remove the effect of unit and scale differences between them, so that every feature dimension is treated equally.\nHowever, we found that the accuracy became slightly lower after normalisation, which I suspect is due to over-fitting.\nThe original data had different units and large gaps between features, which makes the data more scattered.\nAfter normalisation, the gaps between features become smaller, so over-fitting can occur more easily.\n'

# 8. Kernel trick.

In [19]:


# Compare SVC kernels on the current train/test split: fit one model per
# kernel, then print the number of correct predictions and the accuracy.

# join() below needs the true labels as a DataFrame; this is a no-op when
# test_y is already one.
test_y = pd.DataFrame(test_y)

fitted_by_kernel = {}
for kernel_name in ("rbf", "linear", "poly", "sigmoid"):
    fitted_by_kernel[kernel_name] = SVC(kernel=kernel_name).fit(train_X, train_y)

    # Left frame = predictions, right frame = true labels; the suffixes
    # follow that order so each column name says what it contains.
    predict_test = pd.DataFrame(fitted_by_kernel[kernel_name].predict(test_X))
    pd_all = predict_test.join(test_y, lsuffix='_predict_test', rsuffix='_test_y')
    correct_amount_km = pd_all[pd_all["0_predict_test"] == pd_all["0_test_y"]].shape[0]

    print(correct_amount_km)
    print("The SVC({0}) Accuracy is {1} % ".format(kernel_name, correct_amount_km / pd_all.shape[0] * 100))

# Keep the per-kernel model names this cell has always defined.
svm1 = fitted_by_kernel["rbf"]
svm2 = fitted_by_kernel["linear"]
svm3 = fitted_by_kernel["poly"]
svm4 = fitted_by_kernel["sigmoid"]

59
The SVC(rbf) Accuracy is 98.33333333333333 % 
58
The SVC(linear) Accuracy is 96.66666666666667 % 
58
The SVC(poly) Accuracy is 96.66666666666667 % 
55
The SVC(sigmoid) Accuracy is 91.66666666666666 % 


In [20]:
#We can see that the highest accuracy is achieved with the rbf kernel.