Decision Trees 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/autoimmune.txt', delimiter="\t",header=None)
df=df.transpose()

df.columns=['Age','Blood_Pressure','BMI','Plasma_level','Autoimmune_Disease','Adverse_events','Drug_in_serum','Liver_function','Activity_test','Secondary_test']
df.index = np.arange(1, len(df) + 1)

X = df.drop('Autoimmune_Disease',axis=1)
y = df['Autoimmune_Disease']

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
scaled_features = scaler.transform(X)
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])

X_train, X_test, y_train, y_test = train_test_split(scaled_features,df['Autoimmune_Disease'],
                                                    test_size=0.30)

error_rate = []
for i in range(1,40):
    
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
print (error_rate)
plt.figure(figsize=(10,6))
plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')


dtree = KNeighborsClassifier(n_neighbors=20)

kf = KFold(n_splits=10, shuffle=False, random_state=None)

acc=[]
for train_index, test_index in kf.split(X):
    X_train=scaler.transform(X.iloc[train_index])
    y_train=y.iloc[train_index]
    X_test=scaler.transform(X.iloc[test_index])
    y_true=y.iloc[test_index]
    dtree.fit(X_train,y_train)
    y_pred = dtree.predict(X_test)
    acc.append(accuracy_score(y_true, y_pred))

print (sum(acc)/len(acc))
    
scores = cross_val_score(dtree, X, y, cv=10)

print (sum(scores)/len(scores))

[0.2831858407079646, 0.2831858407079646, 0.2831858407079646, 0.2920353982300885, 0.25663716814159293, 0.26548672566371684, 0.25663716814159293, 0.26548672566371684, 0.23893805309734514, 0.25663716814159293, 0.24778761061946902, 0.26548672566371684, 0.24778761061946902, 0.25663716814159293, 0.2831858407079646, 0.2743362831858407, 0.2743362831858407, 0.2920353982300885, 0.2920353982300885, 0.2920353982300885, 0.3008849557522124, 0.3008849557522124, 0.2831858407079646, 0.2831858407079646, 0.2831858407079646, 0.2920353982300885, 0.25663716814159293, 0.2743362831858407, 0.26548672566371684, 0.3008849557522124, 0.2743362831858407, 0.2920353982300885, 0.2920353982300885, 0.2831858407079646, 0.26548672566371684, 0.2743362831858407, 0.26548672566371684, 0.3008849557522124, 0.25663716814159293]
0.7711948790896159
0.7416666666666668
