## Edgar Galindo


In [1]:
#importing libraries

import pandas as pd
import numpy as np
import html5lib
import requests
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier


#### Importing the data set with `read_table`, using a single space as the separator and `header=None`

In [30]:
# The file is space-delimited and has no header row, so the columns get
# integer labels until real names are assigned.
# NOTE(review): many cells hold -9.0 (visible in the preview); presumably a
# missing-value sentinel -- confirm against heart-disease.names.
data_set_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/reprocessed.hungarian.data'
df = pd.read_table(data_set_url, sep=' ', header=None)

In [31]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,-9.0,-9.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,-9.0,-9.0,3.0
4,54.0,1.0,3.0,150.0,-9.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0


In [32]:
# Address of the heart-disease.names file, from which the column names are
# extracted.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'heart-disease/heart-disease.names'
)

#### Reading in the URL and converting it to text.

In [33]:
# Download the .names file. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops the notebook here on an
# HTTP error instead of letting an error page be searched for column names.
webpage = requests.get(url, timeout=30)
webpage.raise_for_status()
webpage_content = webpage.text

#### Using a regular expression to extract the feature and label names.

In [34]:
# Pattern: '#', a number, whitespace, one or more '(' and then a word.
# Only the word (the capture group) is returned by findall.
name_pattern = re.compile(r'\#\d+\s+\(+(\w+)')
features = name_pattern.findall(webpage_content)
print(features)

# Use the extracted names as the column labels of df.
df.columns = features

['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']


In [35]:
# Preview the first five rows again, now with named columns.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,-9.0,-9.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,-9.0,-9.0,3.0
4,54.0,1.0,3.0,150.0,-9.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0



#### Dropping rows with missing values using dropna

In [36]:
# Count the missing values in every column, then look at the tail of the
# frame: the NaNs all sit in the final row.
d = [df[col].isnull().sum() for col in df]
for col, tot in zip(features, d):
    print(col, ': ', tot)
df.tail(n=5)

age :  1
sex :  1
cp :  1
trestbps :  1
chol :  1
fbs :  1
restecg :  1
thalach :  1
exang :  1
oldpeak :  1
slope :  1
ca :  1
thal :  1
num :  1


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
290,36.0,1.0,2.0,120.0,166.0,0.0,0.0,180.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
291,48.0,1.0,3.0,110.0,211.0,0.0,0.0,138.0,0.0,0.0,-9.0,-9.0,6.0,0.0
292,47.0,0.0,2.0,140.0,257.0,0.0,0.0,135.0,0.0,1.0,1.0,-9.0,-9.0,0.0
293,53.0,1.0,4.0,130.0,182.0,0.0,0.0,148.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
294,,,,,,,,,,,,,,


In [37]:
# Print the shape before and after so the number of removed rows is visible.
print(df.shape)
df = df.dropna()  # defaults: axis=0 (rows), how='any'
print(df.shape)
df.tail(n=5)

(295, 14)
(294, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
289,48.0,0.0,2.0,-9.0,308.0,0.0,1.0,-9.0,-9.0,2.0,1.0,-9.0,-9.0,0.0
290,36.0,1.0,2.0,120.0,166.0,0.0,0.0,180.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
291,48.0,1.0,3.0,110.0,211.0,0.0,0.0,138.0,0.0,0.0,-9.0,-9.0,6.0,0.0
292,47.0,0.0,2.0,140.0,257.0,0.0,0.0,135.0,0.0,1.0,1.0,-9.0,-9.0,0.0
293,53.0,1.0,4.0,130.0,182.0,0.0,0.0,148.0,0.0,0.0,-9.0,-9.0,-9.0,0.0


In [38]:
# Splitting my dataset into features and label.
# The feature list is rebuilt from the frame instead of calling features.pop():
# pop() mutates the list in place, so re-running this cell would silently drop
# one more feature each time, and it only works while the label is the last name.
features = [col for col in df.columns if col != 'num']
x = df[features]
y = df['num']
print(x.shape)
print(y.shape)

(294, 13)
(294,)


In [54]:
# Re-derive the feature matrix and the label from df, so x is the unscaled
# frame again when the scaling cell runs.
x, y = df[features], df['num']
for part in (x, y):
    print(part.shape)

(294, 13)
(294,)




#### Scaling the data

In [55]:

# Standardize every feature column (zero mean, unit variance); the result
# replaces x.
# NOTE(review): the statistics are computed over all rows, i.e. before the
# train/test split, so the test rows influence the scaling.
x = preprocessing.scale(x, axis=0, with_mean=True, with_std=True)

In [56]:
# Display the array returned by preprocessing.scale.
x

array([[-1.00359234,  0.61666984, -1.02025916, ..., -0.73929521,
        -0.11089228, -0.32235972],
       [ 0.15047343, -1.62161328,  0.01765154, ...,  1.37092607,
        -0.11089228, -0.32235972],
       [-1.38828093,  0.61666984, -1.02025916, ..., -0.73929521,
        -0.11089228, -0.32235972],
       ..., 
       [ 0.0222439 ,  0.61666984,  0.01765154, ..., -0.73929521,
        -0.11089228,  3.14497288],
       [-0.10598563, -1.62161328, -1.02025916, ...,  1.17908777,
        -0.11089228, -0.32235972],
       [ 0.66339155,  0.61666984,  1.05556224, ..., -0.73929521,
        -0.11089228, -0.32235972]])

#### Splitting the data into testing and training

In [57]:
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
# The scaling cell stores its result in `x`. The name used here before,
# `x_scaled`, is not defined anywhere in the visible notebook, so the call
# raised NameError on a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

#### Training my PCA model

In [91]:

# Fit a PCA that keeps all 13 components, purely to inspect how the variance
# is distributed across them.
my_pca = PCA(n_components=13).fit(x_train)
print(my_pca.explained_variance_ratio_)

[ 0.22910972  0.12209178  0.09912614  0.09583057  0.08519139  0.07705091
  0.07016747  0.06000009  0.04813019  0.03786461  0.03706467  0.02748661
  0.01088585]


In [92]:
k = 4  # number of principal components to keep
# The projection is learned from the training split only and then applied to
# both splits.
my_pca = PCA(n_components=k).fit(x_train)
x_train_pca, x_test_pca = (my_pca.transform(part) for part in (x_train, x_test))



#### Creating and training the SVM model

In [93]:
# RBF-kernel SVM trained on the PCA-projected training split and scored on
# the PCA-projected test split.
mySVC = SVC(kernel='rbf', C=1, gamma=0.1, random_state=5).fit(x_train_pca, y_train)
yPredictSVC = mySVC.predict(x_test_pca)
score_SVC = accuracy_score(y_true=y_test, y_pred=yPredictSVC)
print(score_SVC)

0.728813559322




#### Creating and training the neural network

In [94]:
# Multi-layer perceptron with two hidden layers (100 and 20 units) and
# logistic activations. It is fitted on x_train directly, not on the PCA
# projection used for the SVM.
ann_params = dict(
    hidden_layer_sizes=(100, 20),
    activation='logistic',
    solver='adam',
    alpha=1e-5,
    learning_rate_init=0.02,
    random_state=5,
)
my_ANN = MLPClassifier(**ann_params).fit(x_train, y_train)
y_predict_ann = my_ANN.predict(x_test)
score_ann = accuracy_score(y_true=y_test, y_pred=y_predict_ann)
print(score_ann)

0.728813559322
