In [1]:

#   ____        _           ____ _                  _             
#  |  _ \  __ _| |_ __ _   / ___| | ___  __ _ _ __ (_)_ __   __ _ 
#  | | | |/ _` | __/ _` | | |   | |/ _ \/ _` | '_ \| | '_ \ / _` |
#  | |_| | (_| | || (_| | | |___| |  __/ (_| | | | | | | | | (_| |
#  |____/ \__,_|\__\__,_|  \____|_|\___|\__,_|_| |_|_|_| |_|\__, |
#                                                           |___/ 

# AUTHORS : Austin Li, George Melek, Gianna Galard
# CSC412 PROFESSOR IMBERMAN
# DATE : 11/23/2021
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# load the heart data
#df = pd.read_csv('\\Users\\gianna\\Desktop\\Data-Cleaning\\heartdata.csv') 
df = pd.read_csv('/Users/george/Desktop/CSC-412-Group-Project/Data-Cleaning/heartdata.csv')

In [3]:
# print the first 5 rows of the data set
df.head()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [4]:
# determine the datatype of each column
df.dtypes

age          int64
sex          int64
cp           int64
restbp       int64
chol         int64
fbs          int64
restecg      int64
thalach      int64
exang        int64
oldpeak    float64
slope        int64
ca          object
thal        object
hd           int64
dtype: object

In [5]:
# print the unique values for the columns **ca** and **thal**
print(df.ca.unique()) 
print(df.thal.unique())

['0' '3' '2' '1' '?']
['6' '3' '7' '?']


In [6]:
# determine how many rows contain missing values in the columns **ca** and **thal**
print(df.ca.isnull().sum()) 
print(df.thal.isnull().sum()) 

0
0


In [7]:
# determine how many rows contain missing values, the python code is below
len(df.loc[(df['ca'] == '?') | (df['thal'] == '?')]) 

6

In [8]:
# since only 6 rows have missing values, let's look at them
df.loc[(df['ca'] == '?') | (df['thal'] == '?')]

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
87,53,0,3,128,216,0,2,115,0,0.0,1,0,?,0
166,52,1,3,138,223,0,0,169,0,0.0,1,?,3,0
192,43,1,4,132,247,1,2,143,1,0.1,2,?,7,1
266,52,1,4,128,204,1,0,156,1,1.0,2,0,?,2
287,58,1,2,125,220,0,0,144,0,0.4,2,?,7,0
302,38,1,3,138,175,0,0,173,0,0.0,1,?,3,0


In [9]:
# count the number of rows in the full dataset
len(df)

303

In [10]:
# remove the rows with missing values
df = df.dropna() # dropna() is a method of the pandas dataframe that removes all rows with missing values

In [11]:
# verify that the rows with missing values have been removed
len(df) # the length of the dataframe should be reduced by 6 rows

303

In [12]:
# verify using the unique function that "ca" and "thal" do not have missing values
print(df.ca.unique())
print(df.thal.unique())

['0' '3' '2' '1' '?']
['6' '3' '7' '?']


In [13]:
# split the data into dependent and independent variables
# the column of data that we will to to make classifications
X = df.iloc[:,:-1] # this line of code is the same as X = df.drop(['ca'], axis=1)
# the column of data that we want to predict
y = df.iloc[:,-1] # this line of code is the same as y = df['ca']

In [17]:
# print the head of both the X and y dataframes so that you can verify this worked correctly
X.head() # this is the independent variables

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3


In [18]:
y.head() # this is the dependent variables

0    0
1    1
2    1
3    0
4    0
Name: hd, dtype: int64

In [19]:
X_encoded = pd.get_dummies(X, columns=['cp',
 'restecg',
'slope',
 'thal'])

In [20]:
X_encoded.head()

Unnamed: 0,age,sex,restbp,chol,fbs,thalach,exang,oldpeak,ca,cp_1,...,restecg_0,restecg_1,restecg_2,slope_1,slope_2,slope_3,thal_3,thal_6,thal_7,thal_?
0,63,1,145,233,1,150,0,2.3,0,1,...,0,0,1,0,0,1,0,1,0,0
1,67,1,160,286,0,108,1,1.5,3,0,...,0,0,1,0,1,0,1,0,0,0
2,67,1,120,229,0,129,1,2.6,2,0,...,0,0,1,0,1,0,0,0,1,0
3,37,1,130,250,0,187,0,3.5,0,0,...,1,0,0,0,0,1,1,0,0,0
4,41,0,130,204,0,172,0,1.4,0,0,...,0,0,1,1,0,0,1,0,0,0


In [21]:
y.unique() # this should return the unique values in the dependent variable

array([0, 1])

In [22]:
y_not_zero_idx = y > 0 # this is a boolean array that is true if the dependent variable is greater than 0
y[y_not_zero_idx] = 1 # this changes the dependent variable to 1 if it is greater than 0
y.unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y_not_zero_idx] = 1 # this changes the dependent variable to 1 if it is greater than 0


array([0, 1])

In [23]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=0) 

In [24]:
# use this data to run a decision tree on the data
tree = DecisionTreeClassifier(max_depth=None, random_state=0) # create a decision tree classifier
tree.fit(X_train, y_train) # fit the data to the tree
y_pred = tree.predict(X_test) # this is the actual prediction

ValueError: could not convert string to float: '?'

In [25]:
# print how many levels of the tree were created and the accuracy of the model
print(tree.tree_.max_depth) # this is the number of levels in the tree
print(accuracy_score(y_test, y_pred)) # this is the accuracy of the model

AttributeError: 'DecisionTreeClassifier' object has no attribute 'tree_'