In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
datapath = '../data/raw/'
train_file = datapath+'train_values.csv'
target_file = datapath+'train_labels.csv'
test_file = datapath+'test_values.csv'

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
target_df = pd.read_csv(target_file)

In [3]:
print(train_df.shape)
train_df.head()


(180, 14)


Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


In [4]:
print(test_df.shape)
test_df.head()

(90, 14)


Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,olalu7,2,reversible_defect,170,1,0,0,2,288,0.2,1,59,159,0
1,z9n6mx,1,normal,138,4,0,0,0,183,1.4,0,35,182,0
2,5k4413,2,reversible_defect,120,4,0,0,2,177,2.5,1,43,120,1
3,mrg7q5,1,normal,102,3,1,0,0,318,0.0,0,60,160,0
4,uki4do,2,normal,138,4,1,0,2,166,3.6,1,61,125,1


In [5]:
print(target_df.shape)
target_df.head()

(180, 2)


Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0
1,ryoo3j,0
2,yt1s1x,1
3,l2xjde,1
4,oyt4ek,0


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
patient_id                              180 non-null object
slope_of_peak_exercise_st_segment       180 non-null int64
thal                                    180 non-null object
resting_blood_pressure                  180 non-null int64
chest_pain_type                         180 non-null int64
num_major_vessels                       180 non-null int64
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null int64
resting_ekg_results                     180 non-null int64
serum_cholesterol_mg_per_dl             180 non-null int64
oldpeak_eq_st_depression                180 non-null float64
sex                                     180 non-null int64
age                                     180 non-null int64
max_heart_rate_achieved                 180 non-null int64
exercise_induced_angina                 180 non-null int64
dtypes: float64(1), int64(11), object(2)
memory usage: 19.8+ KB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 14 columns):
patient_id                              90 non-null object
slope_of_peak_exercise_st_segment       90 non-null int64
thal                                    90 non-null object
resting_blood_pressure                  90 non-null int64
chest_pain_type                         90 non-null int64
num_major_vessels                       90 non-null int64
fasting_blood_sugar_gt_120_mg_per_dl    90 non-null int64
resting_ekg_results                     90 non-null int64
serum_cholesterol_mg_per_dl             90 non-null int64
oldpeak_eq_st_depression                90 non-null float64
sex                                     90 non-null int64
age                                     90 non-null int64
max_heart_rate_achieved                 90 non-null int64
exercise_induced_angina                 90 non-null int64
dtypes: float64(1), int64(11), object(2)
memory usage: 9.9+ KB


In [8]:
thals = {'normal' : 0 , 'fixed_defect' : 1 , 'reversible_defect' : 2}
data = [train_df, test_df]
for dataset in data:
    dataset['thal'] = dataset['thal'].map(thals)




In [9]:
train = train_df.drop(['patient_id'], axis=1)
test = test_df.drop(['patient_id'], axis=1)
target = target_df.drop(['patient_id'], axis=1)

In [10]:
print(train.shape)
print(test.shape)
print(target.shape)

(180, 13)
(90, 13)
(180, 1)


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 13 columns):
slope_of_peak_exercise_st_segment       180 non-null int64
thal                                    180 non-null int64
resting_blood_pressure                  180 non-null int64
chest_pain_type                         180 non-null int64
num_major_vessels                       180 non-null int64
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null int64
resting_ekg_results                     180 non-null int64
serum_cholesterol_mg_per_dl             180 non-null int64
oldpeak_eq_st_depression                180 non-null float64
sex                                     180 non-null int64
age                                     180 non-null int64
max_heart_rate_achieved                 180 non-null int64
exercise_induced_angina                 180 non-null int64
dtypes: float64(1), int64(12)
memory usage: 18.4 KB


In [12]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 1 columns):
heart_disease_present    180 non-null int64
dtypes: int64(1)
memory usage: 1.5 KB


In [13]:
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=41)

In [14]:
X_train.shape

(126, 13)

In [34]:
random_forest = RandomForestClassifier(n_estimators=200)
random_forest.fit(X_train, y_train)

Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print(round(acc_random_forest,2,), "%")

  from ipykernel import kernelapp as app


(100.0, '%')


In [35]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

Y_pred = logreg.predict(X_test)

acc_log = round(logreg.score(X_train, y_train) * 100, 2)
print(round(acc_log,2,), "%")

(86.51, '%')


In [36]:
Y_pred_test = random_forest.predict_proba(test)
print(Y_pred_test[:,0])

[ 0.58   0.865  0.04   0.81   0.165  0.99   0.55   0.22   0.72   0.815
  0.575  0.515  0.685  0.32   0.73   0.985  0.91   0.67   0.14   0.93   0.02
  0.95   0.81   0.87   0.4    0.07   0.465  0.845  0.645  1.     0.245
  0.635  0.425  0.49   0.875  0.945  0.765  0.68   0.71   0.66   0.265
  0.765  0.055  0.78   0.09   0.945  0.975  0.79   0.7    0.475  0.365
  0.995  0.225  0.755  0.73   0.905  0.22   0.905  0.855  0.205  0.91   0.09
  0.82   0.17   0.745  0.305  0.42   0.545  0.365  0.365  0.865  0.005
  0.035  0.065  0.2    0.     0.165  0.375  0.66   0.445  0.3    0.895
  0.55   0.35   0.79   0.845  0.13   0.65   0.895  0.82 ]


In [32]:
submission = pd.DataFrame({
        "patient_id": test_df["patient_id"],
        "heart_disease_present": Y_pred_test[:,0]
    })
submission = submission[['patient_id', 'heart_disease_present']] 
submission.head()

Unnamed: 0,patient_id,heart_disease_present
0,olalu7,0.6
1,z9n6mx,0.84
2,5k4413,0.05
3,mrg7q5,0.84
4,uki4do,0.17


In [33]:
submission.to_csv('submission.csv', index=False)

In [40]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train, y_train)

Y_prediction = ada.predict(X_test)

ada.score(X_train, y_train)
acc_ada = round(ada.score(X_train, y_train) * 100, 2)
print(round(acc_ada,2,), "%")

(100.0, '%')


In [41]:
Y_pred_test_ada = ada.predict_proba(test)
print(Y_pred_test_ada[:,0])
submission = pd.DataFrame({
        "patient_id": test_df["patient_id"],
        "heart_disease_present": Y_pred_test[:,0]
    })
submission = submission[['patient_id', 'heart_disease_present']] 
submission.head()
submission.to_csv('submission4.csv', index=False)

[ 0.48120414  0.53620003  0.43503052  0.58557732  0.46265013  0.53869917
  0.46898194  0.58204671  0.51820056  0.49438602  0.49682973  0.48149036
  0.51754562  0.53724026  0.50934242  0.53894141  0.63120116  0.48832035
  0.49487476  0.5136258   0.45939726  0.5486099   0.51083484  0.53533169
  0.49634938  0.44679027  0.51410344  0.52708351  0.52048405  0.53869917
  0.45514911  0.50938492  0.47958649  0.4939034   0.52219278  0.52674925
  0.54457587  0.60538747  0.57627509  0.50802713  0.56475259  0.53068148
  0.42672088  0.50780016  0.47772564  0.51162838  0.54236335  0.5873599
  0.50981599  0.49916469  0.58380993  0.52727039  0.49119741  0.51062892
  0.51968938  0.53426139  0.50289441  0.51727072  0.53142947  0.49336919
  0.53060695  0.45393419  0.51443867  0.43484894  0.50845594  0.42960399
  0.50101124  0.56371646  0.51902848  0.4538879   0.57154876  0.43998869
  0.42129778  0.45874298  0.47120687  0.45340594  0.4606527   0.43340935
  0.50808583  0.57891995  0.49555889  0.60852509  0.

NameError: name 'GradBoostingClassifier' is not defined