In [1]:
import pandas as pd
import numpy as np

In [2]:
column_names = pd.read_csv("echocardiogram.names", header=None)[0].tolist()
len(column_names)

13

In [3]:
column_names = pd.read_csv("echocardiogram.names", header=None)[0]
column_names

0                  survival
1               still-alive
2       age-at-heart-attack
3      pericardial-effusion
4     fractional-shortening
5                      epss
6                      lvdd
7         wall-motion-score
8         wall-motion-index
9                      mult
10                     name
11                    group
12               alive-at-1
Name: 0, dtype: object

In [4]:
column_names = pd.read_csv("echocardiogram.names", header=None)[0].tolist()
data = pd.read_csv("echocardiogram.data", names=column_names)
print("Data len:", len(data))
print("Attribute count:", len(data.columns))
print(data.dtypes)
data.head()

Data len: 132
Attribute count: 13
survival                 object
still-alive              object
age-at-heart-attack      object
pericardial-effusion     object
fractional-shortening    object
epss                     object
lvdd                     object
wall-motion-score        object
wall-motion-index        object
mult                     object
name                     object
group                    object
alive-at-1               object
dtype: object


Unnamed: 0,survival,still-alive,age-at-heart-attack,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-score,wall-motion-index,mult,name,group,alive-at-1
0,11,0,71,0,0.26,9.0,4.6,14,1.0,1.0,name,1,0
1,19,0,72,0,0.38,6.0,4.1,14,1.7,0.588,name,1,0
2,16,0,55,0,0.26,4.0,3.42,14,1.0,1.0,name,1,0
3,57,0,60,0,0.253,12.062,4.603,16,1.45,0.788,name,1,0
4,19,1,57,0,0.16,22.0,5.75,18,2.25,0.571,name,1,0


In [5]:
data = data.drop(["wall-motion-score","mult","name","group"],axis = 1)#dropped unnecessary columns

In [6]:
data.head()

Unnamed: 0,survival,still-alive,age-at-heart-attack,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-index,alive-at-1
0,11,0,71,0,0.26,9.0,4.6,1.0,0
1,19,0,72,0,0.38,6.0,4.1,1.7,0
2,16,0,55,0,0.26,4.0,3.42,1.0,0
3,57,0,60,0,0.253,12.062,4.603,1.45,0
4,19,1,57,0,0.16,22.0,5.75,2.25,0


In [7]:
data = data[pd.to_numeric(data['survival'], errors='coerce').notnull()] 
# Dropped 2 more lines because survival is non-numeric

In [8]:
len(data)

130

In [9]:
# converted "survival" and "still-alive" columns to numeric values in order to make comparison in the next cell
data = data.astype({"survival": "float", "still-alive": "int"})

In [10]:
# We assign survival-at-1 1 if survival is greater than 12, 0 otherwise
data["alive-at-1"] = np.where(data["survival"] >= 12, 1, 0)

In [11]:
# Dropped those that were both alive and survival value is less than 12.
data = data.drop(data[(data["survival"] < 12) & (data["still-alive"] == 1)].index)

In [12]:
len(data)

96

In [13]:
data.replace("?",np.nan,inplace=True)
data.dropna(inplace=True) # Lastly I dropped all rows that includes any non-values(maybe you can drop some of these by threshold instead of dropping all)
len(data)

82

In [14]:
print("Data len:", len(data))
print("Attribute count:", len(data.columns))
print(data.dtypes)
data.head()

Data len: 82
Attribute count: 9
survival                 float64
still-alive                int32
age-at-heart-attack       object
pericardial-effusion      object
fractional-shortening     object
epss                      object
lvdd                      object
wall-motion-index         object
alive-at-1                 int32
dtype: object


Unnamed: 0,survival,still-alive,age-at-heart-attack,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-index,alive-at-1
0,11.0,0,71,0,0.26,9.0,4.6,1.0,0
1,19.0,0,72,0,0.38,6.0,4.1,1.7,1
2,16.0,0,55,0,0.26,4.0,3.42,1.0,1
3,57.0,0,60,0,0.253,12.062,4.603,1.45,1
4,19.0,1,57,0,0.16,22.0,5.75,2.25,1


In [15]:
# Checked if boolean values contains any value except 0 or 1
print(np.where( (data["still-alive"] != 0) & (data["still-alive"] != 1) ))
print(np.where( (data["pericardial-effusion"] != 0) & (data["pericardial-effusion"] != 1) ))
print(np.where( (data["alive-at-1"] != 0) & (data["alive-at-1"] != 1) ))

(array([], dtype=int64),)
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81],
      dtype=int64),)
(array([], dtype=int64),)


In [16]:
# converted types so that there is no trouble during the operations.
data = data.astype({"age-at-heart-attack": "int", "fractional-shortening": "float", "epss":"float", "lvdd":"float","wall-motion-index":"float"})
print(data.dtypes)
data.head()

survival                 float64
still-alive                int32
age-at-heart-attack        int32
pericardial-effusion      object
fractional-shortening    float64
epss                     float64
lvdd                     float64
wall-motion-index        float64
alive-at-1                 int32
dtype: object


Unnamed: 0,survival,still-alive,age-at-heart-attack,pericardial-effusion,fractional-shortening,epss,lvdd,wall-motion-index,alive-at-1
0,11.0,0,71,0,0.26,9.0,4.6,1.0,0
1,19.0,0,72,0,0.38,6.0,4.1,1.7,1
2,16.0,0,55,0,0.26,4.0,3.42,1.0,1
3,57.0,0,60,0,0.253,12.062,4.603,1.45,1
4,19.0,1,57,0,0.16,22.0,5.75,2.25,1


In [17]:
x = data.drop(["alive-at-1"], axis=1).values

In [18]:
y = data[["alive-at-1"]].values

### Random Forest Classifier 

In [19]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [20]:
#train-test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30, random_state=42)

#standardization processes
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
X_test = sc.transform(x_test)

rfc = RandomForestClassifier(random_state = 42)
rfc.fit(X_train, y_train.ravel())
y_pred = rfc.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(cm)

#Classification report
print("Classification Report (Testing)")
target_names = ["not survives", "survives"]
print(classification_report(y_test, y_pred, target_names= target_names))

Confusion Matrix
[[ 0  2]
 [ 0 23]]
Classification Report (Testing)
              precision    recall  f1-score   support

not survives       0.00      0.00      0.00         2
    survives       0.92      1.00      0.96        23

    accuracy                           0.92        25
   macro avg       0.46      0.50      0.48        25
weighted avg       0.85      0.92      0.88        25



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Cross validation to get rid of data split bias. (overfitting, underfitting)
X_train_val, X_test, y_train_val, y_test = train_test_split(x, y, test_size=(0.30), 
                                                            random_state = 42, shuffle=True, stratify = y)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=(0.30), 
                                                  random_state = 42, shuffle=True, stratify = y_train_val)

X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)# standartization

rfc = RandomForestClassifier(random_state=42)

parameters = {"n_estimators":[1, 10, 100], "max_depth":[None,2], "criterion":["gini", "entropy"]}
#scoring = ['accuracy', 'precision', 'recall', 'f1']
scoring = ['f1_weighted']
grid1 = GridSearchCV(estimator = rfc, param_grid = parameters, cv=4, 
                     n_jobs=-1, verbose=10, refit='f1_weighted', scoring = scoring)

grid1 = grid1.fit(X_train_val, y_train_val.ravel())
print(grid1)

# the results of the grid search (model selection according to refit metric) 
#(but evaluation will applied for all scoring metrics provided)
print("Best Score: ",grid1.best_score_)
print("Best Estimator: ", grid1.best_estimator_)
print("Best Parameters: ", grid1.best_params_)
print("Tuning cv results: ")
pd.set_option('display.max_colwidth', None)
display(pd.DataFrame(data=grid1.cv_results_, columns=grid1.cv_results_.keys()))
print("\n")

#Classification report
y_pred = grid1.best_estimator_.predict(X_test)
print("Classification Report (Testing)")
target_names = ["not survives", "survives"]
print(classification_report(y_test, y_pred, target_names= target_names))

#Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
file = open("Confusion_Matrix.txt","w")
file.write("Confusion Matrix\n")
file.write("{}{}{}{}{}\n{}{}{}{}{}".format("[[",cm[0,0],",",cm[0,1],"]"," [",cm[1,0],",",cm[1,1],"]]"))
file.close()

#np.savetxt('test.txt', cm, delimiter=',')

Fitting 4 folds for each of 12 candidates, totalling 48 fits




GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 2], 'n_estimators': [1, 10, 100]},
             refit='f1_weighted', scoring=['f1_weighted'], verbose=10)
Best Score:  0.9488323298668127
Best Estimator:  RandomForestClassifier(n_estimators=10, random_state=42)
Best Parameters:  {'criterion': 'gini', 'max_depth': None, 'n_estimators': 10}
Tuning cv results: 


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_f1_weighted,split1_test_f1_weighted,split2_test_f1_weighted,split3_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted
0,0.003241,0.000432,0.001745,0.0004317425,gini,,1,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 1}",0.901149,0.962963,0.962963,0.89418,0.930314,0.032742,9
1,0.016705,0.001296,0.002992,1.976862e-07,gini,,10,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 10}",0.901149,1.0,1.0,0.89418,0.948832,0.051227,1
2,0.15309,0.007994,0.013713,0.001475004,gini,,100,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}",0.901149,1.0,1.0,0.89418,0.948832,0.051227,1
3,0.003491,0.000864,0.001995,0.0007051168,gini,2.0,1,"{'criterion': 'gini', 'max_depth': 2, 'n_estimators': 1}",0.901149,0.962963,0.962963,0.89418,0.930314,0.032742,9
4,0.016705,0.001915,0.00374,0.001295583,gini,2.0,10,"{'criterion': 'gini', 'max_depth': 2, 'n_estimators': 10}",0.901149,1.0,1.0,0.89418,0.948832,0.051227,1
5,0.143616,0.003308,0.013713,0.001295193,gini,2.0,100,"{'criterion': 'gini', 'max_depth': 2, 'n_estimators': 100}",0.901149,1.0,1.0,0.89418,0.948832,0.051227,1
6,0.002493,0.000499,0.001496,0.0004987121,entropy,,1,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 1}",0.901149,0.962963,0.962963,0.89418,0.930314,0.032742,9
7,0.016705,0.001781,0.002992,0.0007052854,entropy,,10,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 10}",0.901149,1.0,1.0,0.89418,0.948832,0.051227,1
8,0.144364,0.004013,0.010971,0.00186601,entropy,,100,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 100}",0.901149,1.0,1.0,0.89418,0.948832,0.051227,1
9,0.00374,0.001915,0.001745,0.00082685,entropy,2.0,1,"{'criterion': 'entropy', 'max_depth': 2, 'n_estimators': 1}",0.901149,0.962963,0.962963,0.89418,0.930314,0.032742,9




Classification Report (Testing)
              precision    recall  f1-score   support

not survives       0.00      0.00      0.00         1
    survives       0.96      1.00      0.98        24

    accuracy                           0.96        25
   macro avg       0.48      0.50      0.49        25
weighted avg       0.92      0.96      0.94        25



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
