In [61]:
import pandas as pd 
import numpy as np 
def RMSE(x_pred,x_true):
   return np.sqrt(np.mean((x_true - x_pred)**2))
def confusion_matrix(y_true, y_pred):
    classes = np.unique(y_true)
    n_classes = len(classes)
    conf_matrix = np.zeros((n_classes, n_classes), dtype=int)

    for i in range(len(y_true)):
        true_class = y_true[i]
        pred_class = y_pred[i]
        conf_matrix[true_class][pred_class] += 1
        
    return conf_matrix
def accuracy(y_pred, y_true):
    return np.mean(y_true == y_pred)

""""Epss : Large number -> abnormal
Lvdd: Large hearts tend to be sick hearts
Wall_Motion_index: should be around 12-13 """

col_names = ['Months_Survived','Still_Alive','Age_o_HA',
'Effusion','Fractional_Short','Epss',
'Lvdd','Wall_MotionSc','Wallmotion_index','Mult',
'Name','Group','Alive_at_1']
p = pd.read_csv('/home/dimitriskana/workspace/Echocardiogram/echocardiogram.data',  on_bad_lines= 'skip', names = col_names)
Data = p.drop(['Wall_MotionSc','Name','Group','Mult'],axis=1)
Data[Data =='?' ] = np.nan
Data = Data.astype(float)

Here we need to clear the data. 
The data contains lot of ? values that I translated to nans for better use. 
I cleaned the data using the Still_Alive and Months_Survived columns,
while using hints from the researchers,in order our target to be nan-free

In [48]:

id = Data[(Data['Months_Survived']<12) & (Data['Alive_at_1']!=1)]
#clear the nans for patients that survived less than a year, 
# in the target column I assign the still_alive value
Data.loc[id.index,'Alive_at_1'] = Data.iloc[id.index]['Still_Alive']
Data.loc[Data['Months_Survived']>12,'Alive_at_1'] = 1
#check if patients that are still alive after 12 months,
#  are labeled as alive
Data.loc[(Data['Months_Survived']>12)&Data['Still_Alive']==1,'Alive_at_1'] = 1
# we create a dataframe without patients that are alive less than a year
df = Data[~((Data['Months_Survived']<12) & (Data['Alive_at_1']==1))].reset_index().drop('index',axis =1)
#we also want to get rid some more nans for our target,
#  we assign the value of  the Still_Alive column to our target   
i = df[df['Alive_at_1'].isna()].index
df.loc[i,'Alive_at_1'] = df.loc[i,'Still_Alive']




Still we have missing values in crucial attributes of a patient. 
We will try to solve that using a variety of techniques such as : 
1. dropping all the rows that contain nans
2. calculate the values using methods I found in a paper and comparing it to an open-source library
3. using regression to calculate the missing values
After that we will see what worked better and decide

In [44]:
#Map of the missing values
df.isna().sum()

Months_Survived     1
Still_Alive         0
Age_o_HA            3
Effusion            0
Fractional_Short    3
Epss                9
Lvdd                4
Wallmotion_index    0
Alive_at_1          0
dtype: int64

In [62]:
#droping the still alive as it is incoporated to the alive at 1 and the months_survived
# we use standardization for the X data only for the non categorical values 
Drop_Data = df.dropna().drop('Still_Alive',axis=1)
X = (Drop_Data.copy()*Drop_Data.mean()/Drop_Data.std()).to_numpy()
#target
y =Drop_Data['Alive_at_1'].to_numpy().astype('int64')

In [63]:
from sklearn.model_selection import train_test_split
from tree import DecisionTree as DT 
from forrest import R_Forrest as R_F
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=1234)
D = DT(min_samples_split=2, max_depth=200)
D.fit(X_train,y_train)
D.predict(X_test)

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1])

In [64]:
CM = confusion_matrix(D.predict(X_test),y_test)
print(CM)
F = RMSE(D.predict(X_test),y_test)
print(F)

[[ 3  0]
 [ 0 25]]
0.0


In [65]:
accuracy(D.predict(X_test),y_test)

1.0

conf_matrix: [[TN,FP
               FN,TP]]

In [66]:
Forest = R_F(min_samples_split= 2, max_depth=200,n_trees=2)
Forest.fit(X_train,y_train)
XX = Forest.predict(X_test)
print(RMSE(XX,y_test))
print(accuracy(XX,y_test))
#print(confusion_matrix(XX,y_test))

0.2672612419124244
0.9285714285714286


Now we replace the nans with their respective median  

In [58]:
Mead_Data = df.drop('Still_Alive',axis=1)
Mead_Data.fillna(Mead_Data.median(), inplace=True)
Xm = (Mead_Data*Mead_Data.mean()/Mead_Data.std()).to_numpy()
#target
ym =Mead_Data['Alive_at_1'].to_numpy().astype('int64')

X_trainm, X_testm, y_trainm, y_testm = train_test_split(X, y,
 test_size = 0.33, random_state=1234)
D.fit(X_trainm,y_trainm)
D.predict(X_testm)


array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [67]:
CMm = confusion_matrix(D.predict(X_testm),y_testm)
print(CMm)
Fm = RMSE(D.predict(X_testm),y_testm)
print(f'The RMSE is :  {Fm :.4f}')
print(f'The accuracy of the model is : {accuracy(D.predict(X_testm),y_testm) :.4f}')

[[ 2  0]
 [ 0 31]]
The RMSE is :  0.0000
The accuracy of the model is : 1.0000


In [45]:
Forest.fit(X_trainm,y_trainm)
XXm = Forest.predict(X_testm)
print(RMSE(XXm,y_testm))

0.17407765595569785
