In [1]:
import pandas as pd 
import numpy as np 
def RMSE(x_pred,x_true):
   return np.sqrt(np.mean((x_true - x_pred)**2))
def confusion_matrix(y_pred, y_true):
    classes = np.unique(y_true)
    n_classes = len(classes)
    conf_matrix = np.zeros((n_classes, n_classes), dtype=int)

    for i in range(len(y_true)):
        true_class = y_true[i]
        pred_class = y_pred[i]
        conf_matrix[true_class][pred_class] += 1
        
    return conf_matrix
def accuracy(y_pred, y_true):
    return np.mean(y_true == y_pred)

""""Epss : Large number -> abnormal
Lvdd: Large hearts tend to be sick hearts
Wall_Motion_index: should be around 12-13 """

col_names = ['Months_Survived','Still_Alive','Age_o_HA',
'Effusion','Fractional_Short','Epss',
'Lvdd','Wall_MotionSc','Wallmotion_index','Mult',
'Name','Group','Alive_at_1']
p = pd.read_csv('/home/dimitriskana/workspace/Echocardiogram/echocardiogram.data',  on_bad_lines= 'skip', names = col_names)
Data = p.drop(['Wall_MotionSc','Name','Group','Mult'],axis=1)
Data[Data =='?' ] = np.nan
Data = Data.astype(float)

Here we need to clear the data. 
The data contains lot of ? values that I translated to nans for better use. 
I cleaned the data using the Still_Alive and Months_Survived columns,
while using hints from the researchers,in order our target to be nan-free

In [2]:

id = Data[(Data['Months_Survived']<12) & (Data['Alive_at_1']!=1)]
#clear the nans for patients that survived less than a year, 
# in the target column I assign the still_alive value
Data.loc[id.index,'Alive_at_1'] = Data.iloc[id.index]['Still_Alive']
Data.loc[Data['Months_Survived']>12,'Alive_at_1'] = 1
#check if patients that are still alive after 12 months,
#  are labeled as alive
Data.loc[(Data['Months_Survived']>12)&Data['Still_Alive']==1,'Alive_at_1'] = 1
# we create a dataframe without patients that are alive less than a year
df = Data[~((Data['Months_Survived']<12) & (Data['Alive_at_1']==1))].reset_index().drop('index',axis =1)
#we also want to get rid some more nans for our target,
#  we assign the value of  the Still_Alive column to our target   
i = df[df['Alive_at_1'].isna()].index
df.loc[i,'Alive_at_1'] = df.loc[i,'Still_Alive']




Still we have missing values in crucial attributes of a patient. 
We will try to solve that using a variety of techniques such as : 
1. dropping all the rows that contain nans
2. calculate the values using methods I found in a paper and comparing it to an open-source library
3. using regression to calculate the missing values
After that we will see what worked better and decide

In [4]:
#Map of the missing values
df.isna().sum()

Months_Survived     1
Still_Alive         0
Age_o_HA            3
Effusion            0
Fractional_Short    3
Epss                9
Lvdd                4
Wallmotion_index    0
Alive_at_1          0
dtype: int64

In [3]:
#droping the still alive as it is incoporated to the alive at 1 and the months_survived
# we use standardization for the X data only for the non categorical values 
Drop_Data = df.dropna().drop('Still_Alive',axis=1)
X = (Drop_Data.copy()*Drop_Data.mean()/Drop_Data.std()).to_numpy()
#target
y =Drop_Data['Alive_at_1'].to_numpy().astype('int64')

In [4]:
from sklearn.model_selection import train_test_split
from tree import DecisionTree as DT 
from forrest import R_Forrest as R_F
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=1234)
D = DT(min_samples_split=2, max_depth=200)
D.fit(X_train,y_train)
D.predict(X_test)

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1])

In [5]:
CM = confusion_matrix(D.predict(X_test),y_test)
print(CM)
F = RMSE(D.predict(X_test),y_test)
print(F)

[[ 3  0]
 [ 3 22]]
0.32732683535398854


In [6]:
accuracy(D.predict(X_test),y_test)

0.8928571428571429

conf_matrix: [[TN,FP
               FN,TP]]

In [7]:
Forest = R_F(min_samples_split=2, max_depth=200,n_trees=2)
Forest.fit(X_train,y_train)
XX = Forest.predict(X_test)
print(RMSE(XX,y_test))
print(accuracy(XX,y_test))


0.0
1.0


Now we replace the nans with their respective median  

In [8]:
Mead_Data = df.drop('Still_Alive',axis=1)
Mead_Data.fillna(Mead_Data.median(), inplace=True)
Xm = (Mead_Data*Mead_Data.mean()/Mead_Data.std()).to_numpy()
#target
ym =Mead_Data['Alive_at_1'].to_numpy().astype('int64')

X_trainm, X_testm, y_trainm, y_testm = train_test_split(X, y,
 test_size = 0.33, random_state=1234)
D.fit(X_trainm,y_trainm)
D.predict(X_testm)


array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1])

In [21]:
CMm = confusion_matrix(D.predict(X_testm),y_testm)
print(CMm)
Fm = RMSE(D.predict(X_testm),y_testm)
print(f'The RMSE is :  {Fm :.4f}')
print(f'The accuracy of the model is : {accuracy(D.predict(X_testm),y_testm) :.4f}')

[[ 3  0]
 [ 0 25]]
The RMSE is :  0.0000
The accuracy of the model is : 1.0000


In [10]:
Forest.fit(X_trainm,y_trainm)
XXm = Forest.predict(X_testm)
#print(f'The confusion matrix : {confusion_matrix(XXm,y_testm) :.4f}')
print(f'The RMSE is :  {RMSE(XXm,y_testm) :.4f}')
print(f'The accuracy of the model is : {accuracy(XXm,y_testm) :.4f}')

The RMSE is :  0.2673
The accuracy of the model is : 0.9286


In [19]:
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute
Imputers = [SoftImpute(),KNN(),NuclearNormMinimization()]
rm = []
accs = [ ]
for item in Imputers : 
    new_df = pd.DataFrame()
    new_df = pd.DataFrame(item.fit_transform(df), columns = df.columns)
    
    new_df = new_df.drop('Still_Alive',axis=1)
    
    #norm
    X = (new_df*new_df.mean()/new_df.std()).to_numpy()
    #target
    y =new_df['Alive_at_1'].to_numpy().astype('int64')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=1234)
    D.fit(X_train,y_train)
    Pr = D.predict(X_test)
    rm.append(RMSE(Pr,y_test))
    accs.append(accuracy(Pr,y_test))
    print(f'The confusion matrix is: {confusion_matrix(Pr,y_test)}')

    
    


[SoftImpute] Max Singular Value of X_init = 674.373236
[SoftImpute] Iter 1: observed MAE=0.553216 rank=3
[SoftImpute] Iter 2: observed MAE=0.549412 rank=3
[SoftImpute] Iter 3: observed MAE=0.550459 rank=3
[SoftImpute] Iter 4: observed MAE=0.551597 rank=3
[SoftImpute] Iter 5: observed MAE=0.552465 rank=3
[SoftImpute] Iter 6: observed MAE=0.553052 rank=3
[SoftImpute] Iter 7: observed MAE=0.553442 rank=3
[SoftImpute] Iter 8: observed MAE=0.553706 rank=3
[SoftImpute] Iter 9: observed MAE=0.553875 rank=3
[SoftImpute] Iter 10: observed MAE=0.553940 rank=3
[SoftImpute] Iter 11: observed MAE=0.553991 rank=3
[SoftImpute] Iter 12: observed MAE=0.553986 rank=3
[SoftImpute] Iter 13: observed MAE=0.553942 rank=3
[SoftImpute] Iter 14: observed MAE=0.553876 rank=3
[SoftImpute] Iter 15: observed MAE=0.553786 rank=3
[SoftImpute] Iter 16: observed MAE=0.553676 rank=3
[SoftImpute] Iter 17: observed MAE=0.553555 rank=3
[SoftImpute] Iter 18: observed MAE=0.553430 rank=3
[SoftImpute] Iter 19: observed MAE=0

In [20]:
accs


[1.0, 0.9696969696969697, 1.0]

Overall, We could conclude that if we repalce nans with the median works as fine as more sophisticated imputation techniques we used with fancyimpute.
The decision tree seems to be working very well for this categorization of the dataset. 