In [172]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

In [173]:
Data = pd.read_csv('TD_HOSPITAL_TRAIN.csv')

In [174]:
data=Data.iloc[:,32:-1]
data.shape

(7058, 11)

In [175]:
data[:5]

Unnamed: 0,administratorcost,urine,diabetes,income,extraprimary,bloodchem6,education,psych5,psych6,information,cancer
0,3525.0,5360.0,0.0,$11-$25k,COPD/CHF/Cirrhosis,167.5,20.0,30.0,2.0,0.0,no
1,43200.0,2570.0,0.0,>$50k,Cancer,480.0,16.0,11.5,1.0,10.0,metastatic
2,5894.0,1690.0,1.0,under $11k,ARF/MOSF,177.125,5.0,18.0,0.0,5.0,yes
3,16717.0,,0.0,$11-$25k,COPD/CHF/Cirrhosis,,12.0,7.0,1.839,12.0,no
4,10151.0,,0.0,under $11k,COPD/CHF/Cirrhosis,233.3125,2.0,7.0,6.0,12.0,no


In [176]:
null_counts = data.isnull().sum()
print(null_counts)

administratorcost     139
urine                3794
diabetes                0
income               2275
extraprimary            0
bloodchem6           1793
education            1219
psych5                 60
psych6                  0
information            24
cancer                  0
dtype: int64


In [178]:
def one_hot_encode_feature(dataframe, feature_name):
    """
    One-hot encodes a specified feature from a DataFrame.

    Parameters:
    - dataframe: The input DataFrame.
    - feature_name: The name of the feature to be one-hot encoded.

    Returns:
    - one_hot_df: A DataFrame containing the one-hot encoded feature.
    """

    # Select the specified feature from the DataFrame
    feature_to_encode = dataframe[feature_name]

    # Reshape the feature to have a 2D shape, required by OneHotEncoder
    feature_to_encode = feature_to_encode.values.reshape(-1, 1)

    # Create an instance of the OneHotEncoder
    encoder = OneHotEncoder(sparse=False)  # You can set sparse=True if you want a sparse matrix

    # Fit the encoder to the feature data
    encoder.fit(feature_to_encode)

    # Transform the feature data to one-hot encoded format
    one_hot_encoded = encoder.transform(feature_to_encode)

    # Convert the one-hot encoded data to a DataFrame for better visualization
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out([feature_name]))
    
    
    new_dataframe = dataframe.drop(columns=[feature_name])  # Remove the target column
    new_dataframe[one_hot_df.columns] = one_hot_df  # Add the source columns to the target DataFrame
    return new_dataframe 

In [167]:
def clean_fill_mean(df, feature):
    mean_value = df[feature].mean()
    df[feature].fillna(mean_value, inplace=True)
    return df


#replacing the outliers after replacing missing values

def replace_outliers_with_mean(df, column_name, threshold=1.5):
    # Calculate lower and upper bounds for outliers
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR

    # Identify outliers in the specified column
    outliers = df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)]

    # Replace outliers with the mean of the column
    non_outliers_mean = df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)][column_name].mean()
    df.loc[outliers.index, column_name] = non_outliers_mean

    return df


#replacing missing values with knn imputer
# for totalcost
def replace_missing_with_knn(df, column_name, n_neighbors=5):
    # Create a copy of the DataFrame to avoid modifying the original data
    df_imputed = df.copy()    
    # Extract the column with missing values for imputation
    column_to_impute = df_imputed[[column_name]]   
    # Initialize KNNImputer with the desired number of neighbors
    imputer = KNNImputer(n_neighbors=n_neighbors)   
    # Perform KNN imputation on the specified column
    column_imputed = imputer.fit_transform(column_to_impute)   
    # Replace the missing values in the original DataFrame with imputed values
    df_imputed[column_name] = column_imputed
    return df_imputed

In [168]:
# data = one_hot_encode_feature(data, 'cancer')
# data = one_hot_encode_feature(data, 'extraprimary')

In [169]:
data = data.drop('urine',axis =1)
data = data.drop('income',axis =1)
data = replace_missing_with_knn(data, 'administratorcost')
data = replace_missing_with_knn(data, 'diabetes')
data = replace_missing_with_knn(data, 'bloodchem6')
data = replace_missing_with_knn(data, 'education')
data = replace_missing_with_knn(data, 'psych5')
data = replace_missing_with_knn(data, 'psych6')
data = replace_missing_with_knn(data, 'information')
data = one_hot_encode_feature(data, 'cancer')
data = one_hot_encode_feature(data, 'extraprimary')



In [205]:
df = Data.iloc[:,21:32]
df[:5]

Unnamed: 0,breathing,age,sleep,dnr,bloodchem5,pdeath,meals,pain,primary,psych4,disability
0,34.0,76.56396,7.519531,dnr before sadm,7.359375,,142.0,2.0,Cirrhosis,,<2 mo. follow-up
1,26.0,63.33499,7.479492,no dnr,7.509766,,132.0,3.0,Colon Cancer,0.0,
2,13.0,70.52698,,no dnr,7.459961,,131.0,1.0,ARF/MOSF w/Sepsis,1.0,no(M2 and SIP pres)
3,18.0,55.31799,7.379883,no dnr,,,121.0,1.0,Cirrhosis,2.0,
4,28.0,67.06598,7.429688,no dnr,7.449219,,133.0,4.0,COPD,5.0,no(M2 and SIP pres)


In [206]:
df = df.drop('pdeath', axis=1)
df = df.drop('psych4', axis=1)
df = replace_missing_with_knn(df, 'sleep', n_neighbors=10)
df = replace_missing_with_knn(df, 'bloodchem5', n_neighbors=10)
df = replace_missing_with_mode(df,'disability')
df = one_hot_encode_feature(df, 'dnr')
df = one_hot_encode_feature(df, 'primary')
df = one_hot_encode_feature(df, 'disability')



In [207]:
df[:5]

Unnamed: 0,breathing,age,sleep,bloodchem5,meals,pain,dnr_dnr after sadm,dnr_dnr before sadm,dnr_no dnr,dnr_nan,...,primary_Cirrhosis,primary_Colon Cancer,primary_Coma,primary_Lung Cancer,primary_MOSF w/Malig,disability_<2 mo. follow-up,disability_Coma or Intub,disability_SIP>=30,disability_adl>=4 (>=5 if sur),disability_no(M2 and SIP pres)
0,34.0,76.56396,7.519531,7.359375,142.0,2.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,26.0,63.33499,7.479492,7.509766,132.0,3.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,13.0,70.52698,7.4141,7.459961,131.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,18.0,55.31799,7.379883,7.415443,121.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,28.0,67.06598,7.429688,7.449219,133.0,4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


<h1>Lab

In [191]:
def replace_missing_with_mode(df, categorical_feature):
    """
    Fill missing values in a categorical feature with the most frequent category.

    Parameters:
    - df: DataFrame containing the data.
    - categorical_feature: Name of the categorical feature/column with missing values.

    Returns:
    - Updated DataFrame with missing values filled in the specified feature.
    """
    # Find the most frequent category in the specified feature
    most_frequent_category = df[categorical_feature].mode()[0]
    
    # Fill missing values in the specified feature with the most frequent category
    df[categorical_feature].fillna(most_frequent_category, inplace=True)
    
    return df

In [192]:
df = Data.iloc[:,21:32]
df[:5]

Unnamed: 0,breathing,age,sleep,dnr,bloodchem5,pdeath,meals,pain,primary,psych4,disability
0,34.0,76.56396,7.519531,dnr before sadm,7.359375,,142.0,2.0,Cirrhosis,,<2 mo. follow-up
1,26.0,63.33499,7.479492,no dnr,7.509766,,132.0,3.0,Colon Cancer,0.0,
2,13.0,70.52698,,no dnr,7.459961,,131.0,1.0,ARF/MOSF w/Sepsis,1.0,no(M2 and SIP pres)
3,18.0,55.31799,7.379883,no dnr,,,121.0,1.0,Cirrhosis,2.0,
4,28.0,67.06598,7.429688,no dnr,7.449219,,133.0,4.0,COPD,5.0,no(M2 and SIP pres)


In [194]:
print(df['disability'].isna().sum())
df = replace_missing_with_mode(df,'disability')
print(df['disability'].isna().sum())

1102
0


In [171]:
data.isna().sum()

administratorcost                  0
diabetes                           0
bloodchem6                         0
education                          0
psych5                             0
psych6                             0
information                        0
cancer_metastatic                  0
cancer_no                          0
cancer_yes                         0
extraprimary_ARF/MOSF              0
extraprimary_COPD/CHF/Cirrhosis    0
extraprimary_Cancer                0
extraprimary_Coma                  0
dtype: int64

In [170]:
data[:5]

Unnamed: 0,administratorcost,diabetes,bloodchem6,education,psych5,psych6,information,cancer_metastatic,cancer_no,cancer_yes,extraprimary_ARF/MOSF,extraprimary_COPD/CHF/Cirrhosis,extraprimary_Cancer,extraprimary_Coma
0,3525.0,0.0,167.5,20.0,30.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,43200.0,0.0,480.0,16.0,11.5,1.0,10.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5894.0,1.0,177.125,5.0,18.0,0.0,5.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,16717.0,0.0,241.380204,12.0,7.0,1.839,12.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,10151.0,0.0,233.3125,2.0,7.0,6.0,12.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [152]:
data = replace_missing_with_knn(data, 'pch5')

KeyError: "None of [Index(['pysch5'], dtype='object')] are in the [columns]"

In [111]:
# one_hot_encode_feature(data,'cancer')
data[:5]

Unnamed: 0,administratorcost,urine,diabetes,income,extraprimary,bloodchem6,education,psych5,psych6,information,cancer_metastatic,cancer_no,cancer_yes
0,3525.0,5360.0,0.0,$11-$25k,COPD/CHF/Cirrhosis,167.5,20.0,30.0,2.0,0.0,0.0,1.0,0.0
1,43200.0,2570.0,0.0,>$50k,Cancer,480.0,16.0,11.5,1.0,10.0,1.0,0.0,0.0
2,5894.0,1690.0,1.0,under $11k,ARF/MOSF,177.125,5.0,18.0,0.0,5.0,0.0,0.0,1.0
3,16717.0,,0.0,$11-$25k,COPD/CHF/Cirrhosis,,12.0,7.0,1.839,12.0,0.0,1.0,0.0
4,10151.0,,0.0,under $11k,COPD/CHF/Cirrhosis,233.3125,2.0,7.0,6.0,12.0,0.0,1.0,0.0


In [99]:
def replace_column_with_columns(target_df, source_df, target_column_name, source_columns):
    if target_column_name in target_df.columns:
        target_df.drop(columns=[target_column_name], inplace=True)  # Remove the target column
        target_df[source_columns] = source_df  # Add the source columns to the target DataFrame
    else:
        print("Target column not found in the target DataFrame.")
    

In [100]:
replace_column_with_columns(data,one_hot,'cancer',one_hot.columns )
data[:5]

Unnamed: 0,administratorcost,urine,diabetes,income,extraprimary,bloodchem6,education,psych5,psych6,information,cancer_metastatic,cancer_no,cancer_yes
0,3525.0,5360.0,0.0,$11-$25k,COPD/CHF/Cirrhosis,167.5,20.0,30.0,2.0,0.0,0.0,1.0,0.0
1,43200.0,2570.0,0.0,>$50k,Cancer,480.0,16.0,11.5,1.0,10.0,1.0,0.0,0.0
2,5894.0,1690.0,1.0,under $11k,ARF/MOSF,177.125,5.0,18.0,0.0,5.0,0.0,0.0,1.0
3,16717.0,,0.0,$11-$25k,COPD/CHF/Cirrhosis,,12.0,7.0,1.839,12.0,0.0,1.0,0.0
4,10151.0,,0.0,under $11k,COPD/CHF/Cirrhosis,233.3125,2.0,7.0,6.0,12.0,0.0,1.0,0.0


In [88]:
print(data.columns[0])

administratorcost


In [78]:
data[:5]

Unnamed: 0,administratorcost,urine,diabetes,income,extraprimary,bloodchem6,education,psych5,psych6,information,cancer
0,3525.0,5360.0,0.0,$11-$25k,COPD/CHF/Cirrhosis,167.5,20.0,30.0,2.0,0.0,no
1,43200.0,2570.0,0.0,>$50k,Cancer,480.0,16.0,11.5,1.0,10.0,metastatic
2,5894.0,1690.0,1.0,under $11k,ARF/MOSF,177.125,5.0,18.0,0.0,5.0,yes
3,16717.0,,0.0,$11-$25k,COPD/CHF/Cirrhosis,,12.0,7.0,1.839,12.0,no
4,10151.0,,0.0,under $11k,COPD/CHF/Cirrhosis,233.3125,2.0,7.0,6.0,12.0,no


<h1>Lab

In [29]:
print(data.extraprimary.unique())

['COPD/CHF/Cirrhosis' 'Cancer' 'ARF/MOSF' 'Coma']


In [30]:
print(data['extraprimary'].isna().sum())

0


In [54]:
enc = preprocessing.OneHotEncoder()

enc.fit(pd.DataFrame(data['cancer']))
one_hot = enc.transform(data['cancer'])
one_hot



ValueError: X does not contain any features, but OneHotEncoder is expecting 1 features

In [49]:
type(np.array(data['cancer']))
np.array(data['cancer']).shape
np.array(data['cancer']).reshape(-1,1).shape

(7058, 1)

In [39]:
type(data)

pandas.core.frame.DataFrame

In [41]:
type(data['information'])

KeyError: ('cancer', 'information')

In [33]:
# Drop column B as it is now encoded
df = df.drop('B',axis = 1)
# Join the encoded df
df = df.join(one_hot)
df

NameError: name 'df' is not defined

In [61]:
def one_hot_encode_feature(dataframe, feature_name):
    """
    One-hot encodes a specified feature from a DataFrame.

    Parameters:
    - dataframe: The input DataFrame.
    - feature_name: The name of the feature to be one-hot encoded.

    Returns:
    - one_hot_df: A DataFrame containing the one-hot encoded feature.
    """

    # Select the specified feature from the DataFrame
    feature_to_encode = dataframe[feature_name]

    # Reshape the feature to have a 2D shape, required by OneHotEncoder
    feature_to_encode = feature_to_encode.values.reshape(-1, 1)

    # Create an instance of the OneHotEncoder
    encoder = OneHotEncoder(sparse=False)  # You can set sparse=True if you want a sparse matrix

    # Fit the encoder to the feature data
    encoder.fit(feature_to_encode)

    # Transform the feature data to one-hot encoded format
    one_hot_encoded = encoder.transform(feature_to_encode)

    # Convert the one-hot encoded data to a DataFrame for better visualization
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out([feature_name]))

    return one_hot_df

In [56]:
data[:5]

Unnamed: 0,administratorcost,urine,diabetes,income,extraprimary,bloodchem6,education,psych5,psych6,information,cancer
0,3525.0,5360.0,0.0,$11-$25k,COPD/CHF/Cirrhosis,167.5,20.0,30.0,2.0,0.0,no
1,43200.0,2570.0,0.0,>$50k,Cancer,480.0,16.0,11.5,1.0,10.0,metastatic
2,5894.0,1690.0,1.0,under $11k,ARF/MOSF,177.125,5.0,18.0,0.0,5.0,yes
3,16717.0,,0.0,$11-$25k,COPD/CHF/Cirrhosis,,12.0,7.0,1.839,12.0,no
4,10151.0,,0.0,under $11k,COPD/CHF/Cirrhosis,233.3125,2.0,7.0,6.0,12.0,no


In [63]:
one_hot_category = one_hot_encode_feature(data, 'cancer')
one_hot_category



Unnamed: 0,cancer_metastatic,cancer_no,cancer_yes
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
7053,0.0,1.0,0.0
7054,1.0,0.0,0.0
7055,0.0,1.0,0.0
7056,0.0,0.0,1.0


In [None]:
# for glucose, psych2, 
def clean_fill_mean(df, feature):
    mean_value = df[feature].mean()
    df[feature].fillna(mean_value, inplace=True)
    return df


#replacing the outliers after replacing missing values

def replace_outliers_with_mean(df, column_name, threshold=1.5):
    # Calculate lower and upper bounds for outliers
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR

    # Identify outliers in the specified column
    outliers = df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)]

    # Replace outliers with the mean of the column
    non_outliers_mean = df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)][column_name].mean()
    df.loc[outliers.index, column_name] = non_outliers_mean

    return df


#replacing missing values with knn imputer
# for totalcost
def replace_missing_with_knn(df, column_name, n_neighbors=5):
    # Create a copy of the DataFrame to avoid modifying the original data
    df_imputed = df.copy()    
    # Extract the column with missing values for imputation
    column_to_impute = df_imputed[[column_name]]   
    # Initialize KNNImputer with the desired number of neighbors
    imputer = KNNImputer(n_neighbors=n_neighbors)   
    # Perform KNN imputation on the specified column
    column_imputed = imputer.fit_transform(column_to_impute)   
    # Replace the missing values in the original DataFrame with imputed values
    df_imputed[column_name] = column_imputed
    return df_imputed

def replace_missing values(df, column_name,n_neighbors=5)
    col = df[column_name]
    print(col.isna().sum())

df = replace_missing_with_knn(df, 'totalcost', n_neighbors=10)
df = replace_missing_with_knn(df, 'confidence', n_neighbors=10)
df = clean_fill_mean(df, 'psych2')
df = clean_fill_mean(df, 'bloodchem3')
df = df.drop('glucose',axis =1)
df = df.drop('bloodchem4', axis =1)

In [160]:
null_counts = data.isnull().sum()
print(null_counts)

administratorcost    0
diabetes             0
extraprimary         0
bloodchem6           0
education            0
psych5               0
psych6               0
information          0
cancer               0
dtype: int64
