For our classification and clustering we needed to split the sets of diagnoses and medications which were stored as string, to separate columns in the dataframe. <br>
These columns will get value of 0 or 1 (as one-hot encode) and a value of 0 indicates the absence of a diagnosis or medication, whereas a value of 1 signifies its presence. <br>
we need to do this split and save them to use later for the most 20, 50, 100 frequent diagnoses and medications. <br>
Here we prepare .CSVs containing those columns. 

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor

In [37]:
# Last version
class Split_Diagnosis_Medications:
    def __init__(self, path, no_freq_col):
        self.path = path
        self.no_freq_col = no_freq_col
        self.data_df = None
        self.data_id = None

        if self.no_freq_col is not None:
            self.no_freq_col = self.no_freq_col + 1
        else:
            self.no_freq_col = self.no_freq_col
    # Read data
    def load_data(self):
        #data is all the data which is raed
        data = pd.read_csv(self.path,)
        #print(data.shape)

        # data_df just contains the two columns related to Diagnosis and Medications
        self.data_df = data.iloc[:,-2:].copy(deep=True)
        self.data_id = data['episode_id'].astype(int).copy(deep=True)

     # Split Diagnose and Medication, and save new dataframe  
    def split_diagnosis_medication(self):
        self.load_data()

        Diag_Med = self.data_df
        lists_dict = {}
        actual_diag_list = []
        actual_med_list = []

        # Assuming we have 2 columns one related to diagnosis and one to medications  
        for col_index in range(2):
            col_name = Diag_Med.columns[col_index]

            #Diag_Med.iloc[:, col_index] = Diag_Med.iloc[:, col_index].apply(lambda d: [] if pd.isnull(d) else d)
            Diag_Med.iloc[:, col_index] = Diag_Med.iloc[:, col_index].astype(str)
            Diag_Med.iloc[:, col_index] = Diag_Med.iloc[:, col_index].str.replace("[", "")
            Diag_Med.iloc[:, col_index] = Diag_Med.iloc[:, col_index].str.replace("]", "")
            Diag_Med.iloc[:, col_index] = Diag_Med.iloc[:, col_index].str.replace("'", "")
            Diag_Med.iloc[:, col_index] = Diag_Med.iloc[:, col_index].str.replace(" ", "")
            Diag_Med.iloc[:, col_index] = Diag_Med.iloc[:, col_index].str.split(',')

            # Get the unique values and their counts
            unique_values = pd.Series([item for sublist in Diag_Med[col_name].tolist() for item in sublist])
            if self.no_freq_col is not None:
                top_columns = unique_values.value_counts().head(self.no_freq_col).index.tolist()
            else:
                top_columns = unique_values.unique().tolist()

            lists_dict[col_index] = top_columns
            
            if col_index == 0:
                actual_diag_list = lists_dict[col_index]
            elif col_index == 1:
                actual_med_list = lists_dict[col_index]

        # Create dummy columns for the specified diagnosis and medications
        for diag in actual_diag_list:
            Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
        for med in actual_med_list:
            Diag_Med[med] = Diag_Med[Diag_Med.columns[1]].apply(lambda x: 1 if med in x else 0)

        # Drop the original diagnosis and medication columns
        Diag_Med = Diag_Med.drop(columns=[Diag_Med.columns[0], Diag_Med.columns[1]], axis=1)

        #Merging the obtained results with episode_id column
        episodeid_df = pd.DataFrame(self.data_id)
        Diag_Med_df = pd.concat([episodeid_df, Diag_Med], axis=1)
        
        # print(Diag_Med_df.shape)
        # print(Diag_Med.shape)
        # print(Diag_Med_df.columns)

        Diag_Med_df = Diag_Med_df.dropna(axis=1, how='all')
        Diag_Med_df = Diag_Med_df.loc[:, ~Diag_Med_df.columns.str.contains('Unnamed')]

        # print(Diag_Med_df.shape)
        # print(Diag_Med_df.columns)
        Diag_Med_df = Diag_Med_df.drop('nan', axis=1)

        #print(Diag_Med_df[Diag_Med_df['episode_id']==25034])
        return Diag_Med_df

In [38]:
#test
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC.csv', 50).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC_50.csv',index=False)
e = pd.read_csv("/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC_50.csv")
#e = e.drop('Unnamed: 1', axis=1)
print(len(e))
print(*e.columns)
# print(e.head())
print(len(e.columns))
# e.head()

  Diag_Med[med] = Diag_Med[Diag_Med.columns[1]].apply(lambda x: 1 if med in x else 0)
  Diag_Med[med] = Diag_Med[Diag_Med.columns[1]].apply(lambda x: 1 if med in x else 0)


22676
episode_id F900 F321 F952 F431 F901 F813 F432 F845 F810 F320 F401 E669 F438 F913 F938 F941 X6n0 F939 F422 K590 E109 F849 F402 F83 F322 F411 F811 F908 F412 F500 F951 F930 F980 F82 F981 F928 F439 G933 F501 F70 F929 F81 F419 F4325 F4322 J459 F421 F988 E668 F410 N06BA04 A06BA04 N06BA09 N05CH01 N06AB06 N06BA12 N06AB03 N05AX08 R06AD01 N05AH04 NO6BAO4 N05AX12 NO6BA04 N03AX09 N05AH03 N06AB10 N05AA02 N06AX03 A11EA N05CF01 A12AX C02AC02 N03AG01 N02CX02 H01BA02 NO6BAO9 A06AD65 N05BA01 N06AX16 N06BA02 N05AF03 N06AX11 S01AA01 N05CF02 N05BB01 G03AC09 N06AB04 G03AA09 N06AA04 R06AE07 G03AA07 A06AD11 N06AX12 N05CD02 D07BC01 N05BA04 B03AA07 A03FA01 N01BB20 D07AC13
101


In [39]:
# use the class to make the .csvs
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC.csv', None).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC_All.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC.csv', 20).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC_20.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC.csv', 50).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC_50.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC.csv', 100).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC_100.csv',index=False)

Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC.csv', None).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC_All.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC.csv', 20).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC_20.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC.csv', 50).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC_50.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC.csv', 100).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC_100.csv',index=False)

Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC4.csv', None).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC4_All.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC4.csv', 20).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC4_20.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC4.csv', 50).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC4_50.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_Phecode_ATC4.csv', 100).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_Phecode_ATC4_100.csv',index=False)

Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC4.csv', None).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC4_All.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC4.csv', 20).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC4_20.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC4.csv', 50).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC4_50.csv',index=False)
Split_Diagnosis_Medications('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Full_ICD10_ATC4.csv', 100).split_diagnosis_medication().to_csv('/home/kabank/workbench/.conda/analysis/kabank-data/new-data/Dummies_ICD10_ATC4_100.csv',index=False)


  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag_Med.columns[0]].apply(lambda x: 1 if diag in x else 0)
  Diag_Med[diag] = Diag_Med[Diag