1) Filters out non-SMI diagnoses from 'uncombined_SMI.csv' (14 found), producing a new filtered file called 'SMI_uncombined_filtered.csv'.

2) Imports SMI data ('uncombined_SMI.csv'), removes the non-SMI diagnoses and then restructures the patient data so that there is only one row per patient, rather than a separate row for each different SMI. This new file is called 'Part2_SMI_data.csv'. In addition, an alternative version of 'Part2_SMI_data.csv' is created in which the SMIs are simplified into 4 categories. This file is called 'Part2_SMI_data_SMIs_Grouped.csv'.

In [None]:
import os
# Show the entries of the current working directory; presumably a quick check that the input CSV files are present.
os.listdir()

In [None]:
import pandas as pd
import numpy as np
import re

# Filtering out non-SMI diagnoses from dataset

In [None]:
# Diagnosis descriptions that appear in the extract but are not treated as SMIs.
# Any row whose SMI_Description matches one of these exactly is removed.
SMIs_to_remove = [
    'Western Germanic language',
    'Germanic language',
    'Asperger syndrome',
    '(Rel + psych illn) or (alc:[spouse]/[husb) or (schiz child)',
    '[V]Personal history of affective disorder',
    'Unspecified schizoid personality disorder',
    'Language disorder associated with thought disorder',
    'Schizoid personality disorder NOS',
    'FH: Schizophrenia',
    'FH: Manic-depressive state',
    'Schizoid personality disorder',
    'Schizoid character',
    'Schizophrenia association member',
    'Manic-depression association member',
]

In [None]:
#Load in 'uncombined_SMI.csv' and remove 'non-SMIs'

df = pd.read_csv('uncombined_SMI.csv')

# Replace the raw CTV3 column headers with SMI-specific names.
df = df.rename(columns={'CTV3Code': 'SMI_Code', 'CT3TermText': 'SMI_Description'})

# Drop the unnamed leading column (presumably an index written by an earlier CSV export).
# `axis` is passed by keyword: the positional form drop(label, 1) was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = df.drop("Unnamed: 0", axis=1)

# Strip tab characters from the description text so that exact string matching works.
df['SMI_Description'] = df['SMI_Description'].map(lambda x: re.sub(r'\t', '', x))

# Row labels of every entry whose description is in the non-SMI list.
non_smi_indices = df[df['SMI_Description'].isin(SMIs_to_remove)].index.tolist()

# Drop those rows by label. The previous form, df.index[[non_smi_indices]], wrapped the
# list in a second list, which builds a 2-D indexer that current numpy/pandas reject.
df = df.drop(non_smi_indices)

df = df.reset_index(drop=True)

# Work on an independent copy so the filtered frame `df` stays untouched by later cells.
df1 = df.copy()
df1

In [None]:
#Sanity check on the filter: count the distinct descriptions that remain (46 expected)

unique_SMIs_F = set(df1['SMI_Description'])
len(unique_SMIs_F)

In [None]:
#Display the set of unique SMI descriptions that remain after filtering

unique_SMIs_F

In [None]:
#Report the number of diagnosis rows and the number of distinct patients behind them

patient_ids = df1['PatientId']

#One row per recorded diagnosis, so this counts entries rather than people
print("Total patient entries: ", len(patient_ids))

#Collapsing to a set leaves one element per distinct PatientId
print("Unique patients: ", len(set(patient_ids)))


In [None]:
#Save df1 to a CSV file ('SMI_uncombined_filtered.csv')
#NOTE(review): ExcelWriter is imported here but not used by this cell.

from pandas import ExcelWriter

df1.to_csv('SMI_uncombined_filtered.csv', sep=',')

# Combining all patient entries into a single row

In [None]:
#Reset index labels so that row labels and row positions coincide
df1 = df1.reset_index(drop=True)

#Flag every row whose PatientId already appeared higher up (True = repeat entry)
#and print how many such rows there are
df1_p_dup = df1["PatientId"].duplicated()
print(df1_p_dup.sum())

#Row labels of the first entry encountered for each patient
false_index_list = df1_p_dup[~df1_p_dup].index.tolist()

#First entry per patient, restricted to the diagnosis columns
dfa = df1.loc[false_index_list, ['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed']]

#Every other row is carried into the next round. Rows are dropped by label;
#the previous df1.index[[false_index_list]] nested the list inside another list,
#which builds a 2-D indexer that current numpy/pandas reject.
df2 = df1.drop(false_index_list)
dfa

In [None]:
#Repeat 1: peel off each patient's second entry

df2 = df2.reset_index(drop=True)

#True for every row whose PatientId already appeared higher up in df2
df2_p_dup = df2["PatientId"].duplicated()
print(df2_p_dup.sum())

#Row labels of the first remaining entry per patient
false_index_list_2 = df2[~df2_p_dup].index.tolist()

#Rows not selected this round go on to the next one. Dropped by label; the previous
#df2.index[[false_index_list_2]] built a 2-D indexer that current numpy/pandas reject.
df3 = df2.drop(false_index_list_2)

#Select and rename in one expression so a new frame is produced, instead of
#renaming a slice of df2 in place (which triggers SettingWithCopyWarning)
dfb = df2.loc[
    false_index_list_2,
    ['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed'],
].rename(columns={'SMI_Code': 'SMI_Code_2', 'SMI_Description': 'SMI_Description_2','Date_Of_Diagnosis': 'Date_Of_Diagnosis_2', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_2'})
dfb

In [None]:
#Left-join each patient's second entry (dfb) onto their first (dfa);
#patients with no second entry get NaN in the *_2 columns

df_merge = dfa.merge(dfb, how='left', on='PatientId')
df_merge

In [None]:
#Repeat 2 (Keep repeating until N =0): peel off each patient's third entry

df3 = df3.reset_index(drop=True)

#True for every row whose PatientId already appeared higher up in df3;
#N is the number of rows that will be left over for the next round
df3_p_dup = df3["PatientId"].duplicated()
print('N:', df3_p_dup.sum())

#Row labels of the first remaining entry per patient
false_index_list_3 = df3[~df3_p_dup].index.tolist()

#Rows not selected this round go on to the next one. Dropped by label; the previous
#df3.index[[false_index_list_3]] built a 2-D indexer that current numpy/pandas reject.
df4 = df3.drop(false_index_list_3)

#Select and rename in one expression so a new frame is produced, instead of
#renaming a slice of df3 in place (which triggers SettingWithCopyWarning)
dfc = df3.loc[
    false_index_list_3,
    ['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed'],
].rename(columns={'SMI_Code': 'SMI_Code_3', 'SMI_Description': 'SMI_Description_3','Date_Of_Diagnosis': 'Date_Of_Diagnosis_3', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_3'})
dfc

In [None]:
#Left-join each patient's third entry (dfc) onto the running one-row-per-patient table

df_merge_2 = df_merge.merge(dfc, how='left', on='PatientId')

df_merge_2

In [None]:
#Repeat 3 (Keep repeating until N =0): peel off each patient's fourth entry

df4 = df4.reset_index(drop=True)

#True for every row whose PatientId already appeared higher up in df4;
#N is the number of rows that will be left over for the next round
df4_p_dup = df4["PatientId"].duplicated()
print('N:', df4_p_dup.sum())

#Row labels of the first remaining entry per patient
false_index_list_4 = df4[~df4_p_dup].index.tolist()

#Rows not selected this round go on to the next one. Dropped by label; the previous
#df4.index[[false_index_list_4]] built a 2-D indexer that current numpy/pandas reject.
df5 = df4.drop(false_index_list_4)

#Select and rename in one expression so a new frame is produced, instead of
#renaming a slice of df4 in place (which triggers SettingWithCopyWarning)
dfd = df4.loc[
    false_index_list_4,
    ['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed'],
].rename(columns={'SMI_Code': 'SMI_Code_4', 'SMI_Description': 'SMI_Description_4','Date_Of_Diagnosis': 'Date_Of_Diagnosis_4', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_4'})
dfd

In [None]:
# Number of rows in dfd, i.e. how many patients have a fourth entry
print(len(dfd))

In [None]:
#Left-join each patient's fourth entry (dfd) onto the running one-row-per-patient table

df_merge_3 = df_merge_2.merge(dfd, how='left', on='PatientId')

df_merge_3

In [None]:
#Repeat 4 (Keep repeating until N =0): peel off each patient's fifth entry

df5 = df5.reset_index(drop=True)

#True for every row whose PatientId already appeared higher up in df5;
#N is the number of rows that will be left over for the next round
df5_p_dup = df5["PatientId"].duplicated()
print('N:', df5_p_dup.sum())

#Row labels of the first remaining entry per patient
false_index_list_5 = df5[~df5_p_dup].index.tolist()

#Rows not selected this round go on to the next one. Dropped by label; the previous
#df5.index[[false_index_list_5]] built a 2-D indexer that current numpy/pandas reject.
df6 = df5.drop(false_index_list_5)

#Select and rename in one expression so a new frame is produced, instead of
#renaming a slice of df5 in place (which triggers SettingWithCopyWarning)
dfe = df5.loc[
    false_index_list_5,
    ['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed'],
].rename(columns={'SMI_Code': 'SMI_Code_5', 'SMI_Description': 'SMI_Description_5','Date_Of_Diagnosis': 'Date_Of_Diagnosis_5', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_5'})
dfe

In [None]:
#Left-join each patient's fifth entry (dfe) onto the running one-row-per-patient table
df_merge_4 = df_merge_3.merge(dfe, how='left', on='PatientId')

df_merge_4

In [None]:
#Repeat 5 (Keep repeating until N =0): peel off each patient's sixth entry

df6 = df6.reset_index(drop=True)

#True for every row whose PatientId already appeared higher up in df6;
#N is the number of rows that will be left over in df7
df6_p_dup = df6["PatientId"].duplicated()
print('N:', df6_p_dup.sum())

#Row labels of the first remaining entry per patient
false_index_list_6 = df6[~df6_p_dup].index.tolist()

#Rows not selected this round. Dropped by label; the previous
#df6.index[[false_index_list_6]] built a 2-D indexer that current numpy/pandas reject.
#NOTE(review): no cell visible here merges df7, so any rows left in it are silently
#excluded from the combined table - confirm that N printed above is 0.
df7 = df6.drop(false_index_list_6)

#Select and rename in one expression so a new frame is produced, instead of
#renaming a slice of df6 in place (which triggers SettingWithCopyWarning)
dff = df6.loc[
    false_index_list_6,
    ['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed'],
].rename(columns={'SMI_Code': 'SMI_Code_6', 'SMI_Description': 'SMI_Description_6','Date_Of_Diagnosis': 'Date_Of_Diagnosis_6', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_6'})
dff

In [None]:
#Final left-join: add each patient's sixth entry (dff) to complete the one-row-per-patient table
SMI_data = df_merge_4.merge(dff, how='left', on='PatientId')
SMI_data

In [None]:
#Export 'SMI_data' (one row per patient) to a CSV file ('Part2_SMI_data.csv')
#NOTE(review): ExcelWriter is imported here but not used by this cell.

from pandas import ExcelWriter

SMI_data.to_csv('Part2_SMI_data.csv', sep=',')

In [None]:
#Build the unique patient list: one row per patient holding the PatientId and the
#age recorded on that patient's first listed entry

unique_SMI_patients = SMI_data.loc[:, ['PatientId','Age_SMI_Diagnosed']]

print(unique_SMI_patients.shape[0])

In [None]:

# Display the unique patient list (PatientId and Age_SMI_Diagnosed columns)
unique_SMI_patients

In [None]:
#Export 'unique_SMI_patients' to a CSV file ('unique_smi_PatientList.csv')

unique_SMI_patients.to_csv('unique_smi_PatientList.csv', sep=',')

# Combining all patients into a single row - SMIs grouped into categories

In [None]:
#Read in 'characterising_smis.csv', which acts as a reference for how to categorise each SMI in the dataset.
#The cells that follow read its 'SMI' column plus one flag column per category
#('schizophrenia', 'bipolar disorder', 'other SMI', 'Unknown/unspecified').

c_smis = pd.read_csv('characterising_smis.csv')
c_smis

In [None]:
#Creating a list of all SMIs that can be categorized as 'schizophrenia'

#.loc replaces .ix, which was deprecated in pandas 0.20 and removed in pandas 1.0
schizophrenia_rows = c_smis.loc[c_smis['schizophrenia'] == True]

#Strip tab characters so the names match the cleaned SMI_Description values exactly
schizophrenia_list = list(schizophrenia_rows['SMI'].map(lambda x: re.sub(r'\t', '', x)))
schizophrenia_list

In [None]:
#Creating a list of all SMIs that can be categorized as 'bipolar'

#.loc replaces .ix, which was deprecated in pandas 0.20 and removed in pandas 1.0
bipolar_rows = c_smis.loc[c_smis['bipolar disorder'] == True]

#Strip tab characters so the names match the cleaned SMI_Description values exactly
bipolar_list = list(bipolar_rows['SMI'].map(lambda x: re.sub(r'\t', '', x)))
bipolar_list

In [None]:
#Creating a list of all SMIs that can be categorized as 'other smi'

#.loc replaces .ix, which was deprecated in pandas 0.20 and removed in pandas 1.0
other_smi_rows = c_smis.loc[c_smis['other SMI'] == True]

#Strip tab characters so the names match the cleaned SMI_Description values exactly
other_smi_list = list(other_smi_rows['SMI'].map(lambda x: re.sub(r'\t', '', x)))
other_smi_list

In [None]:
#Creating a list of all SMIs that can be categorized as 'unknown/unspecified'

#.loc replaces .ix, which was deprecated in pandas 0.20 and removed in pandas 1.0
unknown_smi_rows = c_smis.loc[c_smis['Unknown/unspecified'] == True]

#Strip tab characters so the names match the cleaned SMI_Description values exactly
unknown_smi_list = list(unknown_smi_rows['SMI'].map(lambda x: re.sub(r'\t', '', x)))
unknown_smi_list

In [None]:
# Display the filtered per-diagnosis frame before the SMI category column is added
df1

In [None]:
#function to classify SMIs

def smi_category_sorting(SMI):
    """Map an SMI description to its broad category label.

    The description is checked against the module-level reference lists in a
    fixed order (schizophrenia, bipolar, other SMI, unknown SMI) and the label
    of the first list that contains it is returned.

    Parameters
    ----------
    SMI : str
        SMI description to classify.

    Returns
    -------
    str or None
        "Schizophrenia", "Bipolar", "Other SMI" or "Unknown SMI"; None when
        the description is in none of the reference lists.
    """
    category_lookup = (
        (schizophrenia_list, "Schizophrenia"),
        (bipolar_list, "Bipolar"),
        (other_smi_list, "Other SMI"),
        (unknown_smi_list, "Unknown SMI"),
    )
    for members, label in category_lookup:
        if SMI in members:
            return label
    # Descriptions that match no list are left unclassified.
    return None
    


In [None]:
#Adding the SMI category to the dataframe as a new 'SMI_type' column.
#Descriptions that smi_category_sorting does not recognise come back as None.
#Note that this adds the column to df1 in place.

df1['SMI_type'] = df1['SMI_Description'].apply(smi_category_sorting)
df1.head(101)

In [None]:
#Keep only the columns needed for the grouped view, as a frame independent of df1

type_columns = ['PatientId','SMI_type', 'Date_Of_Diagnosis', 'Age_SMI_Diagnosed']
df_type = df1[type_columns].copy()
df_type

In [None]:
#Collapse repeated categories: when a patient has several entries of the same SMI_type,
#keep only the one that comes first in row order
#(assumes rows are already in the desired order, e.g. by diagnosis date - TODO confirm)

dedupe_keys = ['PatientId', 'SMI_type']
dfx = df_type.copy().drop_duplicates(subset=dedupe_keys, keep='first')
dfx

In [None]:
#Reset index labels so that row labels and row positions coincide
dfx = dfx.reset_index(drop=True)

#Flag every row whose PatientId already appeared higher up (True = repeat entry)
#and print how many such rows there are
dfx_p_dup = dfx["PatientId"].duplicated()
print(dfx_p_dup.sum())

#Row labels of the first entry encountered for each patient
false_index_list_x = dfx_p_dup[~dfx_p_dup].index.tolist()

#First SMI category per patient
dfxa = dfx.loc[false_index_list_x]

#Every other row is carried into the next round. Rows are dropped by label;
#the previous dfx.index[[false_index_list_x]] nested the list inside another list,
#which builds a 2-D indexer that current numpy/pandas reject.
dfx2 = dfx.drop(false_index_list_x)
dfxa

In [None]:
#Repeat 1: peel off each patient's second SMI category

dfx2 = dfx2.reset_index(drop=True)

#True for every row whose PatientId already appeared higher up in dfx2
dfx2_p_dup = dfx2["PatientId"].duplicated()
print(dfx2_p_dup.sum())

#Row labels of the first remaining entry per patient
false_index_list_x2 = dfx2[~dfx2_p_dup].index.tolist()

#Rows not selected this round go on to the next one. Dropped by label; the previous
#dfx2.index[[false_index_list_x2]] built a 2-D indexer that current numpy/pandas reject.
dfx3 = dfx2.drop(false_index_list_x2)

#Select and rename in one expression so a new frame is produced, instead of
#renaming a slice of dfx2 in place (which triggers SettingWithCopyWarning)
dfxb = dfx2.loc[false_index_list_x2].rename(columns={'SMI_type': 'SMI_type_2','Date_Of_Diagnosis': 'Date_Of_Diagnosis_2', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_2'})
dfxb

In [None]:
#Left-join each patient's second SMI category (dfxb) onto their first (dfxa);
#patients with only one category get NaN in the *_2 columns

df_merge_x = dfxa.merge(dfxb, how='left', on='PatientId')
df_merge_x

In [None]:
#Repeat 2 (Keep repeating until N =0): peel off each patient's third SMI category

dfx3 = dfx3.reset_index(drop=True)

#True for every row whose PatientId already appeared higher up in dfx3;
#N is the number of rows that will be left over in dfx4
dfx3_p_dup = dfx3["PatientId"].duplicated()
print('N:', dfx3_p_dup.sum())

#Row labels of the first remaining entry per patient
false_index_list_x3 = dfx3[~dfx3_p_dup].index.tolist()

#Rows not selected this round. Dropped by label; the previous
#dfx3.index[[false_index_list_x3]] built a 2-D indexer that current numpy/pandas reject.
#NOTE(review): no cell visible here merges dfx4, so any rows left in it are silently
#excluded from the grouped table - confirm that N printed above is 0.
dfx4 = dfx3.drop(false_index_list_x3)

#Select and rename in one expression so a new frame is produced, instead of
#renaming a slice of dfx3 in place (which triggers SettingWithCopyWarning)
dfxc = dfx3.loc[false_index_list_x3].rename(columns={'SMI_type': 'SMI_type_3','Date_Of_Diagnosis': 'Date_Of_Diagnosis_3', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_3'})
dfxc

In [None]:
#Left-join each patient's third SMI category (dfxc) onto the running one-row-per-patient table

df_merge_x2 = df_merge_x.merge(dfxc, how='left', on='PatientId')

df_merge_x2

In [None]:
#Export 'df_merge_x2' (one row per patient, SMIs grouped into categories) to a CSV file ('Part2_SMI_data_SMIs_Grouped.csv')

df_merge_x2.to_csv('Part2_SMI_data_SMIs_Grouped.csv', sep=',')