1) Filters out non-SMI diagnoses from 'uncombined_SMI.csv' (14 found), producing new 'filtered file' called 'SMI_uncombined_filtered.csv'.

2) Imports SMI data ('uncombined_SMI.csv'), removes 'non SMIs' and then structures patient data so that only 1 row per patient, rather than a seperate row for each different SMI. This new file is called 'Part2_SMI_data.csv'. Further, an alternative version of 'Part2_SMI_data.csv' is created where SMIs simplified into 4 categories. This file is called 'Part2_SMI_data_SMIs_Grouped.csv'

In [4]:
import os
os.listdir()

['.ipynb_checkpoints',
 'AHSN_Final_Taylor_Additional_Coded_Events_Total.csv',
 'AHSN_Final_Taylor_Additional_Coded_Events_Total2.txt',
 'AHSN_Final_Taylor_AddressHistory_Total.csv',
 'AHSN_Final_Taylor_AddressHistory_Total.txt',
 'AHSN_Final_Taylor_Appointments_Total.csv',
 'AHSN_Final_Taylor_Diagnostic_Codes_Total.csv',
 'AHSN_Final_Taylor_Diagnostic_Codes_Total.txt',
 'AHSN_Final_Taylor_Ethnicity_Total.csv',
 'AHSN_Final_Taylor_Ethnicity_Total.txt',
 'AHSN_Final_Taylor_MedicationIssues_Total.csv',
 'AHSN_Final_Taylor_MedicationRepeatTemplates_Total.csv',
 'AHSN_Final_Taylor_Patient_Total.csv',
 'AHSN_Final_Taylor_Patient_Total.txt',
 'AHSN_Final_Taylor_Registration_History_Total.csv',
 'AHSN_Final_Taylor_Registration_History_Total.txt',
 'AHSN_Taylor_Code_Analysis.xlsx',
 'AHSN_Taylor_Code_Analysis_Extra_CTV3C_Codes (1).xlsx',
 'all_data.csv',
 'all_data.xlsx',
 'characterising_smis.csv',
 'characterising_smis.xlsx',
 'FinalPart_combining_dataframes.ipynb',
 'first_diagnosis.csv',
 

In [5]:
import pandas as pd
import numpy as np
import re

# Filtering out non-SMI diagnoses from dataset

In [84]:
#Create a list of SMIs to remove from the dataset

SMIs_to_remove = ['Western Germanic language','Germanic language','Asperger syndrome', '(Rel + psych illn) or (alc:[spouse]/[husb) or (schiz child)', '[V]Personal history of affective disorder', 'Unspecified schizoid personality disorder', 'Language disorder associated with thought disorder', 'Schizoid personality disorder NOS', 'FH: Schizophrenia', 'FH: Manic-depressive state', 'Schizoid personality disorder', 'Schizoid character', 'Schizophrenia association member', 'Manic-depression association member']

In [85]:
#Load in 'uncombined_SMI.csv' and remove 'non-SMIs'

df = pd.read_csv('uncombined_SMI.csv')

df.rename(columns={'CTV3Code': 'SMI_Code', 'CT3TermText': 'SMI_Description'}, inplace=True)
df = df.drop("Unnamed: 0",1)
df['SMI_Description'] = df['SMI_Description'].map(lambda x: re.sub(r'\t', '', x))

non_smi_indices = df[df['SMI_Description'].isin(SMIs_to_remove)].index.tolist()

df = df.drop(df.index[[non_smi_indices]])

df = df.reset_index(drop=True)

df1 = df.copy()
df1

Unnamed: 0,PatientId,DateOfBirth,DateOfDeath,Gender,Age_at_Death,Date_Of_Diagnosis,SMI_Code,SMI_Description,Age_SMI_Diagnosed
0,41751,1943-09-01 00:00:00.000,,F,-,2000-10-20,X00SL,Hypomania,57
1,41751,1943-09-01 00:00:00.000,,F,-,2011-01-10,X00SM,Bipolar disorder,67
2,43655,1947-09-01 00:00:00.000,2009-06-01 00:00:00.000,M,61,2004-12-03,X00SL,Hypomania,57
3,44191,1950-01-01 00:00:00.000,2014-01-01 00:00:00.000,M,64,2012-06-07,X00SM,Bipolar disorder,62
4,50463,1961-08-01 00:00:00.000,,M,-,2004-07-29,X00SJ,Mania,43
5,50463,1961-08-01 00:00:00.000,,M,-,2008-02-27,XE1ZW,[X]Manic episode unspecified,46
6,50463,1961-08-01 00:00:00.000,,M,-,2008-06-25,X00SM,Bipolar disorder,46
7,66851,1962-02-01 00:00:00.000,,F,-,2010-09-27,X00SM,Bipolar disorder,48
8,88679,1962-11-01 00:00:00.000,,M,-,1982-01-01,XaIx7,H/O: manic depressive disorder,19
9,88679,1962-11-01 00:00:00.000,,M,-,2010-01-01,X00SL,Hypomania,47


In [86]:
#Confirm all unwanted SMIs removed (should only be 46 unique SMIs)

unique_SMIs_F = set(df1.SMI_Description)
len(unique_SMIs_F)

46

In [87]:
#Print list of unique SMIs

unique_SMIs_F

{'(Mania) or (hypomania) or (agitated depression)',
 'Acute exacerbation of chronic latent schizophrenia',
 'Acute schizophrenia-like psychotic disorder',
 'Acute schizophrenic episode',
 'Affective psychoses:[manic depress] or [involut melancholia]',
 'Bipolar',
 'Bipolar I disorder',
 'Bipolar II disorder',
 'Bipolar affective disorder resolved',
 'Bipolar diathermy',
 'Bipolar disorder',
 'Borderline schizophrenia',
 'Childhood schizophrenia NOS',
 'H/O: manic depressive disorder',
 'H/O: schizophrenia',
 'Hypomania',
 'Hypomanic behaviour',
 'Hypomanic mood',
 'Hypomanic personality disorder',
 'Mania',
 'Manic behaviour',
 'Manic disorder single episode',
 'Manic mood',
 'Organic bipolar disorder',
 'Organic manic disorder',
 'Other manic-depressive psychos',
 'Other schizophrenia',
 'Post-schizophrenic depression',
 'Schizoaffective disorder depressive type',
 'Schizoaffective disorder manic type',
 'Schizoaffective disorder mixed type',
 'Schizophrenia resolved',
 'Schizophrenic

In [88]:
#Print 'Total Entries'

print("Total patient entries: ", len(df1['PatientId']))

#Print 'Unique patients'

print("Unique patients: ", len(set(df1['PatientId'])))


Total patient entries:  11187
Unique patients:  9078


In [89]:
#Save df1 to and Excle File (CSV)

from pandas import ExcelWriter

df1.to_csv('SMI_uncombined_filtered.csv', sep=',')

# Combining all patient entries into a single row

In [90]:
#Reset index labels

df1 = df1.reset_index(drop=True)

#Locating duplicated PatientId (T/F) and printing how many duplicates remain
df1_p_dup = df1["PatientId"].duplicated()
print(df1_p_dup.sum())
    
#Creating a list of indices for non-duplicated PatientIds
false_index_list = df1_p_dup[df1_p_dup == False].index.tolist()
false_index_list
    
#Creating a new data frame of only non-duplicated values
dfa = df1.iloc[false_index_list]

#Remove non-repeated values from original df 
df2 = df1.drop(df1.index[[false_index_list]])
dfa = dfa[['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed']]
dfa

2109


Unnamed: 0,PatientId,SMI_Code,SMI_Description,Date_Of_Diagnosis,Age_SMI_Diagnosed
0,41751,X00SL,Hypomania,2000-10-20,57
2,43655,X00SL,Hypomania,2004-12-03,57
3,44191,X00SM,Bipolar disorder,2012-06-07,62
4,50463,X00SJ,Mania,2004-07-29,43
7,66851,X00SM,Bipolar disorder,2010-09-27,48
8,88679,XaIx7,H/O: manic depressive disorder,1982-01-01,19
10,98715,XaY1Y,Bipolar I disorder,2012-01-06,41
13,124863,X00SM,Bipolar disorder,2009-07-06,73
14,125027,Xa0s9,Acute schizophrenia-like psychotic disorder,2009-10-19,31
15,128975,XE2uT,Schizoaffective disorder manic type,1991-11-22,53


In [91]:
#Repeat 1 

df2 = df2.reset_index(drop=True)
df2_p_dup = df2["PatientId"].duplicated()
print(df2_p_dup.sum())
false_index_list_2 = df2[df2_p_dup == False].index.tolist()
dfb = df2.iloc[false_index_list_2] 
df3 = df2.drop(df2.index[[false_index_list_2]])
dfb = dfb[['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed']]
dfb.rename(columns={'SMI_Code': 'SMI_Code_2', 'SMI_Description': 'SMI_Description_2','Date_Of_Diagnosis': 'Date_Of_Diagnosis_2', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_2'}, inplace=True)
dfb

404


Unnamed: 0,PatientId,SMI_Code_2,SMI_Description_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2
0,41751,X00SM,Bipolar disorder,2011-01-10,67
1,50463,XE1ZW,[X]Manic episode unspecified,2008-02-27,46
3,88679,X00SL,Hypomania,2010-01-01,47
4,98715,X00SM,Bipolar disorder,2012-02-16,41
6,128975,XaIx7,H/O: manic depressive disorder,1994-04-16,55
8,131787,X00SM,Bipolar disorder,2003-06-12,37
9,137107,X00SM,Bipolar disorder,2004-12-29,55
10,181911,XE1ZW,[X]Manic episode unspecified,2001-10-19,32
11,182339,XaCHo,[X]Manic-depress psychosisdepressdno psychotic...,1986-03-04,53
12,187511,X00SM,Bipolar disorder,1989-06-21,48


In [92]:
#Merge dfa and dfb

df_merge = dfa.merge(dfb, on =["PatientId"],how='left')
df_merge

Unnamed: 0,PatientId,SMI_Code,SMI_Description,Date_Of_Diagnosis,Age_SMI_Diagnosed,SMI_Code_2,SMI_Description_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2
0,41751,X00SL,Hypomania,2000-10-20,57,X00SM,Bipolar disorder,2011-01-10,67.0
1,43655,X00SL,Hypomania,2004-12-03,57,,,,
2,44191,X00SM,Bipolar disorder,2012-06-07,62,,,,
3,50463,X00SJ,Mania,2004-07-29,43,XE1ZW,[X]Manic episode unspecified,2008-02-27,46.0
4,66851,X00SM,Bipolar disorder,2010-09-27,48,,,,
5,88679,XaIx7,H/O: manic depressive disorder,1982-01-01,19,X00SL,Hypomania,2010-01-01,47.0
6,98715,XaY1Y,Bipolar I disorder,2012-01-06,41,X00SM,Bipolar disorder,2012-02-16,41.0
7,124863,X00SM,Bipolar disorder,2009-07-06,73,,,,
8,125027,Xa0s9,Acute schizophrenia-like psychotic disorder,2009-10-19,31,,,,
9,128975,XE2uT,Schizoaffective disorder manic type,1991-11-22,53,XaIx7,H/O: manic depressive disorder,1994-04-16,55.0


In [93]:
#Repeat 2 (Keep repeating until N =0)

df3 = df3.reset_index(drop=True)
df3_p_dup = df3["PatientId"].duplicated()
print('N:', df3_p_dup.sum())
false_index_list_3 = df3[df3_p_dup == False].index.tolist()
dfc = df3.iloc[false_index_list_3] 
df4 = df3.drop(df3.index[[false_index_list_3]])
dfc = dfc[['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed']]
dfc.rename(columns={'SMI_Code': 'SMI_Code_3', 'SMI_Description': 'SMI_Description_3','Date_Of_Diagnosis': 'Date_Of_Diagnosis_3', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_3'}, inplace=True)
dfc

N: 74


Unnamed: 0,PatientId,SMI_Code_3,SMI_Description_3,Date_Of_Diagnosis_3,Age_SMI_Diagnosed_3
0,50463,X00SM,Bipolar disorder,2008-06-25,46
1,98715,X00SL,Hypomania,2015-11-27,45
2,128975,X00SM,Bipolar disorder,2004-12-29,66
3,187511,XE1ZV,[X]Mania with psychotic symptoms,2005-05-02,64
4,275187,X00SJ,Mania,2004-06-04,41
5,388919,X00SL,Hypomania,2011-12-05,61
6,833643,X00SM,Bipolar disorder,1999-12-08,67
7,906567,X00SJ,Mania,1988-05-17,57
10,907251,XaMwc,Bipolar affective disorder resolved,2007-11-13,49
12,985983,X00SM,Bipolar disorder,2007-02-21,57


In [94]:
#Merge 

df_merge_2 = df_merge.merge(dfc, on =["PatientId"], how ='left')

df_merge_2

Unnamed: 0,PatientId,SMI_Code,SMI_Description,Date_Of_Diagnosis,Age_SMI_Diagnosed,SMI_Code_2,SMI_Description_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2,SMI_Code_3,SMI_Description_3,Date_Of_Diagnosis_3,Age_SMI_Diagnosed_3
0,41751,X00SL,Hypomania,2000-10-20,57,X00SM,Bipolar disorder,2011-01-10,67.0,,,,
1,43655,X00SL,Hypomania,2004-12-03,57,,,,,,,,
2,44191,X00SM,Bipolar disorder,2012-06-07,62,,,,,,,,
3,50463,X00SJ,Mania,2004-07-29,43,XE1ZW,[X]Manic episode unspecified,2008-02-27,46.0,X00SM,Bipolar disorder,2008-06-25,46.0
4,66851,X00SM,Bipolar disorder,2010-09-27,48,,,,,,,,
5,88679,XaIx7,H/O: manic depressive disorder,1982-01-01,19,X00SL,Hypomania,2010-01-01,47.0,,,,
6,98715,XaY1Y,Bipolar I disorder,2012-01-06,41,X00SM,Bipolar disorder,2012-02-16,41.0,X00SL,Hypomania,2015-11-27,45.0
7,124863,X00SM,Bipolar disorder,2009-07-06,73,,,,,,,,
8,125027,Xa0s9,Acute schizophrenia-like psychotic disorder,2009-10-19,31,,,,,,,,
9,128975,XE2uT,Schizoaffective disorder manic type,1991-11-22,53,XaIx7,H/O: manic depressive disorder,1994-04-16,55.0,X00SM,Bipolar disorder,2004-12-29,66.0


In [95]:
#Repeat 3 (Keep repeating until N =0)

df4 = df4.reset_index(drop=True)
df4_p_dup = df4["PatientId"].duplicated()
print('N:', df4_p_dup.sum())
false_index_list_4 = df4[df4_p_dup == False].index.tolist()
dfd = df4.iloc[false_index_list_4] 
df5 = df4.drop(df4.index[[false_index_list_4]])
dfd = dfd[['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed']]
dfd.rename(columns={'SMI_Code': 'SMI_Code_4', 'SMI_Description': 'SMI_Description_4','Date_Of_Diagnosis': 'Date_Of_Diagnosis_4', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_4'}, inplace=True)
dfd

N: 14


Unnamed: 0,PatientId,SMI_Code_4,SMI_Description_4,Date_Of_Diagnosis_4,Age_SMI_Diagnosed_4
0,906567,X00SM,Bipolar disorder,2005-02-16,74
2,907251,XE1ZW,[X]Manic episode unspecified,2008-11-03,50
3,1139947,X9078,Bipolar,2006-01-26,31
4,2404219,Xa1hV,Hypomanic mood,2011-07-22,58
5,2643027,X00SM,Bipolar disorder,2007-09-10,46
6,3205683,XE1aM,Schizophrenic psychoses (& [paranoid schizophr...,2000-05-22,31
7,3217291,X00SL,Hypomania,2005-08-04,39
8,3234487,XaIx7,H/O: manic depressive disorder,2005-09-29,82
9,4218819,XE1aQ,Affective psychoses:[manic depress] or [involu...,2000-02-22,47
10,6034475,X9078,Bipolar,2013-01-23,62


In [96]:
print(len(dfd))

60


In [97]:
#Merge 

df_merge_3 = df_merge_2.merge(dfd, on =["PatientId"], how ='left')

df_merge_3

Unnamed: 0,PatientId,SMI_Code,SMI_Description,Date_Of_Diagnosis,Age_SMI_Diagnosed,SMI_Code_2,SMI_Description_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2,SMI_Code_3,SMI_Description_3,Date_Of_Diagnosis_3,Age_SMI_Diagnosed_3,SMI_Code_4,SMI_Description_4,Date_Of_Diagnosis_4,Age_SMI_Diagnosed_4
0,41751,X00SL,Hypomania,2000-10-20,57,X00SM,Bipolar disorder,2011-01-10,67.0,,,,,,,,
1,43655,X00SL,Hypomania,2004-12-03,57,,,,,,,,,,,,
2,44191,X00SM,Bipolar disorder,2012-06-07,62,,,,,,,,,,,,
3,50463,X00SJ,Mania,2004-07-29,43,XE1ZW,[X]Manic episode unspecified,2008-02-27,46.0,X00SM,Bipolar disorder,2008-06-25,46.0,,,,
4,66851,X00SM,Bipolar disorder,2010-09-27,48,,,,,,,,,,,,
5,88679,XaIx7,H/O: manic depressive disorder,1982-01-01,19,X00SL,Hypomania,2010-01-01,47.0,,,,,,,,
6,98715,XaY1Y,Bipolar I disorder,2012-01-06,41,X00SM,Bipolar disorder,2012-02-16,41.0,X00SL,Hypomania,2015-11-27,45.0,,,,
7,124863,X00SM,Bipolar disorder,2009-07-06,73,,,,,,,,,,,,
8,125027,Xa0s9,Acute schizophrenia-like psychotic disorder,2009-10-19,31,,,,,,,,,,,,
9,128975,XE2uT,Schizoaffective disorder manic type,1991-11-22,53,XaIx7,H/O: manic depressive disorder,1994-04-16,55.0,X00SM,Bipolar disorder,2004-12-29,66.0,,,,


In [98]:
#Repeat 4 (Keep repeating until N =0)

df5 = df5.reset_index(drop=True)
df5_p_dup = df5["PatientId"].duplicated()
print('N:', df5_p_dup.sum())
false_index_list_5 = df5[df5_p_dup == False].index.tolist()
dfe = df5.iloc[false_index_list_5] 
df6 = df5.drop(df5.index[[false_index_list_5]])
dfe = dfe[['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed']]
dfe.rename(columns={'SMI_Code': 'SMI_Code_5', 'SMI_Description': 'SMI_Description_5','Date_Of_Diagnosis': 'Date_Of_Diagnosis_5', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_5'}, inplace=True)
dfe

N: 2


Unnamed: 0,PatientId,SMI_Code_5,SMI_Description_5,Date_Of_Diagnosis_5,Age_SMI_Diagnosed_5
0,906567,XaCHo,[X]Manic-depress psychosisdepressdno psychotic...,2006-09-25,76
1,9057919,X00SM,Bipolar disorder,2006-03-24,61
3,30496291,XaY1Y,Bipolar I disorder,2014-01-10,53
4,39962559,X00SM,Bipolar disorder,2010-08-26,53
5,47131947,X00SM,Bipolar disorder,2010-08-04,74
6,64919155,X00SM,Bipolar disorder,2014-05-29,52
7,66565043,XE1ZX,[X]Other bipolar affective disorders,1900-01-01,-9
8,66969047,XaMwc,Bipolar affective disorder resolved,2013-08-08,23
9,68441751,E2111,Hypomanic personality disorder,2015-01-05,71
10,82604083,XaY1Y,Bipolar I disorder,2014-12-02,47


In [99]:
df_merge_4 = df_merge_3.merge(dfe, on =["PatientId"], how ='left')

df_merge_4

Unnamed: 0,PatientId,SMI_Code,SMI_Description,Date_Of_Diagnosis,Age_SMI_Diagnosed,SMI_Code_2,SMI_Description_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2,SMI_Code_3,...,Date_Of_Diagnosis_3,Age_SMI_Diagnosed_3,SMI_Code_4,SMI_Description_4,Date_Of_Diagnosis_4,Age_SMI_Diagnosed_4,SMI_Code_5,SMI_Description_5,Date_Of_Diagnosis_5,Age_SMI_Diagnosed_5
0,41751,X00SL,Hypomania,2000-10-20,57,X00SM,Bipolar disorder,2011-01-10,67.0,,...,,,,,,,,,,
1,43655,X00SL,Hypomania,2004-12-03,57,,,,,,...,,,,,,,,,,
2,44191,X00SM,Bipolar disorder,2012-06-07,62,,,,,,...,,,,,,,,,,
3,50463,X00SJ,Mania,2004-07-29,43,XE1ZW,[X]Manic episode unspecified,2008-02-27,46.0,X00SM,...,2008-06-25,46.0,,,,,,,,
4,66851,X00SM,Bipolar disorder,2010-09-27,48,,,,,,...,,,,,,,,,,
5,88679,XaIx7,H/O: manic depressive disorder,1982-01-01,19,X00SL,Hypomania,2010-01-01,47.0,,...,,,,,,,,,,
6,98715,XaY1Y,Bipolar I disorder,2012-01-06,41,X00SM,Bipolar disorder,2012-02-16,41.0,X00SL,...,2015-11-27,45.0,,,,,,,,
7,124863,X00SM,Bipolar disorder,2009-07-06,73,,,,,,...,,,,,,,,,,
8,125027,Xa0s9,Acute schizophrenia-like psychotic disorder,2009-10-19,31,,,,,,...,,,,,,,,,,
9,128975,XE2uT,Schizoaffective disorder manic type,1991-11-22,53,XaIx7,H/O: manic depressive disorder,1994-04-16,55.0,X00SM,...,2004-12-29,66.0,,,,,,,,


In [100]:
#Repeat 5 (Keep repeating until N =0)

df6 = df6.reset_index(drop=True)
df6_p_dup = df6["PatientId"].duplicated()
print('N:', df6_p_dup.sum())
false_index_list_6 = df6[df6_p_dup == False].index.tolist()
dff = df6.iloc[false_index_list_6] 
df7 = df6.drop(df6.index[[false_index_list_6]])
dff = dff[['PatientId','SMI_Code','SMI_Description','Date_Of_Diagnosis','Age_SMI_Diagnosed']]
dff.rename(columns={'SMI_Code': 'SMI_Code_6', 'SMI_Description': 'SMI_Description_6','Date_Of_Diagnosis': 'Date_Of_Diagnosis_6', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_6'}, inplace=True)
dff

N: 0


Unnamed: 0,PatientId,SMI_Code_6,SMI_Description_6,Date_Of_Diagnosis_6,Age_SMI_Diagnosed_6
0,9057919,XE2uT,Schizoaffective disorder manic type,2011-08-05,66
1,151829271,XE1ZW,[X]Manic episode unspecified,2000-11-22,44


In [101]:
SMI_data = df_merge_4.merge(dff, on =["PatientId"], how ='left')
SMI_data

Unnamed: 0,PatientId,SMI_Code,SMI_Description,Date_Of_Diagnosis,Age_SMI_Diagnosed,SMI_Code_2,SMI_Description_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2,SMI_Code_3,...,Date_Of_Diagnosis_4,Age_SMI_Diagnosed_4,SMI_Code_5,SMI_Description_5,Date_Of_Diagnosis_5,Age_SMI_Diagnosed_5,SMI_Code_6,SMI_Description_6,Date_Of_Diagnosis_6,Age_SMI_Diagnosed_6
0,41751,X00SL,Hypomania,2000-10-20,57,X00SM,Bipolar disorder,2011-01-10,67.0,,...,,,,,,,,,,
1,43655,X00SL,Hypomania,2004-12-03,57,,,,,,...,,,,,,,,,,
2,44191,X00SM,Bipolar disorder,2012-06-07,62,,,,,,...,,,,,,,,,,
3,50463,X00SJ,Mania,2004-07-29,43,XE1ZW,[X]Manic episode unspecified,2008-02-27,46.0,X00SM,...,,,,,,,,,,
4,66851,X00SM,Bipolar disorder,2010-09-27,48,,,,,,...,,,,,,,,,,
5,88679,XaIx7,H/O: manic depressive disorder,1982-01-01,19,X00SL,Hypomania,2010-01-01,47.0,,...,,,,,,,,,,
6,98715,XaY1Y,Bipolar I disorder,2012-01-06,41,X00SM,Bipolar disorder,2012-02-16,41.0,X00SL,...,,,,,,,,,,
7,124863,X00SM,Bipolar disorder,2009-07-06,73,,,,,,...,,,,,,,,,,
8,125027,Xa0s9,Acute schizophrenia-like psychotic disorder,2009-10-19,31,,,,,,...,,,,,,,,,,
9,128975,XE2uT,Schizoaffective disorder manic type,1991-11-22,53,XaIx7,H/O: manic depressive disorder,1994-04-16,55.0,X00SM,...,,,,,,,,,,


In [102]:
#Export 'SMI_data' to Excel File (csv)

from pandas import ExcelWriter

SMI_data.to_csv('Part2_SMI_data.csv', sep=',')

In [103]:
#creating a unique patient list

unique_SMI_patients = SMI_data[['PatientId','Age_SMI_Diagnosed']]

print(len(unique_SMI_patients))

9078


In [104]:

unique_SMI_patients

Unnamed: 0,PatientId,Age_SMI_Diagnosed
0,41751,57
1,43655,57
2,44191,62
3,50463,43
4,66851,48
5,88679,19
6,98715,41
7,124863,73
8,125027,31
9,128975,53


In [105]:
#Export 'unique_SMI_patients' to Excel File (csv)

unique_SMI_patients.to_csv('unique_smi_PatientList.csv', sep=',')

# Combining all patients into a single row - SMIs grouped into categories

In [106]:
#Read in 'characterising_smis.csv', which acts as a reference for how to categorise each SMI in the dataset

c_smis = pd.read_csv('characterising_smis.csv')
c_smis

Unnamed: 0,SMI,schizophrenia,bipolar disorder,other SMI,Unknown/unspecified,Unnamed: 5,Unnamed: 6
0,Bipolar affective disorder resolved\t\t,,1.0,,,,
1,Organic manic disorder\t\t,,1.0,,,,
2,Borderline schizophrenia\t\t,,,1.0,,,
3,Schizoaffective disorder\t depressive type\t,,,1.0,,,
4,Bipolar II disorder\t\t,,1.0,,,,
5,Mania\t\t,,1.0,,,,
6,Schizophrenic psychoses (& [paranoid schizophr...,1.0,,,,,
7,(Mania) or (hypomania) or (agitated depression...,,1.0,,,,
8,[V]Personal history of schizophrenia\t\t,1.0,,,,,
9,Bipolar disorder\t\t,,1.0,,,,


In [107]:
#Creating a list of all SMIs that can be categorized as 'schizophrenia'

schizophrenia_list = c_smis.ix[c_smis['schizophrenia'] == True]
schizophrenia_list = list(schizophrenia_list['SMI'].map(lambda x: re.sub(r'\t', '', x)))
schizophrenia_list 

['Schizophrenic psychoses (& [paranoid schizophrenia])',
 '[V]Personal history of schizophrenia',
 'Schizophrenic prodrome',
 'Schizophrenic child',
 'Childhood schizophrenia NOS',
 'Acute schizophrenic episode',
 'Other schizophrenia',
 '[X]Schizophrenia schizotypal and delusional disorders',
 'Post-schizophrenic depression',
 'Subchronic paranoid schizophrenia',
 'H/O: schizophrenia',
 'Schizophrenia resolved',
 'Acute exacerbation of chronic latent schizophrenia',
 '[X]Other schizophrenia']

In [108]:
#Creating a list of all SMIs that can be categorized as 'bipolar'

bipolar_list = c_smis.ix[c_smis['bipolar disorder'] == True]
bipolar_list = list(bipolar_list['SMI'].map(lambda x: re.sub(r'\t', '', x)))
bipolar_list 

['Bipolar affective disorder resolved',
 'Organic manic disorder',
 'Bipolar II disorder',
 'Mania',
 '(Mania) or (hypomania) or (agitated depression)',
 'Bipolar disorder',
 'Hypomanic personality disorder',
 'Other manic-depressive psychos',
 'Single manic episode in partial or unspecified remission',
 'Manic disorder single episode',
 'Manic behaviour',
 'Organic bipolar disorder',
 '[X]Manic-depress psychosisdepressdno psychotic symptoms',
 'Manic mood',
 '[X]Mania with psychotic symptoms',
 '[X]Manic episode unspecified',
 '[X]Other bipolar affective disorders',
 'Hypomanic mood',
 'Bipolar',
 'Bipolar diathermy',
 'Hypomanic behaviour',
 'Hypomania',
 'H/O: manic depressive disorder',
 'Bipolar I disorder']

In [109]:
#Creating a list of all SMIs that can be categorized as 'other smi'

other_smi_list = c_smis.ix[c_smis['other SMI'] == True]
other_smi_list = list(other_smi_list['SMI'].map(lambda x: re.sub(r'\t', '', x)))
other_smi_list 

['Borderline schizophrenia',
 'Schizoaffective disorder depressive type',
 'Schizoaffective disorder manic type',
 '[X]Schizotypal disorder',
 'Schizoaffective disorder mixed type',
 'Acute schizophrenia-like psychotic disorder',
 'Schizophreniform disorder']

In [110]:
#Creating a list of all SMIs that can be categorized as 'unknown/unspecified'

unknown_smi_list = c_smis.ix[c_smis['Unknown/unspecified'] == True]
unknown_smi_list = list(unknown_smi_list['SMI'].map(lambda x: re.sub(r'\t', '', x)))
unknown_smi_list 

['Affective psychoses:[manic depress] or [involut melancholia]']

In [111]:
df1

Unnamed: 0,PatientId,DateOfBirth,DateOfDeath,Gender,Age_at_Death,Date_Of_Diagnosis,SMI_Code,SMI_Description,Age_SMI_Diagnosed
0,41751,1943-09-01 00:00:00.000,,F,-,2000-10-20,X00SL,Hypomania,57
1,41751,1943-09-01 00:00:00.000,,F,-,2011-01-10,X00SM,Bipolar disorder,67
2,43655,1947-09-01 00:00:00.000,2009-06-01 00:00:00.000,M,61,2004-12-03,X00SL,Hypomania,57
3,44191,1950-01-01 00:00:00.000,2014-01-01 00:00:00.000,M,64,2012-06-07,X00SM,Bipolar disorder,62
4,50463,1961-08-01 00:00:00.000,,M,-,2004-07-29,X00SJ,Mania,43
5,50463,1961-08-01 00:00:00.000,,M,-,2008-02-27,XE1ZW,[X]Manic episode unspecified,46
6,50463,1961-08-01 00:00:00.000,,M,-,2008-06-25,X00SM,Bipolar disorder,46
7,66851,1962-02-01 00:00:00.000,,F,-,2010-09-27,X00SM,Bipolar disorder,48
8,88679,1962-11-01 00:00:00.000,,M,-,1982-01-01,XaIx7,H/O: manic depressive disorder,19
9,88679,1962-11-01 00:00:00.000,,M,-,2010-01-01,X00SL,Hypomania,47


In [112]:
#function to classify SMIs

def smi_category_sorting(SMI):
    if SMI in schizophrenia_list:
        return "Schizophrenia"
    elif SMI in bipolar_list:
        return "Bipolar"
    elif SMI in other_smi_list:
        return "Other SMI"
    elif SMI in unknown_smi_list:
        return "Unknown SMI"
    


In [113]:
#adding smi type to dataframe

df1['SMI_type'] = df1['SMI_Description'].apply(smi_category_sorting)
df1.head(101)

Unnamed: 0,PatientId,DateOfBirth,DateOfDeath,Gender,Age_at_Death,Date_Of_Diagnosis,SMI_Code,SMI_Description,Age_SMI_Diagnosed,SMI_type
0,41751,1943-09-01 00:00:00.000,,F,-,2000-10-20,X00SL,Hypomania,57,Bipolar
1,41751,1943-09-01 00:00:00.000,,F,-,2011-01-10,X00SM,Bipolar disorder,67,Bipolar
2,43655,1947-09-01 00:00:00.000,2009-06-01 00:00:00.000,M,61,2004-12-03,X00SL,Hypomania,57,Bipolar
3,44191,1950-01-01 00:00:00.000,2014-01-01 00:00:00.000,M,64,2012-06-07,X00SM,Bipolar disorder,62,Bipolar
4,50463,1961-08-01 00:00:00.000,,M,-,2004-07-29,X00SJ,Mania,43,Bipolar
5,50463,1961-08-01 00:00:00.000,,M,-,2008-02-27,XE1ZW,[X]Manic episode unspecified,46,Bipolar
6,50463,1961-08-01 00:00:00.000,,M,-,2008-06-25,X00SM,Bipolar disorder,46,Bipolar
7,66851,1962-02-01 00:00:00.000,,F,-,2010-09-27,X00SM,Bipolar disorder,48,Bipolar
8,88679,1962-11-01 00:00:00.000,,M,-,1982-01-01,XaIx7,H/O: manic depressive disorder,19,Bipolar
9,88679,1962-11-01 00:00:00.000,,M,-,2010-01-01,X00SL,Hypomania,47,Bipolar


In [114]:
#Keeping only necessay columns

df_type = df1.copy()
df_type = df_type[['PatientId','SMI_type', 'Date_Of_Diagnosis', 'Age_SMI_Diagnosed']]
df_type

Unnamed: 0,PatientId,SMI_type,Date_Of_Diagnosis,Age_SMI_Diagnosed
0,41751,Bipolar,2000-10-20,57
1,41751,Bipolar,2011-01-10,67
2,43655,Bipolar,2004-12-03,57
3,44191,Bipolar,2012-06-07,62
4,50463,Bipolar,2004-07-29,43
5,50463,Bipolar,2008-02-27,46
6,50463,Bipolar,2008-06-25,46
7,66851,Bipolar,2010-09-27,48
8,88679,Bipolar,1982-01-01,19
9,88679,Bipolar,2010-01-01,47


In [115]:
#Remove duplicates so that if 2 SMIs in same catagory diagnosed, only keep first diagnosis

dfx = df_type.copy().drop_duplicates(subset=['PatientId','SMI_type'], keep = 'first')
dfx

Unnamed: 0,PatientId,SMI_type,Date_Of_Diagnosis,Age_SMI_Diagnosed
0,41751,Bipolar,2000-10-20,57
2,43655,Bipolar,2004-12-03,57
3,44191,Bipolar,2012-06-07,62
4,50463,Bipolar,2004-07-29,43
7,66851,Bipolar,2010-09-27,48
8,88679,Bipolar,1982-01-01,19
10,98715,Bipolar,2012-01-06,41
13,124863,Bipolar,2009-07-06,73
14,125027,Other SMI,2009-10-19,31
15,128975,Other SMI,1991-11-22,53


In [116]:
dfx = dfx.reset_index(drop=True)

#Locating duplicated PatientId (T/F) and printing how many duplicates remain
dfx_p_dup = dfx["PatientId"].duplicated()
print(dfx_p_dup.sum())
    
#Creating a list of indices for non-duplicated PatientIds
false_index_list_x = dfx_p_dup[dfx_p_dup == False].index.tolist()
false_index_list_x
    
#Creating a new data frame of only non-duplicated values
dfxa = dfx.iloc[false_index_list_x]

#Remove non-repeated values from original df 
dfx2 = dfx.drop(dfx.index[[false_index_list_x]])
dfxa

308


Unnamed: 0,PatientId,SMI_type,Date_Of_Diagnosis,Age_SMI_Diagnosed
0,41751,Bipolar,2000-10-20,57
1,43655,Bipolar,2004-12-03,57
2,44191,Bipolar,2012-06-07,62
3,50463,Bipolar,2004-07-29,43
4,66851,Bipolar,2010-09-27,48
5,88679,Bipolar,1982-01-01,19
6,98715,Bipolar,2012-01-06,41
7,124863,Bipolar,2009-07-06,73
8,125027,Other SMI,2009-10-19,31
9,128975,Other SMI,1991-11-22,53


In [117]:
#Repeat 1 

dfx2 = dfx2.reset_index(drop=True)
dfx2_p_dup = dfx2["PatientId"].duplicated()
print(dfx2_p_dup.sum())
false_index_list_x2 = dfx2[dfx2_p_dup == False].index.tolist()
dfxb = dfx2.iloc[false_index_list_x2] 
#Rename dfb columns
#dfb = dfb.rename(columns={'CTV3Code_1': 'CTV3Code_2', 'CTV3TermText_1': 'CTV3TermText_2'})
dfx3 = dfx2.drop(dfx2.index[[false_index_list_x2]])
dfxb.rename(columns={'SMI_type': 'SMI_type_2','Date_Of_Diagnosis': 'Date_Of_Diagnosis_2', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_2'}, inplace=True)
dfxb

13


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


Unnamed: 0,PatientId,SMI_type_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2
0,128975,Bipolar,1994-04-16,55
1,181911,Bipolar,2001-10-19,32
2,217463,Other SMI,1900-01-01,-9
3,380723,Other SMI,1962-09-12,29
4,2013135,Other SMI,1985-12-22,36
5,2046627,Schizophrenia,2004-02-27,29
6,2099471,Bipolar,2007-01-26,53
7,2116543,Other SMI,2012-01-10,37
8,2123107,Other SMI,2009-09-18,48
9,2295183,Bipolar,2008-12-18,72


In [118]:
#Merge dfa and dfb

df_merge_x = dfxa.merge(dfxb, on =["PatientId"],how='left')
df_merge_x

Unnamed: 0,PatientId,SMI_type,Date_Of_Diagnosis,Age_SMI_Diagnosed,SMI_type_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2
0,41751,Bipolar,2000-10-20,57,,,
1,43655,Bipolar,2004-12-03,57,,,
2,44191,Bipolar,2012-06-07,62,,,
3,50463,Bipolar,2004-07-29,43,,,
4,66851,Bipolar,2010-09-27,48,,,
5,88679,Bipolar,1982-01-01,19,,,
6,98715,Bipolar,2012-01-06,41,,,
7,124863,Bipolar,2009-07-06,73,,,
8,125027,Other SMI,2009-10-19,31,,,
9,128975,Other SMI,1991-11-22,53,Bipolar,1994-04-16,55.0


In [119]:
#Repeat 2 (Keep repeating until N =0)

dfx3 = dfx3.reset_index(drop=True)
dfx3_p_dup = dfx3["PatientId"].duplicated()
print('N:', dfx3_p_dup.sum())
false_index_list_x3 = dfx3[dfx3_p_dup == False].index.tolist()
dfxc = dfx3.iloc[false_index_list_x3] 
#Rename dfb columns
#dfc = dfc.rename(columns={'CTV3Code_1': 'CTV3Code_3', 'CTV3TermText_1': 'CTV3TermText_3'})
dfx4 = dfx3.drop(dfx3.index[[false_index_list_x3]])
dfxc.rename(columns={'SMI_type': 'SMI_type_3','Date_Of_Diagnosis': 'Date_Of_Diagnosis_3', 'Age_SMI_Diagnosed': 'Age_SMI_Diagnosed_3'}, inplace=True)
dfxc

N: 0


Unnamed: 0,PatientId,SMI_type_3,Date_Of_Diagnosis_3,Age_SMI_Diagnosed_3
0,2629803,Unknown SMI,1991-07-09,48
1,3205683,Schizophrenia,2000-05-22,31
2,3217291,Schizophrenia,1997-07-22,31
3,3234487,Bipolar,1997-09-08,74
4,4218819,Unknown SMI,2000-02-22,47
5,5568051,Other SMI,2009-06-02,39
6,20336779,Bipolar,2004-12-17,56
7,39246735,Unknown SMI,1998-12-29,55
8,41302211,Bipolar,2012-03-12,47
9,54616971,Bipolar,2001-05-16,58


In [120]:
#Merge 

df_merge_x2 = df_merge_x.merge(dfxc, on =["PatientId"], how ='left')

df_merge_x2

Unnamed: 0,PatientId,SMI_type,Date_Of_Diagnosis,Age_SMI_Diagnosed,SMI_type_2,Date_Of_Diagnosis_2,Age_SMI_Diagnosed_2,SMI_type_3,Date_Of_Diagnosis_3,Age_SMI_Diagnosed_3
0,41751,Bipolar,2000-10-20,57,,,,,,
1,43655,Bipolar,2004-12-03,57,,,,,,
2,44191,Bipolar,2012-06-07,62,,,,,,
3,50463,Bipolar,2004-07-29,43,,,,,,
4,66851,Bipolar,2010-09-27,48,,,,,,
5,88679,Bipolar,1982-01-01,19,,,,,,
6,98715,Bipolar,2012-01-06,41,,,,,,
7,124863,Bipolar,2009-07-06,73,,,,,,
8,125027,Other SMI,2009-10-19,31,,,,,,
9,128975,Other SMI,1991-11-22,53,Bipolar,1994-04-16,55.0,,,


In [121]:
#Export 'df_merge_x2' as Excel File (CSV)

df_merge_x2.to_csv('Part2_SMI_data_SMIs_Grouped.csv', sep=',')