1) Combines data from 'Patient_Total.txt', 'AHSN_Final_Taylor_Registration_History_Total.txt', 'AddressHistory_Total.txt' and 'Ethnicity_Total.txt' into a merged dataframe for export to CSV ('Part1_basic_patient_info.csv').

2) Creates a new CSV file ('uncombined_SMI.csv') to be used by 'Part2_SMIs.ipynb' - importantly, this CSV contains a new column with the 'age of diagnosis' for each SMI.

In [None]:
import os
os.listdir()

In [None]:
import pandas as pd
import numpy as np
import re

# DF: Basic info

In [None]:
#Read-in file

df = pd.read_csv("AHSN_Final_Taylor_Patient_Total.txt", sep ='|') #RENAME FILE
print(df.count())
df.head(10)


Number of unique patients in "AHSN_Final_Taylor_Patient_Total.txt"

In [None]:
len(set(df["PatientId"]))

In [None]:
# Pull the two date columns out as plain Python lists for the age-at-death loop.
DOD = df["DateOfDeath"].tolist()
DOB = df["DateOfBirth"].tolist()

Adding age of death

In [None]:
#Create list with age at which patients have died

import datetime as dt
from dateutil import parser

# One entry per row of DOD/DOB: completed years of age at death, or "-" when
# either date is missing or cannot be parsed (e.g. no DateOfDeath recorded).
list_age_death = []

for death_raw, birth_raw in zip(DOD, DOB):
    # Missing values arrive from pandas as NaN; there is no age to compute.
    if pd.isnull(death_raw) or pd.isnull(birth_raw):
        list_age_death.append("-")
        continue
    try:
        died = parser.parse(death_raw)
        born = parser.parse(birth_raw)
    except (TypeError, ValueError, OverflowError):
        # Only date-parsing failures are treated as "unknown"; anything else
        # (KeyboardInterrupt, genuine bugs) is allowed to surface.
        list_age_death.append("-")
        continue
    # Completed calendar years. Dividing the day count by 365 ignores leap
    # days and over-states the age for deaths shortly before a birthday.
    birthday_pending = (died.month, died.day) < (born.month, born.day)
    list_age_death.append(died.year - born.year - birthday_pending)

print(len(list_age_death))



In [None]:
#Add 'Age_at_death' column

df["Age_at_Death"]=list_age_death

df.head(11)

In [None]:
#Read-in 'practice ID' and 'registration date'

#Read-in file
reg_info = pd.read_csv('AHSN_Final_Taylor_Registration_History_Total.txt', sep ='|')

#Select columns
reg_info = reg_info[['PatientId', 'PracticeId', 'RegistrationDate']]

#Sort by 'PatientId', then 'RegistrationDate'.
#Rows with a missing RegistrationDate are placed first so that keep='last'
#below cannot select an undated row as the "most recent" registration.
#NOTE(review): assumes RegistrationDate strings sort chronologically (e.g. ISO format) - confirm
reg_info = reg_info.sort_values(by=["PatientId", "RegistrationDate"], na_position='first')

#Keep only the most recent record for each patient
reg_info = reg_info.drop_duplicates(subset=['PatientId'], keep='last')
reg_info

# DF2: SMI info

In [None]:
#Read in "AHSN_Final_Taylor_Diagnostic_Codes_Total.txt",

df2 = pd.read_csv("AHSN_Final_Taylor_Diagnostic_Codes_Total.txt", sep ='|') #RENAME FILE

#Drop 'EventId' (axis given by keyword: the positional form drop(label, 1) was removed in pandas 2.0)
df2 = df2.drop(columns="EventId")

#Rename columns (the raw header for the term text carries two trailing tab characters)
df2 = df2.rename(columns={'EventDate': 'Date_Of_Diagnosis', 'CTV3TermText\t\t': 'CT3TermText'})

#Print the total number of SMI records
print(df2.count())

#Print dataframe(first 11 rows)
df2.head(11)

Unique patients

In [None]:
#Print number of unique patients

len(set(df2["PatientId"]))

Removing SMI duplicates (use only first date of diagnosis) - except where 1st date is 1900, where use 2nd.

In [None]:
#Converting 1900s to NaNs in Data Frame

def nines_to_nans(date):
    """Swap the placeholder date "1900-01-01" for the string "NaN".

    Any other value is handed back untouched. Note that the replacement is
    the literal text "NaN", not a floating-point NaN.
    """
    return "NaN" if date == "1900-01-01" else date

df2["Date_Of_Diagnosis"] = df2["Date_Of_Diagnosis"].apply(nines_to_nans)

df2

In [None]:
#Order rows by PatientId, then Date_Of_Diagnosis, then CTV3Code

#Then drop duplicates on (PatientId, CTV3Code), keeping the FIRST row of each
#group, i.e. the earliest Date_Of_Diagnosis for that patient and code.
#NOTE(review): the "NaN" placeholder strings sort after date strings that start with a digit,
#so a placeholder row is kept only when no dated row exists - confirm the date format.

df2 = (
    df2
    .sort_values(by=['PatientId', 'Date_Of_Diagnosis', 'CTV3Code'], na_position='last')
    .drop_duplicates(subset=['PatientId', 'CTV3Code'], keep='first')
)

df2

In [None]:
#Check there are still the same number of unique patients

print("Unique patients: ", len(set(df2["PatientId"])))

In [None]:
#Now data frame reformatted, convert NaNs back to 1900

def nans_to_nines(date):
    """Swap the string "NaN" back to the placeholder date "1900-01-01".

    Any other value (including a real floating-point NaN) is returned as-is.
    """
    return "1900-01-01" if date == "NaN" else date

df2["Date_Of_Diagnosis"] = df2["Date_Of_Diagnosis"].apply(nans_to_nines)

df2

List and count of all "SMIs" and export to CSV file

In [None]:
#Distinct SMI term texts, sorted so that the exported file is identical from
#run to run (plain set iteration order for strings changes between kernel sessions).
#key=str keeps the sort from failing if a missing (NaN) term text is present.
unique_smis = sorted(set(df2["CT3TermText"]), key=str)

unique_smis_df = pd.DataFrame(unique_smis)

from pandas import ExcelWriter

unique_smis_df.to_csv('unique_smis.csv', sep=',')

Merging dataframes to add 'age of diagnosis'

In [None]:
#Join 'basic info' (df) to 'SMI info' (df2) on PatientId.
#The default join is 'inner', so only patients present in both tables remain.
df_merge_1 = pd.merge(df, df2, on="PatientId")

#Non-null count per column of the merged records
print(df_merge_1.count())

#Preview the first 20 rows
df_merge_1.head(n=20)

Unique patients

In [None]:
len(set(df_merge_1["PatientId"]))

Adding age at which diagnosed

In [None]:
#Calculating 'age at diagnosis' and constructing a list (one entry per row of df_merge_1)
list_age_diagnosis = []

date_diagnosed = list(df_merge_1['Date_Of_Diagnosis'])
date_born = list(df_merge_1['DateOfBirth'])

for diagnosed_raw, born_raw in zip(date_diagnosed, date_born):
    diagnosed = parser.parse(diagnosed_raw)
    born = parser.parse(born_raw)
    # Completed calendar years. Dividing the day count by 365 ignores leap
    # days and over-states the age for diagnoses shortly before a birthday.
    # Placeholder 1900-01-01 diagnosis dates still give an age below 1.
    birthday_pending = (diagnosed.month, diagnosed.day) < (born.month, born.day)
    list_age_diagnosis.append(diagnosed.year - born.year - birthday_pending)
   


In [None]:
#Adding "Age_SMI_Diagnosed" to Data Frame

df_merge_1["Age_SMI_Diagnosed"]=list_age_diagnosis
df_merge_1.head(25)

In [None]:
#Converting ages below 1 (due to presence of 1900 diagnosis dates) in 'Age_SMI_Diagnosed' to '-9'

def minus_9(age):
    """Map any age below 1 to the sentinel -9; return other ages unchanged."""
    return -9 if age < 1 else age

df_merge_1["Age_SMI_Diagnosed"] = df_merge_1["Age_SMI_Diagnosed"].apply(minus_9)

df_merge_1.head(25)

In [None]:
#Create CSV file from df_merge_1

from pandas import ExcelWriter

#The '.csv' extension is included so the file name matches the 'uncombined_SMI.csv'
#that this notebook's introduction says 'Part2_SMIs.ipynb' reads.
df_merge_1.to_csv('uncombined_SMI.csv', sep=',')

# DF3: IMD Rank

In [None]:
#Load "AHSN_Final_Taylor_AddressHistory_Total.txt"; report column counts and unique patients

df3 = pd.read_csv("AHSN_Final_Taylor_AddressHistory_Total.txt", sep ='|') #RENAME FILE
print(df3.count())
print("Length_1:", len(set(df3["PatientId"])))

#Discard address records that have no IMDRank
df3 = df3.dropna(subset=["IMDRank"])
print("Length_2:", len(set(df3["PatientId"])))

#Order by PatientId then StartDate (missing StartDate first), and keep the
#last row per patient, i.e. the record with the latest StartDate
df3 = (
    df3
    .sort_values(by=["PatientId", "StartDate"], na_position='first')
    .drop_duplicates(subset='PatientId', keep='last')
)
print("Length_3:", len(set(df3["PatientId"])))

df3


Unique patients

In [None]:
len(set(df3["PatientId"]))

# DF4

In [None]:
df4 = pd.read_csv("AHSN_Final_Taylor_Ethnicity_Total.txt", sep ='|') 
print(df4.count())
df4.head(51)

In [None]:
#Rename column

df4.rename(columns={'CTV3TermText\t\t': 'Ethnic_origin'}, inplace=True)
df4

In [None]:
#Removing "Ethnic groups (census) NOS\t\t"

df_t = df4[df4.Ethnic_origin != "Ethnic groups (census) NOS\t\t"]

#Removing duplicated entries ('patientId'), keeping only most recent entries.
#The rows are sorted by EventDate first (undated rows first) so that keep='last'
#really selects the latest record instead of whichever row is last in the file.
#NOTE(review): assumes EventDate strings sort chronologically (e.g. ISO format) - confirm

df_t = df_t.sort_values(by=["PatientId", "EventDate"], na_position='first')
df_t = df_t.drop_duplicates(subset='PatientId', keep='last')

#Dropping 'EventDate' column (keyword form: positional axis was removed in pandas 2.0)

df_t = df_t.drop(columns="EventDate")

print(df_t.count())
print("Unique:", len(set(df_t["PatientId"])))






In [None]:
#Remove unwanted characters (tab characters) from the ethnicity labels.
#The vectorised .str.replace leaves missing values as NaN, whereas calling
#re.sub on a NaN raises TypeError.

df_t['Ethnic_origin'] = df_t['Ethnic_origin'].str.replace('\t', '', regex=False)
df_t

Add column with 'categorised' ethnicities

In [None]:
#Read in file where ethnicities have been categorised

ethnicity_cat = pd.read_csv('categorised_ethnicities_1.csv')

#Remove unwanted characters (tabs); .str.replace leaves missing values as NaN
ethnicity_cat['ResearchOne categories'] = ethnicity_cat['ResearchOne categories'].str.replace('\t', '', regex=False)

#The category lists below select rows with .loc and a boolean mask
#(.ix was removed in pandas 1.0).

#Create list of ethnicities for Cat1 - White
white = ethnicity_cat.loc[ethnicity_cat['ONS cats'] == 1]
white_list = list(white['ResearchOne categories'])

#Create list of ethnicities for Cat2 - Mixed
mixed = ethnicity_cat.loc[ethnicity_cat['ONS cats'] == 2]
mixed_list = list(mixed['ResearchOne categories'])

#Create list of ethnicities for Cat3 - Asian
asian = ethnicity_cat.loc[ethnicity_cat['ONS cats'] == 3]
asian_list = list(asian['ResearchOne categories'])

#Create list of ethnicities for Cat4 - Black
black = ethnicity_cat.loc[ethnicity_cat['ONS cats'] == 4]
black_list = list(black['ResearchOne categories'])

#Create list of ethnicities for Cat5 - Other
other = ethnicity_cat.loc[ethnicity_cat['ONS cats'] == 5]
other_list = list(other['ResearchOne categories'])

#Create list of ethnicities for Cat6 - Unspecified/unknown
#(the variable names keep their original spelling because the sorting functions refer to them)
unkown = ethnicity_cat.loc[ethnicity_cat['ONS cats'] == 6]
unkown_list = list(unkown['ResearchOne categories'])

#Create function to sort ethnicities (giving ONS cat number)

def ethnicity_category_sorting_num(x):
    """Return the ONS category number for ethnicity label ``x`` as a string.

    The module-level category lists are tried in the order White ("1"),
    Mixed ("2"), Asian ("3"), Black ("4"), Other ("5"), Unspecified ("6");
    the first list containing ``x`` decides the result. Returns None when
    ``x`` appears in none of them.
    """
    ordered_groups = (
        (white_list, "1"),
        (mixed_list, "2"),
        (asian_list, "3"),
        (black_list, "4"),
        (other_list, "5"),
        (unkown_list, "6"),
    )
    for members, code in ordered_groups:
        if x in members:
            return code
    return None
    
#Create function to sort ethnicities (giving description)

def ethnicity_category_sorting_descr(x):
    """Return the ONS category description for ethnicity label ``x``.

    The module-level category lists are tried in the order White, Mixed,
    Asian, Black, Other, Unspecified; the first list containing ``x``
    decides the result. Returns None when ``x`` appears in none of them.
    """
    ordered_groups = (
        (white_list, "White"),
        (mixed_list, "Mixed/Multiple ethnic groups"),
        (asian_list, "Asian/Asian British"),
        (black_list, "Black/ African/Caribbean/Black British"),
        (other_list, "Other ethnic group"),
        (unkown_list, "Not specified / not given"),
    )
    for members, description in ordered_groups:
        if x in members:
            return description
    return None



In [None]:
#Call functions to add 2 new columns

df_t['Ethnicity_Grouped_ONS-Code'] = df_t['Ethnic_origin'].apply(ethnicity_category_sorting_num)
df_t['Ethnicity_Grouped_ONS-Description'] = df_t['Ethnic_origin'].apply(ethnicity_category_sorting_descr)
df_t


In [None]:
#Keep only 'PatientId' and the 'Grouped Ethnicity' description column

df_tt = df_t[['PatientId', 'Ethnicity_Grouped_ONS-Description']]
df_tt

# Merging DFs

In [None]:
#Merge 'Basic Info' (df) with 'Ethnicity Info' (df_tt)

m_df = df.merge(df_tt, on =["PatientId"], how = 'left')
m_df

In [None]:
#Merge 'above data frame' with 'IMD rank Info'

m_df1 = m_df.merge(df3, on =["PatientId"], how = 'left')
m_df1

In [None]:
#Merge 'above data frame' with 'Practice registration Info'

m_df2 = m_df1.merge(reg_info, on =["PatientId"], how = 'left')
m_df2



In [None]:
#Print number of Patients in 'm_df2'

len(set(m_df2['PatientId']))

In [None]:
#Modifying so that only patients with SMI included

#NOTE(review): 'unique_smi_PatientList.csv' is not written anywhere in this notebook;
#presumably it is produced by another notebook and must exist before this cell runs - confirm.
smi_patients = pd.read_csv('unique_smi_PatientList.csv')

#Drop the saved index column and the age column
#(keyword form: the positional axis argument drop(label, 1) was removed in pandas 2.0)
smi_patients = smi_patients.drop(columns=["Unnamed: 0", "Age_SMI_Diagnosed"])
smi_patients

In [None]:
basic_info = smi_patients.merge(m_df2, on =["PatientId"], how = 'left')
basic_info

# Exporting dataframe to CSV

In [None]:
from pandas import ExcelWriter

basic_info.to_csv('Part1_basic_patient_info.csv', sep=',')

Total number of patient records

In [None]:
patient_ids = basic_info["PatientId"]

print(patient_ids.count())

Number of unique patients

In [None]:
set_patient_ids = set(patient_ids)

print(len(set_patient_ids))