1) Loads in 'SMI_uncombined_filtered.csv' and uses it to create a list of when each patient was first diagnosed with an SMI.

2) Loads in 'AHSN_Final_Taylor_Additional_Coded_Events_Total2.txt', containing medical records, and filters for BMI readings. Then filters are applied so that a dataframe is left which contains only BMI readings made after SMI diagnosis (df_working).

3) Creates a table ('Part3_bmi_summary_data.csv') containing BMI summary data, including BMI after diagnosis, latest BMI, BMI change per year and mean BMI (all readings after diagnosis).

4) Creates a CSV file showing how many patients have BMI records in each year ('Part3_BMIRecordsEachYear.csv').
 
5) Works out annual BMIs since diagnosis ('Part3_Annual_BMI_record.csv').

In [None]:
import os

# Show the contents of the current working directory (where the input files are read from)
os.listdir(os.curdir)

In [None]:
import pandas as pd
import numpy as np
import datetime 

# Creating a df of BMI info after SMI diagnosis

In [None]:
# Load the SMI diagnosis data from 'SMI_uncombined_filtered.csv';
# its 'PatientId' and 'Date_Of_Diagnosis' columns are used further down

df = pd.read_csv('SMI_uncombined_filtered.csv', sep=',')
df


In [None]:
# Build a table holding each patient's FIRST SMI diagnosis date and write it to a CSV file ('first_diagnosis.csv')

first_SMI_diagnosed = df[["PatientId", "Date_Of_Diagnosis"]].rename(
    columns={'Date_Of_Diagnosis': '1st_SMI_Diagnosis'}
)

# Sort on a parsed-datetime helper column so the ordering is chronological rather than
# alphabetical on the raw date strings, then keep the earliest row per patient
# (keep='first'; with an ascending sort, keep='last' would select the most recent diagnosis).
# Missing dates sort last, so they are only kept when a patient has no valid date at all.
# The helper column is dropped so the stored date keeps its original text form.
first_SMI_diagnosed = (
    first_SMI_diagnosed
    .assign(_diagnosis_dt=pd.to_datetime(first_SMI_diagnosed['1st_SMI_Diagnosis']))
    .sort_values(by=["PatientId", "_diagnosis_dt"])
    .drop_duplicates(subset=['PatientId'], keep='first')
    .drop(columns='_diagnosis_dt')
)
first_SMI_diagnosed.to_csv('first_diagnosis.csv', sep=',')

In [None]:
# Read the pipe-delimited coded-events file that holds the BMI records
# (decoded as ISO-8859-1 because the file contains 'special' characters)

df1 = pd.read_csv(
    'AHSN_Final_Taylor_Additional_Coded_Events_Total2.txt',
    delimiter='|',
    encoding='ISO-8859-1',
)
df1.head()

In [None]:
# Restrict the events table to the four columns used by the rest of the notebook

df1 = df1.loc[:, ['PatientId', 'EventDate', 'CTV3TermText', 'NumberValue']]

df1.head()

In [None]:
# Collect the index labels of rows that are BMI observations whose value is not 0
# NOTE(review): missing (NaN) values also satisfy "!= 0" and are therefore kept - confirm this is intended

BMI_index_list = df1.loc[
    (df1['CTV3TermText'] == 'Body mass index - observation')
    & (df1['NumberValue'] != 0)
].index.tolist()

# Number of qualifying BMI rows

len(BMI_index_list)

In [None]:
# Select the BMI rows by index label. BMI_index_list was built from '.index', so it holds
# labels and '.loc' is the matching indexer; '.iloc' would treat the values as positions,
# which only agrees while df1 keeps its default 0..n-1 index.
BMI_filtered = df1.loc[BMI_index_list]
BMI_filtered

In [None]:
# Inner-join the BMI readings onto the per-patient diagnosis table using 'PatientId';
# patients present in only one of the two tables are dropped

merged = pd.merge(first_SMI_diagnosed, BMI_filtered, how='inner', on='PatientId')
merged

In [None]:
# Parse both date columns into pandas datetimes so they can be compared directly
# NOTE(review): no explicit format/dayfirst is passed, so ambiguous day/month strings rely on
# pandas' inference - confirm against the source date format

merged = merged.assign(**{
    'EventDate': pd.to_datetime(merged['EventDate']),
    '1st_SMI_Diagnosis': pd.to_datetime(merged['1st_SMI_Diagnosis']),
})

merged

In [None]:
# Boolean mask: True where the appointment (EventDate) is on or after the SMI diagnosis date

BMI_after_diagnosis = merged['EventDate'].ge(merged['1st_SMI_Diagnosis'])
BMI_after_diagnosis.head()

In [None]:
# Index labels of the rows whose appointment is on or after the SMI diagnosis

BMI_after_diagnosis_list = merged.loc[BMI_after_diagnosis].index.tolist()
len(BMI_after_diagnosis_list)

In [None]:
# Create the 'working df' containing only the desired appointments (i.e. on/after SMI diagnosis).
# BMI_after_diagnosis_list holds index labels (it was built from '.index'), so select with
# '.loc'; '.iloc' interprets the values as positions and is only equivalent while 'merged'
# has its default 0..n-1 index.

df_working = merged.loc[BMI_after_diagnosis_list]

df_working

In [None]:
# Order the readings by patient and then by appointment date, and renumber the rows from 0

df_working = (
    df_working
    .sort_values(by=["PatientId", "EventDate"])
    .reset_index(drop=True)
)
df_working

# Creating a BMI summary table

In [None]:
# Mean of all post-diagnosis BMI readings per patient.
# Only 'NumberValue' is averaged: calling .mean() on the whole frame also tries to average
# the text/date columns, which raises on recent pandas versions.
# as_index=False keeps 'PatientId' as an ordinary column (not also as the index), so the
# later merge on 'PatientId' has a single, unambiguous key.

mean_bmi = (
    df_working
    .groupby('PatientId', as_index=False)['NumberValue']
    .mean()
    .rename(columns={'NumberValue': 'Mean_BMI'})
)

mean_bmi


In [None]:
# Baseline BMI: the first reading per patient in df_working
# (df_working is sorted by PatientId then EventDate, so keep='first' is the earliest reading).
# The explicit column selection already leaves out 'CTV3TermText', so no separate drop is
# needed (the old positional form drop(label, 1) is rejected by pandas 2.x).

df_first = (
    df_working
    .drop_duplicates(subset=['PatientId'], keep='first')
    .rename(columns={'NumberValue': 'Baseline_BMI', 'EventDate': 'BMI_Date'})
    .loc[:, ['PatientId', '1st_SMI_Diagnosis', 'Baseline_BMI', 'BMI_Date']]
)
df_first

In [None]:
# Most recent BMI: the last reading per patient in df_working
# (df_working is sorted by PatientId then EventDate, so keep='last' is the latest reading).
# The explicit column selection already leaves out 'CTV3TermText', so no separate drop is
# needed (the old positional form drop(label, 1) is rejected by pandas 2.x).

df_latest = (
    df_working
    .drop_duplicates(subset=['PatientId'], keep='last')
    .rename(columns={'NumberValue': 'Most_Recent_BMI', 'EventDate': 'BMI_Date'})
    .loc[:, ['PatientId', '1st_SMI_Diagnosis', 'Most_Recent_BMI', 'BMI_Date']]
)
df_latest

In [None]:
# Put the baseline and most-recent readings side by side, one row per patient.
# Both inputs carry a 'BMI_Date' column, which the merge suffixes with _x (baseline)
# and _y (most recent); these are then given descriptive names.

df_bmi_merge = pd.merge(
    df_first,
    df_latest,
    how='left',
    on=["PatientId", "1st_SMI_Diagnosis"],
).rename(columns={
    'BMI_Date_x': 'Baseline_BMI_Date',
    'BMI_Date_y': 'MostRecent_BMI_Date',
})
df_bmi_merge

Adding columns for the number of years between the first and latest BMI readings, the change in BMI, and the BMI change per year

In [None]:

# Work on a copy so that df_bmi_merge keeps its full dates
bmi_years_copy = df_bmi_merge.copy()

# Reduce both reading dates to their calendar year
bmi_years_copy['Baseline_BMI_Date'] = bmi_years_copy['Baseline_BMI_Date'].dt.year
bmi_years_copy['MostRecent_BMI_Date'] = bmi_years_copy['MostRecent_BMI_Date'].dt.year

# Calendar-year gap between baseline and most recent reading; a gap of 0 becomes missing
# so that it is never used as a divisor below
# NOTE(review): this is a difference of calendar years, not elapsed time (Dec -> Jan counts
# as 1 year, two readings within one year are excluded) - confirm this is the intended measure
bmi_years_copy['Years_Difference'] = bmi_years_copy['MostRecent_BMI_Date'].sub(
    bmi_years_copy['Baseline_BMI_Date']
)
bmi_years_copy['Years_Difference'] = bmi_years_copy['Years_Difference'].where(
    bmi_years_copy['Years_Difference'].ne(0)
)

# Change in BMI between the baseline and most recent readings
bmi_years_copy['BMI_difference'] = bmi_years_copy['Most_Recent_BMI'].sub(
    bmi_years_copy['Baseline_BMI']
)

# Change in BMI per (calendar) year
bmi_years_copy['BMI_change_PerYear'] = bmi_years_copy['BMI_difference'].div(
    bmi_years_copy['Years_Difference']
)

bmi_change = bmi_years_copy.copy()
bmi_change

In [None]:
# Keep only the per-year rate, then attach it to the baseline/most-recent table by patient
bmi_change = bmi_change.loc[:, ['PatientId', 'BMI_change_PerYear']]

df_bmi = pd.merge(df_bmi_merge, bmi_change, how='left', on='PatientId')

df_bmi

In [None]:
# Sanity check: count baseline BMI values equal to 0 (expected to be none)

df_bmi['Baseline_BMI'].eq(0).sum()

In [None]:
# Sanity check: count most-recent BMI values equal to 0 (expected to be none)

df_bmi['Most_Recent_BMI'].eq(0).sum()

In [None]:
# Attach each patient's mean BMI ('mean_bmi') to the summary table ('df_bmi')

bmi_merge = pd.merge(df_bmi, mean_bmi, how='left', on='PatientId')
bmi_merge

In [None]:
# Write the BMI summary table ('bmi_merge') to a CSV file ('Part3_bmi_summary_data.csv')

from pandas import ExcelWriter

# Comma is to_csv's default separator
bmi_merge.to_csv('Part3_bmi_summary_data.csv')

# Working out annual BMIs since diagnosis

In [None]:
# Build a separate frame from df_working in which both date columns are reduced to
# their calendar year (df_working itself is left untouched)
df_every_year = df_working.assign(**{
    'EventDate': df_working['EventDate'].dt.year,
    '1st_SMI_Diagnosis': df_working['1st_SMI_Diagnosis'].dt.year,
})

In [None]:
# Keep one reading per patient per year: the first row for each (PatientId, year) pair
df_every_year = df_every_year.drop_duplicates(subset=['PatientId', 'EventDate'], keep='first')

# Remove the term-text column. 'columns=' is used because the positional axis argument
# (drop(label, 1)) is no longer accepted by pandas 2.x.
df_every_year = df_every_year.drop(columns="CTV3TermText")
df_every_year

In [None]:
# Number of calendar years between the diagnosis year and the reading year

df_every_year['Yrs_after_diagnosis'] = df_every_year['EventDate'].sub(
    df_every_year['1st_SMI_Diagnosis']
)

# Discard readings dated in the year 2087
# NOTE(review): presumably a data-entry error in the source records - confirm
df_every_year = df_every_year.loc[df_every_year['EventDate'].ne(2087)]

# Ensure at most one reading per patient per year
df_every_year = df_every_year.drop_duplicates(
    subset=['PatientId', 'EventDate'],
    keep='first',
)
df_every_year

In [None]:
# Count, for each year, the rows with a non-missing PatientId
# (df_every_year holds one row per patient per year, so this is patients per year)

summary_table = df_every_year.groupby('EventDate')['PatientId'].count()
summary_table

In [None]:
# Write the per-year record counts to a CSV file ('Part3_BMIRecordsEachYear.csv')

# Comma is to_csv's default separator
summary_table.to_csv('Part3_BMIRecordsEachYear.csv')

Pivoting Table to show BMI history for each patient

In [None]:
# Prepare data frame for pivoting: keep the patient, the years-since-diagnosis and the BMI
# value, under the labels wanted in the output.
# The rename returns a new frame instead of modifying a slice of df_every_year in place,
# which avoids pandas' SettingWithCopyWarning.

df_bmi_pivprep = df_every_year[['PatientId', 'Yrs_after_diagnosis', 'NumberValue']].rename(
    columns={'NumberValue': 'BMI', 'Yrs_after_diagnosis': 'Number of years after diagnosis'}
)
df_bmi_pivprep

In [None]:
# One row per patient, one column per number of years after diagnosis
BMI_pivoted = df_bmi_pivprep.pivot(
    index='PatientId',
    columns='Number of years after diagnosis',
)
# Show years without a reading as '-'
BMI_pivoted = BMI_pivoted.fillna('-')
BMI_pivoted

In [None]:
# Write the per-patient annual BMI table to a CSV file ('Part3_Annual_BMI_record.csv')

# Comma is to_csv's default separator
BMI_pivoted.to_csv('Part3_Annual_BMI_record.csv')