In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta
import dask.dataframe as dd

### Load in data

In [None]:
# Load the outcome table, then the raw smoking records, keep only smoking
# records for patients that appear in the outcome table, and save that subset.
#
# NOTE(review): each os.chdir below is relative to whatever the working
# directory is at that moment, so the three calls only succeed if the
# directories are nested (or if DIR is really an absolute path) -- confirm the
# intended layout before relying on Restart & Run All.

# load in outcome data
DIR = "cleaned_files"
os.chdir(DIR)
outcome =  pd.read_csv("outcomes.csv")
compare = outcome.e_patid

# load in smoking data (tab-separated; only the three columns used below)
DIR = "merged_files"
os.chdir(DIR)

fields = ["e_patid", "term", "measuredate"]
smoking = pd.read_csv("smoking.csv", sep = "\t", usecols = fields)

# keep subset of smoking that has patient IDs in outcome
# .copy() makes the subset an independent frame, so the in-place column
# assignments performed on `smoking` later do not raise SettingWithCopyWarning
smoking = smoking[smoking['e_patid'].isin(outcome['e_patid'])].copy()

# write the filtered subset out as smoking.csv in the cleaned-files directory
DIR = "cleaned_files"
os.chdir(DIR)
smoking.to_csv("smoking.csv", index = False)

### Clean smoking data to get smoking status (current, non-, former smoker)

In [None]:
# Parse the raw measuredate strings (day / abbreviated month / year) a single
# time. Anything that does not fit the pattern is coerced to NaT; per the
# original author's check these were all NaN's and '01jan3344', which cannot
# be corrected.
parsed_measuredate = pd.to_datetime(smoking["measuredate"], format = "%d%b%Y", errors = 'coerce')

# distinct raw values that failed to parse (kept for inspection)
not_datetime_smoking = smoking.measuredate[parsed_measuredate.isna()].unique()

# replace the raw strings with the parsed datetimes
smoking["measuredate"] = parsed_measuredate

# order records by patient, then chronologically
smoking = smoking.sort_values(["e_patid", "measuredate"])

# distinct smoking terms, and how often each one occurs (most frequent first);
# the integer index assigned by reset_index is kept through the sort
see_unique_smoking = smoking.term.unique()
term_counts_smoking = (
    smoking.groupby('term')
    .size()
    .reset_index(name = 'counts')
    .sort_values(by = 'counts', ascending = False)
)

# restrict the terms to ones that can be converted to current, former, non-smoker
# NOTE(review): the labels below are row labels of term_counts_smoking and so
# depend on the exact set of terms present in this data extract -- confirm
# they still point at the intended terms whenever the input file changes.
most_freq_smoking_terms = term_counts_smoking.drop(index = [69, 0, 64, 1, 63, 72, 24, 70, 20, 68, 77, 27, 4, 78, 22, 79])

# Lookup from a raw smoking term to one of the three categories that are
# turned into numeric codes later in this notebook: 'Non-smoker',
# 'Current smoker' and 'Former smoker'.
#
# Every value must be exactly one of those three labels; any other label
# (previously 'Smoker' for 'Cigarette smoker' and 'Current smoker') matches
# none of the category checks and silently ends up with a missing status.
#
# Lines marked "unmapped" record terms deliberately left without a category,
# together with the numeric label noted by the original author.
conversion_rates_smoking = {
    'Ex-smoker' : 'Former smoker',
    'Smoking cessation advice' : 'Current smoker',
    'Current non-smoker' : 'Non-smoker',
    'Cigarette smoker' : 'Current smoker',
    'Current smoker' : 'Current smoker',
    'Stopped smoking' : 'Former smoker',
    'Ex-cigarette smoker' : 'Former smoker',
    'Ex-smoker - amount unknown' : 'Former smoker',
    'Ex-heavy cigarette smoker (20-39/day)' : 'Former smoker',
    'Non-smoker' : 'Non-smoker',
    'Date ceased smoking' : 'Former smoker',
    'Seen by smoking cessation advisor' : 'Current smoker',
    'Rolls own cigarettes' : 'Current smoker',
    'Trying to give up smoking' : 'Current smoker',
    'Ex-very heavy cigarette smoker (40+/day)' : 'Former smoker',
    'Smoker' : 'Current smoker',
    'Cigar smoker' : 'Current smoker',
    'Pipe smoker' : 'Current smoker',
    'Referral to smoking cessation advisor' : 'Current smoker',
    'Referral to stop-smoking clinic' : 'Current smoker',
    'Ex-pipe smoker' : 'Former smoker',
    'Negotiated date for cessation of smoking' : 'Current smoker',
    'Smoking cessation advice declined' : 'Current smoker',
    'Smoking cessation milestones' : 'Former smoker',
    'Ex-cigar smoker' : 'Former smoker',
    'Smoking cessation therapy' : 'Former smoker',
    'Nicotine replacement therapy' : 'Former smoker',
    'Not interested in stopping smoking' : 'Current smoker',
    'Ready to stop smoking' : 'Current smoker',
    'Smoking started' : 'Current smoker',
    'Occasional smoker' : 'Current smoker',
    'Tobacco dependence' : 'Current smoker',
    'Smoking cessation programme start date' : 'Former smoker',
    'Thinking about stopping smoking' : 'Current smoker',
    'Smoker - amount smoked' : 'Current smoker',
    # unmapped: 69 Smoking status at 4 weeks
    'Referral to National Health Service stop smoking service' : 'Current smoker',
    'Referral for smoking cessation service offered' : 'Current smoker',
    # unmapped: 0 Brief intervention for smoking cessation
    'Keeps trying to stop smoking' : 'Current smoker',
    'Ex roll-up cigarette smoker' : 'Former smoker',
    'Smoking restarted' : 'Current smoker',
    # unmapped: 64 Smoking free weeks
    'Recently stopped smoking' : 'Former smoker',
    'Smoking cessation drug therapy' : 'Former smoker',
    'Non-smoker annual review' : 'Non-smoker',
    # unmapped: 71 Smoking status between 4 and 52 weeks
    'Smoking reduced' : 'Current smoker',
    'Nicotine replacement therapy using nicotine patch' : 'Former smoker',
    # unmapped: 1 Carbon monoxide reading at 4 weeks
    # unmapped: 63 Smoking cessation therapy NOS
    'Ex-smoker annual review' : 'Former smoker',
    'Stop smoking service opportunity signposted' : 'Current smoker',
    'Nicotine replacement therapy provided free' : 'Former smoker',
    'Failed attempt to stop smoking' : 'Current smoker',
    # unmapped: 72 Stop smoking face to face follow-up
    # unmapped: 24 Lost to smoking cessation follow-up
    # unmapped: 70 Smoking status at 52 weeks
    'Practice based smoking cessation programme start date' : 'Former smoker',
    'Non-smoker annual review - enhanced services administration' : 'Non-smoker',
    'Nicotine replacement therapy using nicotine inhalator' : 'Former smoker',
    'Nicotine withdrawal' : 'Current smoker',
    'Other specified smoking cessation therapy' : 'Former smoker',
    'Smoking cessation advice provided by community pharmacist' : 'Current smoker',
    'Over the counter nicotine replacement therapy' : 'Former smoker',
    'Tobacco user' : 'Current smoker',
    'Nicotine replacement therapy using nicotine gum' : 'Former smoker',
    # unmapped: 20 Fagerstrom Test for Nicotine Dependence total score
    'Nicotine replacement therapy using nicotine lozenge' : 'Former smoker',
    'Current smoker annual review' : 'Current smoker',
    # unmapped: 68 Smoking status at 12 weeks
    'Reason for restarting smoking' : 'Current smoker',
    # unmapped: 77 Tobacco dependence NOS
    'Smoking cessation 12 week follow-up' : 'Former smoker',
    # unmapped: 27 Nicotine replacement therapy contraindicated
    'Ex-smoker annual review - enhanced services administration' : 'Former smoker',
    # unmapped: 4 Counselling about tobacco use
    'Nicotine replacement therapy provided by community pharmacist' : 'Former smoker',
    'Current smoker annual review - enhanced services admin' : 'Current smoker',
    # unmapped: 78 Tobacco dependence, continuous
    # unmapped: 22 History of tobacco use
    # unmapped: 79 Tobacco dependence, unspecified
}

# rows whose raw term survived the restriction to convertible terms
common_terms_smoking = smoking['term'].isin(most_freq_smoking_terms['term'])

# Translate every raw term through the lookup, then blank out (NaN) the rows
# whose term was not among the retained ones. Retained terms that have no
# entry in the lookup also come out as NaN from the mapping itself.
smoking['term'] = smoking['term'].map(conversion_rates_smoking).where(common_terms_smoking)

# numeric encoding of the three categories; anything else stays NaN
status_codes = {'Non-smoker': 0, 'Current smoker': 1, 'Former smoker': 2}
conditions = [smoking['term'] == label for label in status_codes]
values = list(status_codes.values())

smoking['smoking_status'] = np.select(conditions, values, default = np.nan)

# overwrite the saved smoking file with the categorised version
# NOTE(review): this chdir is relative to the current working directory
DIR = "cleaned_files"
os.chdir(DIR)

smoking.to_csv('smoking.csv', index = False)

### Sort duplicates of patient IDs

In [None]:
# keep only the columns of the smoking frame that are needed from here on
smoking = smoking[["e_patid", "term", "smoking_status", "measuredate"]]

# Join outcome and smoking records on patient id (default inner join, so one
# row per outcome/smoking-record pair), make sure both date columns are
# datetimes (unparseable measuredates become NaT), and order the result by
# patient id and date of smoking measurement.
merged = (
    outcome.merge(smoking, on = "e_patid")
    .assign(
        obsdate = lambda frame: pd.to_datetime(frame["obsdate"]),
        measuredate = lambda frame: pd.to_datetime(frame["measuredate"], errors = "coerce"),
    )
    .sort_values(["e_patid", "measuredate"])
)

### Try different fixed time windows to assess % missingness

In [None]:
# For each look-back window (in years before obsdate) pick one smoking record
# per patient, attach it to the outcome table, and save one CSV per window so
# the share of missing smoking_status values can be compared across windows.
#
# NOTE(review): this chdir is relative to the current working directory
DIR = 'cleaned_files'
os.chdir(DIR)

# Sort by patient ID and measurement date
merged = merged.sort_values(['e_patid', 'measuredate'])

# Create empty dictionary to store data frames for different time windows
dfs = {}

# Loop over different time windows
for time_window in [1, 2, 3, 4, 5, 10]:

    # Create a copy of the data frame for the current time window
    merged_window = merged.copy()

    # Ensure the dates are in valid datetime format
    merged_window['measuredate'] = pd.to_datetime(merged_window['measuredate'], errors = 'coerce')
    merged_window['obsdate'] = pd.to_datetime(merged_window['obsdate'])

    # Create a start date for the time window range (use obsdate as the end)
    merged_window['date_start'] = merged_window['obsdate'] - pd.DateOffset(years=time_window)

    # Get T/F array of values that are in the timeframe (T) and that are not (F);
    # rows with a missing measuredate compare False and so count as outside
    is_between_dates = (merged_window['measuredate'] >= merged_window['date_start']) & (merged_window['measuredate'] <= merged_window['obsdate'])

    # Set values outside this valid window to NaN (because they are missing if they are not in the window!)
    merged_window.loc[~is_between_dates, 'smoking_status'] = np.nan

    # Keep one row per patient: the measurement closest to obsdate.
    # Rows that still carry a status are ranked ahead of rows without one
    # (False sorts before True). Otherwise a record just outside the window,
    # whose status was blanked above, could be kept instead of a valid
    # in-window record that is further from obsdate, and the patient would be
    # counted as missing.
    merged_window['days_diff'] = abs(merged_window['measuredate'] - merged_window['obsdate']).dt.days
    merged_window['status_missing'] = merged_window['smoking_status'].isna()
    merged_window = merged_window.sort_values(['e_patid', 'obsdate', 'status_missing', 'days_diff'], ascending=[True, True, True, True])
    merged_window = merged_window.drop_duplicates(subset=['e_patid'], keep='first')

    # Only keep relevant columns, then attach to every outcome row (left join:
    # patients without any smoking record get NaN)
    merged_window = merged_window[["e_patid", "term", "smoking_status", "measuredate"]]
    merged_outcome = pd.merge(outcome, merged_window, on = 'e_patid', how = 'left')

    # Save data frame to dictionary
    dfs[f"{time_window}yr"] = merged_outcome[['e_patid', 'term', 'obsdate', 'smoking_status']]

# Save data frames to CSV files in separate folders (created if absent)
for folder_name, df in dfs.items():
    os.makedirs(folder_name, exist_ok=True)
    df.to_csv(os.path.join(folder_name, 'smoking_no_duplicates_merged.csv'), index=False)

### Now try closest measurement to index date to assess % missingness

In [None]:
# try to see what closest measuredate to obsdate gets us in terms of missing values
# (no fixed look-back window: any measurement on or before obsdate is eligible)

merged = merged.sort_values(['e_patid', 'measuredate'])

merged_window = merged.copy()

# Get T/F array of values that are in the timeframe (T) and that are not (F);
# rows with a missing measuredate compare False and so count as outside
is_between_dates = (merged_window['measuredate'] <= merged_window['obsdate'])

# Set values outside this valid window to NaN (because they are missing if they are not in the window!)
merged_window.loc[~is_between_dates, 'smoking_status'] = np.nan

# Keep one row per patient: the measurement closest to obsdate.
# Rows that still carry a status are ranked ahead of rows without one (False
# sorts before True). Otherwise a measurement taken shortly *after* obsdate,
# whose status was blanked above, could be kept instead of a valid earlier
# measurement, and the patient would be counted as missing.
merged_window['days_diff'] = abs(merged_window['measuredate'] - merged_window['obsdate']).dt.days
merged_window['status_missing'] = merged_window['smoking_status'].isna()
merged_window = merged_window.sort_values(['e_patid', 'obsdate', 'status_missing', 'days_diff'], ascending = [True, True, True, True])
merged_window = merged_window.drop_duplicates(subset = ['e_patid'], keep = 'first')

# attach the selected record to every outcome row (left join: patients
# without any smoking record get NaN)
merged_window = merged_window[["e_patid", "smoking_status", "measuredate"]]
merged_outcome = pd.merge(outcome, merged_window, on = 'e_patid', how = 'left')

# NOTE(review): relative chdir with a Windows-style separator -- confirm the
# intended location
DIR = 'cleaned_files\\Closest'
os.chdir(DIR)

merged_outcome.to_csv('smoking_status_no_duplicates_merged.csv', index = False)

# number of outcome rows still lacking a smoking status (cell output)
merged_outcome['smoking_status'].isna().sum()

### Reduce missingness for smoking by categorizing terms

In [None]:
# patient ids of outcome rows that still have no smoking status
missing_status_rows = merged_outcome['smoking_status'].isna()
smoking_comp = merged_outcome.loc[missing_status_rows, 'e_patid'].to_frame()

# all categorised smoking records (any date) belonging to those patients
smoking_null = smoking[smoking['e_patid'].isin(smoking_comp['e_patid'])]

# patient ids split by the category found on any of their records
smoking_null_current = smoking_null.loc[smoking_null.term == 'Current smoker', 'e_patid']
smoking_null_former = smoking_null.loc[smoking_null.term == 'Former smoker', 'e_patid']
smoking_null_non = smoking_null.loc[smoking_null.term == 'Non-smoker', 'e_patid']

# Fill in the status code per group. The groups are applied in this order, so
# for a patient appearing in several groups the later assignment wins
# (former over current over non-smoker).
fill_order = (
    (0, smoking_null_non),
    (1, smoking_null_current),
    (2, smoking_null_former),
)
for status_code, patient_ids in fill_order:
    in_group = merged_outcome['e_patid'].isin(patient_ids)
    merged_outcome.loc[in_group, 'smoking_status'] = status_code

# remaining patients without any recorded value are assumed to be non-smokers
merged_outcome['smoking_status'] = merged_outcome['smoking_status'].fillna(0)

# save under a new file name so the earlier version stays available
# NOTE(review): relative chdir -- confirm the intended location
DIR = 'cleaned_files\\Closest'
os.chdir(DIR)

merged_outcome.to_csv('smoking_status_no_duplicates_merged_REDUCED.csv', index = False)