In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta 
import statistics
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Columns kept for the rest of the notebook, spelled as they are *after* the
# rename applied below.
features = ['eye_id', 'is_irf_present', 'is_srf_present',
            'next_interval_in_weeks', 'admission_date', 'actual_drug_today',
            'visual_acuity', 'ur', 'laterality']
# Source-file headers mapped to the snake_case names used everywhere else.
column_renames = {"NextInt": "next_interval_in_weeks",
                  "Drug": "actual_drug_today",
                  "Laterality": "laterality"}
# Load, rename and project in one chain so no intermediate frame is mutated.
df = (
    pd.read_csv("~/Documents/github/paper/input/data.csv", low_memory=False)
    .rename(columns=column_renames)
    .loc[:, features]
)

def id_cleaner(df):
    """Give every (original ``eye_id``, laterality) pair its own integer id.

    The incoming ``eye_id`` is shared by rows of both lateralities, so each
    group is split into its 'Left' and 'Right' rows and every non-empty split
    receives the next consecutive integer (0, 1, 2, ...). Left is numbered
    before Right within a group; groups are visited in order of first
    appearance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``eye_id`` and ``laterality`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose laterality is 'Left' or 'Right' (any other
        value is dropped), grouped by new id, with ``eye_id`` overwritten.
        An empty frame with the same columns is returned when nothing matches.
    """
    frames = []
    next_id = 0
    for eye in df['eye_id'].unique():
        pdf = df[df['eye_id'] == eye]
        for side in ('Left', 'Right'):
            side_df = pdf[pdf['laterality'] == side]
            if side_df.empty:
                continue
            # The id is assigned only once the split is known to be non-empty.
            # Numbering the right eye as ``i + 1`` up front handed a right-only
            # group the id that the following group's left eye would reuse.
            frames.append(side_df.assign(eye_id=next_id))
            next_id += 1
    if not frames:
        # pd.concat([]) raises; hand back an empty frame with the same schema.
        return df.iloc[0:0].copy()
    return pd.concat(frames)

def time_sort(df):
    """Parse ``admission_date`` (day-first) and order each eye's rows by it.

    Eyes are emitted in order of first appearance of their ``eye_id``; inside
    each eye the rows are sorted by ascending admission date. The returned
    frame keeps the original index labels and carries ``admission_date`` as
    parsed datetimes.
    """
    def _sorted_visits(eye):
        # One eye at a time: parse its dates, then sort its rows by them.
        visits = df[df.eye_id == eye]
        parsed = pd.to_datetime(visits.admission_date, dayfirst=True)
        return visits.assign(admission_date=parsed).sort_values(by='admission_date')

    return pd.concat([_sorted_visits(eye) for eye in df.eye_id.unique()])

def raw_data(df):
    """Run the cleaning pipeline on the raw frame and return the result.

    Steps: ``id_cleaner`` (one id per eye), ``time_sort`` (chronological order
    per eye), a fresh 0..n-1 index, and finally the ``'nil'`` sentinel is turned
    into a real missing value (``NaN``) in every column.

    NOTE(review): the previous implementation called
    ``cleaned_df.replace(['nil', np.nan], inplace=True)``, i.e. a list of
    values to replace and *no* replacement value. pandas has treated that form
    as an implicit pad/forward fill and has since deprecated it, so the effect
    depended on the installed version. If forward-filling missing gradings was
    actually wanted, add it explicitly (and per eye, e.g.
    ``groupby('eye_id').ffill()``) so it cannot leak across eyes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame accepted by ``id_cleaner`` and ``time_sort``.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy; ``df`` itself is not modified by this function.
    """
    cleaned_df = time_sort(id_cleaner(df)).reset_index(drop=True)
    # One explicit, frame-wide replacement. It also covers actual_drug_today,
    # which used to be handled by a chained in-place replace on a column
    # selection; that pattern does not reliably write back to the frame.
    cleaned_df = cleaned_df.replace('nil', np.nan)
    return cleaned_df

In [6]:
# Number of rows in the loaded frame.
df.shape[0]

31382

In [8]:
# Share of rows whose is_srf_present is missing. Computed from the frame
# instead of dividing a hand-copied NaN count (24994) by len(df), so the figure
# cannot go stale when the input changes.
df['is_srf_present'].isna().mean()

0.7964438212988337

In [5]:
# Missing-value count per column.
df.isnull().sum()

eye_id                        0
is_irf_present            24980
is_srf_present            24994
next_interval_in_weeks    12904
admission_date                0
actual_drug_today          3527
visual_acuity               776
ur                            0
laterality                    0
dtype: int64

In [30]:
# Clean the raw frame once and persist it; save_df_patients() reads this file
# back instead of re-running the cleaning step.
cleaned_df = raw_data(df)
cleaned_df.to_csv(path_or_buf="raw_data_cleaned.csv", index=False)

def patient_cutoff(df, cutoff_year, cutoff_visits):
    """Keep only eyes with enough follow-up time and enough visits.

    An eye is kept when the time between its earliest and latest
    ``admission_date`` is at least ``cutoff_year`` years (a year counted as 365
    days) and it has at least ``cutoff_visits`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``eye_id`` and ``admission_date`` (anything ``pd.to_datetime``
        can parse). Not modified.
    cutoff_year : float
        Minimum follow-up span in years.
    cutoff_visits : int
        Minimum number of rows per eye.

    Returns
    -------
    pandas.DataFrame
        Rows of the qualifying eyes, eyes in order of first appearance. An
        empty frame with the same columns is returned when no eye qualifies.
    """
    frames = []
    for _, pdf in df.groupby('eye_id', sort=False):
        dates = pd.to_datetime(pdf['admission_date'])
        # max - min rather than last - first: the span no longer depends on the
        # rows already being in chronological order.
        span_years = (dates.max() - dates.min()).days / 365
        if span_years >= cutoff_year and len(pdf) >= cutoff_visits:
            frames.append(pdf)
    if not frames:
        # pd.concat([]) raises; hand back an empty frame with the same schema.
        return df.iloc[0:0].copy()
    return pd.concat(frames)

def cut_time(df, cutoff_time):
    """Trim every eye's history to ``cutoff_time`` years after its first row.

    For each ``eye_id`` (in order of first appearance) the admission dates are
    parsed, the date of the eye's first row is taken as the start, and only
    rows dated no later than ``start + cutoff_time * 365`` days are kept. The
    returned frame carries ``admission_date`` as parsed datetimes.
    """
    kept = []
    for eye in df.eye_id.unique():
        visits = df[df.eye_id == eye]
        visits = visits.assign(admission_date=pd.to_datetime(visits.admission_date))
        # The first row is taken as treatment start; rows are not re-sorted here.
        start = visits['admission_date'].iloc[0]
        deadline = start + timedelta(days=cutoff_time * 365)
        within = visits[visits['admission_date'] <= deadline]
        if len(within) > 0:
            kept.append(within)
    return pd.concat(kept)
    
def impute_pdf(df):
    """Fill missing values column by column with the column's most frequent value.

    This is a pandas-only replacement for the earlier ``SimpleImputer`` call:
    that name is not imported by the notebook's import cell, so the function
    raised ``NameError`` when called. Ties between equally frequent values
    resolve to the smallest one (``Series.mode`` returns its modes sorted).
    Columns with no observed value at all, and anything else still missing, are
    filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed copy with the same columns and index as ``df``.
    """
    imputed_df = df.copy()
    for col in imputed_df.columns:
        modes = imputed_df[col].mode(dropna=True)
        if not modes.empty:
            imputed_df[col] = imputed_df[col].fillna(modes.iloc[0])
    # Columns that were entirely missing have no mode; fall back to 0.
    return imputed_df.fillna(0)

def column_names(i):
    """Return the four per-visit column labels for visit number ``i``."""
    return [f'{prefix}_{i}' for prefix in ('va', 'irf', 'srf', 'int')]

def column_builder(i):
    """Build the header of the flattened feature row for ``i`` visits.

    The header is the per-visit labels for visits 1..i, then ``mean_vision``,
    ``std_vision`` and ``target_va``, with ``int_1`` taken out (the first visit
    has no preceding visit to measure an interval from).
    """
    per_visit = [name for visit in range(1, i + 1) for name in column_names(visit)]
    header = per_visit + ['mean_vision', 'std_vision', 'target_va']
    header.remove('int_1')
    return header

def reshape_pdf(pdf, n_visits):
    """Flatten the first ``n_visits`` rows of one eye into a single feature row.

    For each of those rows the output holds visual acuity, the IRF flag, the
    SRF flag and, from the second row on, the gap in days to the previous row.
    These are followed by the mean and the population standard deviation
    (``np.std`` default) of visual acuity over the same ``n_visits`` rows, and
    finally ``target_va``, the visual acuity of the eye's last row.

    Rows are consumed in their existing order, so ``pdf`` should already be
    sorted by date, and ``admission_date`` must hold datetime values.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Rows of a single eye with ``visual_acuity``, ``is_irf_present``,
        ``is_srf_present`` and ``admission_date``. Not modified.
    n_visits : int
        Number of leading rows turned into features.

    Returns
    -------
    pandas.DataFrame
        One row whose columns come from ``column_builder(n_visits)``.

    Raises
    ------
    IndexError
        If the eye has fewer than ``n_visits`` rows.
    """
    if len(pdf) < n_visits:
        raise IndexError(
            f"eye has {len(pdf)} visits, need at least {n_visits}")

    # Work on a copy: the caller's frame used to be modified in place.
    visits = pdf.copy()
    value_cols = ['visual_acuity', 'is_irf_present', 'is_srf_present']
    # Fill BEFORE the integer cast. The cast used to run first, so a single
    # missing flag raised and the later fillna(0) never got a chance to help.
    visits[value_cols] = visits[value_cols].fillna(0)
    visits['is_irf_present'] = visits['is_irf_present'].astype(int)
    visits['is_srf_present'] = visits['is_srf_present'].astype(int)

    nums = []
    for i in range(n_visits):
        nums.append(visits.visual_acuity.iloc[i])
        nums.append(visits.is_irf_present.iloc[i])
        nums.append(visits.is_srf_present.iloc[i])
        if i != 0:
            gap = visits.admission_date.iloc[i] - visits.admission_date.iloc[i - 1]
            nums.append(gap.days)

    # Summary statistics cover exactly the rows used as features. They used to
    # span n_visits + 1 rows, or the whole series (target row included) above
    # an n_visits threshold, which leaked later measurements into the inputs.
    observed_va = visits.visual_acuity.iloc[:n_visits]
    nums.append(np.mean(observed_va))
    nums.append(np.std(observed_va))

    # NOTE(review): when the eye has exactly n_visits rows, target_va equals
    # va_{n_visits}; confirm whether such eyes should be excluded upstream.
    nums.append(visits.visual_acuity.iloc[-1])
    return pd.DataFrame(data=[nums], columns=column_builder(n_visits))

def encode_gender(g):
    """Encode gender as an integer: 0 for the exact string "Male", 1 otherwise."""
    if g == "Male":
        return 0
    return 1

def reshape_df(df, n_visits):
    """Build the flattened feature table: one row per eye that can be reshaped.

    Every eye is passed to ``reshape_pdf``; eyes for which it raises are
    skipped (best effort).

    Parameters
    ----------
    df : pandas.DataFrame
        Visit-level frame with an ``eye_id`` column plus whatever
        ``reshape_pdf`` needs.
    n_visits : int
        Number of leading visits per eye turned into features.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the one-row frames returned by ``reshape_pdf``.

    Raises
    ------
    ValueError
        If no eye could be reshaped.
    """
    frames = []
    skipped = 0
    for eye in df.eye_id.unique():
        pdf = df[df.eye_id == eye]
        try:
            frames.append(reshape_pdf(pdf, n_visits))
        except Exception:
            # Still best effort, but no longer a bare ``except``: that also
            # swallowed KeyboardInterrupt, so a long run could not be stopped.
            skipped += 1
    if not frames:
        # Same exception type pd.concat([]) raised, with an actionable message.
        raise ValueError(
            f"no eye could be reshaped with n_visits={n_visits} "
            f"({skipped} skipped)")
    return pd.concat(frames)

def save_df_patients(n_years, n_visits=4, test=False, min_visits=4):
    """Build the per-eye feature table and write it to a CSV file.

    Pipeline: read the cleaned visit data, keep eyes with at least ``n_years``
    of follow-up and ``min_visits`` visits (``patient_cutoff``), trim each eye
    to its first ``n_years`` years (``cut_time``), flatten the first
    ``n_visits`` visits of every eye into one row (``reshape_df``), and save.

    Parameters
    ----------
    n_years : int or float
        Required follow-up, and the length of the window kept per eye.
    n_visits : int, default 4
        Number of leading visits turned into features.
    test : bool, default False
        Read ``raw_test_data_cleaned.csv`` and write ``test_{n_years}_years.csv``
        instead of the training files.
    min_visits : int, default 4
        Minimum number of visits an eye needs to be kept. This was a
        hard-coded 4; the default keeps existing calls unchanged.

    Returns
    -------
    None
        The result is written to disk.
    """
    if test:
        df = pd.read_csv("raw_test_data_cleaned.csv")
        df = df.drop(columns=['actual_drug_today', 'next_interval_in_weeks',
                              'InjNext', 'laterality'])
        # eye_id is kept under its own name: patient_cutoff, cut_time and
        # reshape_df all read ``eye_id``, so renaming it to ``id`` made this
        # branch fail on the very next step.
        # The fluid flags are set to 0 under the names reshape_pdf reads
        # (is_irf_present / is_srf_present); they used to be written to
        # ``irf`` / ``srf``, which nothing downstream looks at.
        df["is_irf_present"] = 0
        df["is_srf_present"] = 0
        out_path = f"test_{n_years}_years.csv"
    else:
        df = pd.read_csv("raw_data_cleaned.csv")
        out_path = f"df_{n_years}_years_{n_visits}_visits.csv"
    df = patient_cutoff(df, n_years, min_visits)
    df = cut_time(df, n_years)
    df = reshape_df(df, n_visits)
    df.to_csv(out_path, index=False)

In [31]:
# One 3-year feature table per history length, 2 to 8 visits.
for visit_count in range(2, 9):
    save_df_patients(n_years=3, n_visits=visit_count)

In [198]:
# Default-visit feature tables for 1, 2 and 3 years of follow-up.
for follow_up_years in (1, 2, 3):
    save_df_patients(follow_up_years)

# Gender and age

In [167]:
# Per-patient age and gender, keyed by `ur`.
# NOTE(review): this path spells the folder "GitHub" while the main data load
# uses "github"; confirm which one exists on case-sensitive filesystems.
ag = pd.read_csv(filepath_or_buffer="~/Documents/GitHub/paper/input/age_gender.csv")
ag

Unnamed: 0,ur,Gender,Age
0,18363,Female,74
1,ers21736,Male,80
2,ers27377,Male,73
3,ers22077,Male,84
4,ERS20676,Female,83
...,...,...,...
443,ERS29534,Female,87
444,ers29715,Male,84
445,ers29906,Female,72
446,ers25061,Male,93


In [168]:
# Median age across the demographics table.
np.median(ag["Age"])

83.0

In [169]:
# Seed the age / gender columns with placeholder values so the columns exist;
# the per-`ur` merge further down overwrites them with the looked-up values.
# NOTE(review): 90 and "Male" are arbitrary placeholders, not the fallbacks the
# merge uses for patients without demographics.
df['age'] = 90
df['gender'] = "Male"

In [82]:
# Attach age and gender to every visit row by matching on `ur`.
# Patients found in `ag` get the Age / Gender of their first `ag` row; patients
# that are not found get the mean age of `ag` and the gender "Female".
#
# The columns are created explicitly here. The earlier per-`ur` loop wrote them
# with attribute-style assignment on slices (`pdf.age = ...`), which only sets a
# column if that column already exists, so the result silently depended on an
# earlier cell having been run. It also rescanned both frames once per patient.
#
# NOTE(review): matching is exact; `ag.ur` shows mixed-case prefixes
# ("ers..." and "ERS..."), so confirm the two sources spell ids the same way.
ag_by_ur = ag.drop_duplicates(subset="ur", keep="first").set_index("ur")
has_demographics = df["ur"].isin(ag_by_ur.index)
df = df.assign(
    age=df["ur"].map(ag_by_ur["Age"]).where(has_demographics, ag["Age"].mean()),
    gender=df["ur"].map(ag_by_ur["Gender"]).where(has_demographics, "Female"),
)
# Keep the row order the loop produced: rows grouped by `ur` in order of first
# appearance, original order preserved inside each group.
ur_rank = pd.factorize(df["ur"])[0]
df = df.iloc[np.argsort(ur_rank, kind="stable")]