# Summary
Clean and merge raw csv files into one file for analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

# Demographics

In [None]:
# Load the raw demographics table into the working frame `df`
df = pd.read_csv(filepath_or_buffer='../Data/demographics.csv')

## Age

In [None]:
# calculate age using date of birth
# Date_of_Birth is divided by -365 to express it in years.
# NOTE(review): presumably Date_of_Birth is a negative day offset relative to
# study start — confirm against the data dictionary.
df['Date_of_Birth'] = df['Date_of_Birth'] / -365
# Keep the recorded Age where present; otherwise fall back to the value derived
# from Date_of_Birth. (Replaces the deprecated `fillna(method='ffill', axis=1)`.)
df['Age'] = df['Age'].fillna(df['Date_of_Birth'])

## Sex

In [None]:
# Binary-encode sex: 'Male' -> 1, 'Female' -> 0 (any other value is left as-is)
sex_codes = {'Male': 1, 'Female': 0}
df['Sex'] = df['Sex'].replace(sex_codes)

## Race

In [None]:
# Encode race
race_columns = ['Race_Americ_Indian_Alaska_Native', 'Race_Asian', 'Race_Black_African_American',
                'Race_Hawaiian_Pacific_Islander', 'Race_Caucasian', 'Race_Other']
# Row-wise forward fill: afterwards the last column holds the right-most
# non-null race flag of each row, or NaN when no race column was filled in.
# (`DataFrame.ffill` replaces the deprecated `fillna(method='ffill')`.)
df['race'] = df[race_columns].ffill(axis=1)['Race_Other']
# For rows where a race was recorded, the remaining blank flags become 0;
# rows with no race information keep NaN in every race column.
has_race = df['race'] == 1
df.loc[has_race, race_columns] = df.loc[has_race, race_columns].fillna(0)

In [None]:
# Remove columns that are not carried forward (including the helper 'race'
# column), then keep only rows where both age and sex are known.
unused_demographic_columns = ['Demographics_Delta', 'Date_of_Birth', 'Ethnicity',
                              'Race_Unknown', 'Race_Other_Specify', 'race']
df = (
    df
    .drop(columns=unused_demographic_columns)
    .dropna(subset=['Age', 'Sex'])
)

In [None]:
# Checkpoint the cleaned demographics (row index is written as the first column)
df.to_csv(path_or_buf='ALSdatacleaned.csv')

# ALS history

In [None]:
# Load the raw ALS history table
history = pd.read_csv(filepath_or_buffer='../Data/AlsHistory.csv')

In [None]:
# Give the site-of-onset indicator columns short names
history = history.rename(columns={
    'Site_of_Onset___Bulbar': 'site_bulbar',
    'Site_of_Onset___Limb': 'site_limb',
    'Site_of_Onset___Spine': 'site_spine',
    'Site_of_Onset___Other': 'site_other',
})

# One-hot encode the free-text 'Site_of_Onset' field into the indicator columns.
# 'Onset: Limb and Bulbar' sets both the limb and the bulbar flag.
onset_label_to_flag = [
    ('Onset: Limb', 'site_limb'),
    ('Onset: Bulbar', 'site_bulbar'),
    ('Onset: Limb and Bulbar', 'site_limb'),
    ('Onset: Limb and Bulbar', 'site_bulbar'),
    ('Onset: Other', 'site_other'),
    ('Onset: Spine', 'site_spine'),
]
for onset_label, flag_column in onset_label_to_flag:
    history.loc[history['Site_of_Onset'] == onset_label, flag_column] = 1

In [None]:
# clean up columns
history = history.drop(columns=['Site_of_Onset___Other_Specify','Site_of_Onset___Limb_and_Bulbar','Subject_ALS_History_Delta','Disease_Duration','Symptom','Symptom_Other_Specify','Location',
                                'Location_Other_Specify','Site_of_Onset'])
history = history.drop_duplicates()
# Collapse to one row per subject by averaging the remaining columns (NaNs are
# skipped). `.mean()` is what `agg(np.nanmean)` currently resolves to; passing
# the NumPy callable is deprecated.
history = history.groupby('subject_id').mean().reset_index()
# Helper column: row-wise forward fill leaves the right-most non-null onset flag
# in 'site_spine' (NaN only when every flag is missing).
# (`DataFrame.ffill` replaces the deprecated `fillna(method='ffill')`.)
site_columns = ['site_bulbar', 'site_limb', 'site_other', 'site_spine']
history['site'] = history[site_columns].ffill(axis=1)['site_spine']

In [None]:
# Keep only subjects with at least one onset-site flag (helper column 'site'
# is non-null), zero-fill the flags that were left blank, then discard the helper.
onset_site_columns = ['site_bulbar', 'site_limb', 'site_other', 'site_spine']
history = history.dropna(subset=['site'])
history[onset_site_columns] = history[onset_site_columns].fillna(0)
history = history.drop(columns='site')

In [None]:
# Attach ALS history to demographics; the left join keeps every demographics row
df = pd.merge(df, history, how='left', on='subject_id')

In [None]:
# Rows with NaN in 'site_limb' after the left merge had no site-of-onset info; drop them
df = df.dropna(axis=0, subset=['site_limb'])

In [None]:
# Checkpoint: demographics + ALS history (overwrites the previous checkpoint)
df.to_csv(path_or_buf='ALSdatacleaned.csv')

# Riluzole

In [None]:
# Load the raw riluzole-use table
riluzole = pd.read_csv(filepath_or_buffer='../Data/Riluzole.csv')

In [None]:
# The riluzole delta is assumed to be the assessment date rather than the date
# the subject started riluzole, so only the yes/no 'Subject_used_Riluzole'
# column is used: 'Yes' -> 1, 'No' -> 0.
riluzole_codes = {'Yes': 1, 'No': 0}
riluzole['RiluzoleUse'] = riluzole['Subject_used_Riluzole'].replace(riluzole_codes)
riluzole_flags = riluzole[['subject_id', 'RiluzoleUse']]
df = pd.merge(df, riluzole_flags, how='left', on='subject_id')

In [None]:
# Checkpoint: static variables including riluzole use
df.to_csv(path_or_buf='ALSdatacleaned.csv')

# Vital signs

In [None]:
# Load the raw vital signs table
vitalsigns = pd.read_csv(filepath_or_buffer='../Data/VitalSigns.csv')

In [None]:
# Drop temperature and every Baseline_*/Endpoint_* summary column; only the
# per-visit measurements are kept.
summary_measures = ['Standing_BP_Diastolic', 'Standing_BP_Systolic',
                    'Supine_BP_Diastolic', 'Supine_BP_Systolic',
                    'Supine_Pulse', 'Standing_Pulse', 'Weight']
irrelevant_columns = ['Temperature', 'Temperature_Units']
for prefix in ('Baseline', 'Endpoint'):
    irrelevant_columns += [prefix + '_' + measure for measure in summary_measures]
vitalsigns = vitalsigns.drop(columns=irrelevant_columns)

## Clean measurement units

In [None]:
# Convert heights recorded in inches to centimetres (1 in = 2.54 cm)
height_in_inches = vitalsigns['Height_Units'] == 'Inches'
vitalsigns.loc[height_in_inches, 'Height'] = vitalsigns.loc[height_in_inches, 'Height'] * 2.54

In [None]:
# Convert weights recorded in pounds to kilograms (1 lb = 0.45359237 kg)
weight_in_pounds = vitalsigns['Weight_Units'] == 'Pounds'
vitalsigns.loc[weight_in_pounds, 'Weight'] = vitalsigns.loc[weight_in_pounds, 'Weight'] * 0.45359237

In [None]:
# Resolve outliers
# (Candidates can be listed with `vitalsigns[vitalsigns['Weight'] > 160]`; the
# former bare expression here was not the last statement of the cell, so it
# displayed nothing and has been removed.)
# NOTE(review): presumably this one reading was entered in pounds without being
# labelled as such — confirm before relying on the correction.
outlier_visit = (vitalsigns['subject_id'] == 456990) & (vitalsigns['Vital_Signs_Delta'] == 98)
vitalsigns.loc[outlier_visit, 'Weight'] = vitalsigns.loc[outlier_visit, 'Weight'] * 0.45359237

In [None]:
# The unit columns are no longer needed once heights and weights are converted
unit_columns = [
    'Blood_Pressure_Diastolic_Units',
    'Blood_Pressure_Systolic_Units',
    'Height_Units',
    'Pulse_Units',
    'Respiratory_Rate_Units',
    'Weight_Units',
]
vitalsigns = vitalsigns.drop(columns=unit_columns)

## Fill missing values from other columns/rows

For missing pulse and BP, fill with the average of the supine and standing pulse/BP

In [None]:
# Average the supine and standing readings of each measure into a helper column.
# The row-wise mean skips NaN, so a single available posture is used as-is.
posture_pairs = {
    'pulse_avg': ['Supine_Pulse', 'Standing_Pulse'],
    'bp_dia_avg': ['Supine_BP_Diastolic', 'Standing_BP_Diastolic'],
    'bp_sys_avg': ['Supine_BP_Systolic', 'Standing_BP_Systolic'],
}
for avg_column, posture_columns in posture_pairs.items():
    vitalsigns[avg_column] = vitalsigns[posture_columns].mean(axis=1)

In [None]:
# Prefer the directly recorded value; where it is missing, fall back to the
# supine/standing average computed above.
# (`Series.fillna(other)` replaces the deprecated `fillna(method='ffill', axis=1)`.)
vitalsigns['Pulse'] = vitalsigns['Pulse'].fillna(vitalsigns['pulse_avg'])
vitalsigns['BP_Diastolic'] = vitalsigns['Blood_Pressure_Diastolic'].fillna(vitalsigns['bp_dia_avg'])
vitalsigns['BP_Systolic'] = vitalsigns['Blood_Pressure_Systolic'].fillna(vitalsigns['bp_sys_avg'])

In [None]:
# Drop the raw posture-specific columns, the original BP columns and the helper
# averages; 'Pulse', 'BP_Diastolic' and 'BP_Systolic' now carry the values.
superseded_columns = [
    'Blood_Pressure_Diastolic', 'Blood_Pressure_Systolic',
    'Supine_Pulse', 'Standing_Pulse',
    'Supine_BP_Diastolic', 'Supine_BP_Systolic',
    'Standing_BP_Diastolic', 'Standing_BP_Systolic',
    'pulse_avg', 'bp_dia_avg', 'bp_sys_avg',
]
vitalsigns = vitalsigns.drop(columns=superseded_columns)

In [None]:
# 3 subjects with very different height values across deltas
# (found as subjects whose per-subject standard deviation of Height exceeds 4;
# the former inspection expression was not displayed and ran the groupby twice,
# so it has been removed).
# For these subjects, pick the last delta=0 measurement
corrected_heights = {315843: 188, 482961: 175, 759362: 154}
for corrected_subject, corrected_height in corrected_heights.items():
    vitalsigns.loc[vitalsigns['subject_id'] == corrected_subject, 'Height'] = corrected_height

In [None]:
# For the rest of the subjects, height is the mean of all height measurements.
# Every row of a subject receives that subject's mean (NaNs skipped). The string
# 'mean' is what `np.nanmean` currently resolves to inside transform; passing
# the NumPy callable is deprecated.
vitalsigns['Height'] = vitalsigns.groupby('subject_id')['Height'].transform('mean')

In [None]:
# Remove fully identical rows, keeping the first occurrence of each
vitalsigns = vitalsigns.drop_duplicates(keep='first')

## Merge height

Merge height first since height should be the same regardless of delta

In [None]:
# One (subject_id, Height) pair per distinct combination; Height was already
# replaced by the per-subject mean, so this is the static height table.
height = vitalsigns[['subject_id','Height']].drop_duplicates()
df = df.merge(height, how='left', on='subject_id')
df.to_csv('ALSdatacleaned.csv')
# The "Merge static and dynamic vars" section reloads the static table from
# 'ALS_staticvars.csv'; write it here so a fresh Restart & Run All finds it.
df.to_csv('ALS_staticvars.csv')

## Further clean vital signs

Drop entries with no delta, drop the height column, then save all dynamic variables in another file

In [None]:
# Keep only rows with a visit delta, drop the static Height column, rename the
# delta column and renumber rows from 0.
vitalsigns = (
    vitalsigns
    .dropna(subset=['Vital_Signs_Delta'])
    .drop(columns=['Height'])
    .rename(columns={'Vital_Signs_Delta': 'delta'})
    .reset_index(drop=True)
)
# outlier weight from subject 323840
# NOTE(review): 21378 is a positional row label that depends on the exact rows
# surviving the steps above — verify it still points at the intended record.
vitalsigns = vitalsigns.drop(index=[21378])

In [None]:
# Average values from entries with same delta (NaNs are skipped).
# `.mean()` is what `agg(np.nanmean)` currently resolves to; passing the NumPy
# callable is deprecated.
vitalsigns = vitalsigns.groupby(['subject_id','delta']).mean().reset_index()

In [None]:
# Persist the per-visit (dynamic) variables; row index is written as the first column
vitalsigns.to_csv(path_or_buf='ALS_dynamicvars.csv')

# FVC  

In [None]:
# Load the raw forced vital capacity (FVC) table
fvc = pd.read_csv(filepath_or_buffer='../Data/Fvc.csv')

In [None]:
# Use the common 'delta' name for the visit offset and discard rows without one
fvc = fvc.rename(columns={'Forced_Vital_Capacity_Delta': 'delta'}).dropna(subset=['delta'])
# correct outlier: overwrite the normal value of one specific visit
fvc_outlier_visit = (fvc['subject_id'] == 56109) & (fvc['delta'] == 131)
fvc.loc[fvc_outlier_visit, 'Subject_Normal'] = 2.38

In [None]:
# average fvc from multiple trials
# Vectorised row-wise mean: NaN trials are skipped and a row with no trial at
# all yields NaN. Same result as `apply(np.nanmean, axis=1)`, without the
# per-row Python call or the "Mean of empty slice" warning on all-NaN rows.
fvc['FVC_abs'] = fvc[['Subject_Liters_Trial_1','Subject_Liters_Trial_2','Subject_Liters_Trial_3']].mean(axis=1)
fvc['FVC_perc'] = fvc[['pct_of_Normal_Trial_1','pct_of_Normal_Trial_2','pct_of_Normal_Trial_3']].mean(axis=1)

In [None]:
# Drop rows where both the absolute and the percent FVC are missing
fvc_value_columns = ['FVC_abs', 'FVC_perc']
fvc = fvc.dropna(how='all', subset=fvc_value_columns)

In [None]:
# Change all FVC_perc to proportion

# Where both an absolute FVC and a positive normal value exist, recompute the
# proportion as FVC_abs / Subject_Normal. Requiring FVC_abs (and a non-zero
# normal) keeps a valid reported percentage from being overwritten with NaN/inf
# when the ratio cannot be computed.
fvc_normal = fvc['Subject_Normal']
can_recompute = fvc['FVC_abs'].notnull() & (fvc_normal > 0)
fvc.loc[can_recompute, 'FVC_perc'] = fvc.loc[can_recompute, 'FVC_abs'] / fvc_normal[can_recompute]

# Every other row keeps its reported value; values above 1.5 are taken to be
# percentages and rescaled to a proportion.
reported_as_percent = ~can_recompute & (fvc['FVC_perc'] > 1.5)
fvc.loc[reported_as_percent, 'FVC_perc'] = fvc.loc[reported_as_percent, 'FVC_perc'] / 100

In [None]:
# Rename the normal-value column and keep only the keys plus the three FVC variables
fvc_output_columns = ['subject_id', 'delta', 'FVC_abs', 'FVC_normal', 'FVC_perc']
fvc = fvc.rename(columns={'Subject_Normal': 'FVC_normal'})
fvc = fvc[fvc_output_columns]

In [None]:
# Average values from entries with same delta (NaNs are skipped).
# `.mean()` is what `agg(np.nanmean)` currently resolves to; passing the NumPy
# callable is deprecated.
fvc = fvc.groupby(['subject_id','delta']).mean().reset_index()

In [None]:
# Outer-join FVC onto the vital signs so visits present in only one table are kept
visit_keys = ['subject_id', 'delta']
vitalsigns = pd.merge(vitalsigns, fvc, how='outer', on=visit_keys)

In [None]:
# Persist the dynamic variables, now including FVC
vitalsigns.to_csv(path_or_buf='ALS_dynamicvars.csv')

# Labs

In [None]:
# Load the raw laboratory results table
labs = pd.read_csv(filepath_or_buffer='../Data/Labs.csv')

## Get selected labs

In [None]:
# Unify the two spellings of total bilirubin under the capitalised name
test_name_fixes = {'Bilirubin (total)': 'Bilirubin (Total)'}
labs['Test_Name'] = labs['Test_Name'].replace(test_name_fixes)

In [None]:
# Lab tests retained for analysis, spelled exactly as in 'Test_Name'.
# The order matters: it is paired positionally with the short names in `labs_names`.
selected_labs = [
    'ALT(SGPT)', 'AST(SGOT)', 'Uric Acid', 'Blood Urea Nitrogen (BUN)',
    'Albumin', 'Absolute Neutrophil Count', 'Protein', 'CK',
    'Total Cholesterol', 'Triglycerides', 'HbA1c (Glycated Hemoglobin)',
    'Hemoglobin', 'Hematocrit', 'White Blood Cell (WBC)',
    'Red Blood Cells (RBC)', 'Creatinine', 'Sodium', 'Potassium',
    'Chloride', 'Glucose', 'Platelets', 'Absolute Eosinophil Count',
    'Alkaline Phosphatase', 'Bicarbonate', 'Calcium',
    'Absolute Lymphocyte Count', 'Absolute Monocyte Count',
    'Absolute Basophil Count', 'Bilirubin (Total)', 'Bilirubin (Direct)',
    'Bilirubin (Indirect)', 'Gamma-glutamyltransferase', 'Lymphocytes',
    'Monocytes', 'Basophils', 'Phosphorus', 'Eosinophils',
]

In [None]:
# look at selected labs only
# .copy() makes the filtered frame independent of the full table, so the column
# assignments that follow do not trigger SettingWithCopyWarning.
labs = labs[labs['Test_Name'].isin(selected_labs)].copy()

## Fix units

In [None]:
# drop all non-numeric test results
# Strip letters, commas, '+', '<', '>', '-' and whitespace, and treat a result
# that becomes empty as missing.
stripped_result = labs['Test_Result'].replace(r'(?i)[a-z,+<>\-\s]','', regex=True).replace('',np.nan)
# errors='coerce' turns any text that still cannot be parsed into NaN instead of
# aborting the cell with a ValueError; `assign` avoids writing into a slice.
labs = labs.assign(Test_Result=pd.to_numeric(stripped_result, errors='coerce').astype(float))
labs = labs.dropna(subset=['Test_Result'])

In [None]:
# Albumin results reported with a '%' unit are removed
albumin_percent_rows = labs.index[(labs['Test_Name'] == 'Albumin') & (labs['Test_Unit'] == '%')]
labs = labs.drop(index=albumin_percent_rows)

In [None]:
# Absolute basophil counts reported in '10E12/L' are scaled by 1e-3
# NOTE(review): the direction of this factor is taken from the original code — confirm.
basophil_e12 = (labs['Test_Name'] == 'Absolute Basophil Count') & (labs['Test_Unit'] == '10E12/L')
labs.loc[basophil_e12, 'Test_Result'] = labs.loc[basophil_e12, 'Test_Result'] * 1e-3

In [None]:
# Red blood cell counts reported in 'x10E12/L' are scaled by 1e-3
# NOTE(review): the direction of this factor is taken from the original code — confirm.
rbc_e12 = (labs['Test_Name'] == 'Red Blood Cells (RBC)') & (labs['Test_Unit'] == 'x10E12/L')
labs.loc[rbc_e12, 'Test_Result'] = labs.loc[rbc_e12, 'Test_Result'] * 1e-3

In [None]:
# RBC results strictly between 1e4 and 1e5 are dropped: they tend to be repeats
# and are out of range.
labs = labs.reset_index(drop=True)
is_rbc = labs['Test_Name'] == 'Red Blood Cells (RBC)'
rbc_out_of_range = is_rbc & (labs['Test_Result'] > 1e4) & (labs['Test_Result'] < 1e5)
labs = labs.drop(index=labs.index[rbc_out_of_range])

In [None]:
# Protein and platelet results without a unit are dropped, as the measurement
# is not the same; rows are renumbered before and after.
labs = labs.reset_index(drop=True)
unit_missing = labs['Test_Unit'].isnull()
needs_unit = labs['Test_Name'].isin(['Protein', 'Platelets'])
labs = labs.drop(index=labs.index[unit_missing & needs_unit])
labs = labs.reset_index(drop=True)

## Reshape dataframe

In [None]:
# Short column names for the selected labs, in the same order as `selected_labs`
# (the two lists are zipped together when the test names are replaced).
labs_names = [
    'ALT', 'AST', 'UricAcid', 'BUN', 'Albumin', 'AbsNeutroCount', 'Protein',
    'CK', 'TotCholesterol', 'Triglycerides', 'HbA1c', 'Hb', 'Hematocrit',
    'WBC', 'RBC', 'Creatinine', 'Sodium', 'Potassium', 'Chloride', 'Glucose',
    'Platelets', 'AbsEosinophil', 'AlkalinePhosphatase', 'Bicarbonate',
    'Calcium', 'AbsLymphocyte', 'AbsMonocyte', 'AbsBasophil',
    'BilirubinTotal', 'BilirubinDirect', 'BilirubinIndirect', 'GGT',
    'PercLymphocytes', 'PercMonocytes', 'PercBasophils', 'Phosphorus',
    'PercEosinophils',
]
# Map each full test name to its short name (pairs are formed positionally)
short_name_by_test = dict(zip(selected_labs, labs_names))
labs['Test_Name'] = labs['Test_Name'].replace(short_name_by_test)
# Use the common 'delta' name and require both a delta and a result
labs = labs.rename(columns={'Laboratory_Delta': 'delta'})
labs = labs.dropna(subset=['delta', 'Test_Result'])

In [None]:
# Reshape from long (one row per test result) to wide (one row per subject and
# delta, one column per lab), averaging repeated results of the same test.
# A single pivot_table replaces 37 sequential outer merges and the deprecated
# `agg(np.nanmean)`.
labs_pivot = (
    labs
    .pivot_table(index=['subject_id', 'delta'], columns='Test_Name',
                 values='Test_Result', aggfunc='mean')
    # fixed column order; a lab with no rows still gets an all-NaN column
    .reindex(columns=labs_names)
    # remove the 'Test_Name' label from the column axis
    .rename_axis(columns=None)
    .reset_index()
)

## Check and clean values

In [None]:
# convert assumed unit discrepancies
# Values outside the threshold are assumed to be in a different unit and are
# rescaled; thresholds and factors are heuristics from inspecting the data.
labs_pivot.loc[labs_pivot['Hematocrit']<1, 'Hematocrit'] *= 100
labs_pivot.loc[labs_pivot['Glucose']<1, 'Glucose'] *= 100
labs_pivot.loc[labs_pivot['Platelets']>1000, 'Platelets'] /= 1000
labs_pivot.loc[labs_pivot['Potassium']>20, 'Potassium'] /= 10

# RBC: individual hand-corrected values first, then magnitude-based rescaling.
# The statements are order-dependent and must stay in this sequence.
# (A duplicated copy of the 0.0345 correction has been removed.)
labs_pivot.loc[labs_pivot['RBC']==500,'RBC'] = 5000
labs_pivot.loc[labs_pivot['RBC']==4.63,'RBC'] = 4630
labs_pivot.loc[labs_pivot['RBC']==0.0345,'RBC'] = 3450
labs_pivot.loc[labs_pivot['RBC']<0.01,'RBC'] *= 1e6
labs_pivot.loc[(labs_pivot['RBC']>1e6) & (labs_pivot['RBC']<1e7),'RBC'] *= 1e-3
labs_pivot.loc[(labs_pivot['RBC']>1e9) & (labs_pivot['RBC']<1e10),'RBC'] *= 1e-6

In [None]:
# Blank out extreme high values: anything above mean + 50 standard deviations
# of its own column is printed for the record and then set to NaN.
labs_pivot = labs_pivot.reset_index(drop=True)
for lab_name in labs_names:
    lab_values = labs_pivot[lab_name]
    upper_cutoff = lab_values.mean() + 50 * lab_values.std()
    flagged = labs_pivot.loc[lab_values > upper_cutoff, ['subject_id', 'delta', lab_name]]
    if not flagged.empty:
        print(flagged)
        labs_pivot.loc[flagged.index, lab_name] = np.nan

In [None]:
# Second pass with the same rule: removing values changes each column's mean
# and standard deviation, so further values can now exceed mean + 50 std.
for lab_name in labs_names:
    lab_values = labs_pivot[lab_name]
    upper_cutoff = lab_values.mean() + 50 * lab_values.std()
    flagged = labs_pivot.loc[lab_values > upper_cutoff, ['subject_id', 'delta', lab_name]]
    if not flagged.empty:
        print(flagged)
        labs_pivot.loc[flagged.index, lab_name] = np.nan

In [None]:
# Persist the cleaned wide-format labs table
labs_pivot.to_csv(path_or_buf='ALS_labs_cleaned.csv')

## Merge with vitalsigns

In [None]:
# Reload the dynamic variables from disk (first CSV column is the saved row
# index), outer-join the labs on subject and delta, and write the file back.
dynamic_path = 'ALS_dynamicvars.csv'
vitalsigns = pd.read_csv(dynamic_path, index_col=[0])
vitalsigns = pd.merge(vitalsigns, labs_pivot, how='outer', on=['subject_id', 'delta'])
vitalsigns.to_csv(dynamic_path)

# ALSFRS

In [None]:
# Load the raw ALSFRS questionnaire table
alsfrs = pd.read_csv(filepath_or_buffer='../Data/alsfrs.csv')

In [None]:
# Administrative columns are not used in the analysis
administrative_columns = ['Mode_of_Administration', 'ALSFRS_Responded_By']
alsfrs = alsfrs.drop(administrative_columns, axis=1)

In [None]:
# Assign short column names positionally.
# NOTE(review): this relies on the CSV having exactly these 18 remaining columns
# in this order — verify against the raw file header.
question_columns = ['alsfrs_q' + item for item in
                    ('1', '2', '3', '4', '5a', '5b', '6', '7', '8', '9', '10')]
revised_columns = ['alsfrs_r' + item for item in ('1', '2', '3')]
alsfrs.columns = (['subject_id'] + question_columns
                  + ['delta', 'alsfrs', 'alsfrsr'] + revised_columns)

In [None]:
# average duplicates: one row per subject and delta (NaNs are skipped).
# `.mean()` is what `agg(np.nanmean)` currently resolves to; passing the NumPy
# callable is deprecated.
alsfrs = alsfrs.groupby(['subject_id','delta']).mean().reset_index()

In [None]:
# Discard visits where both total scores (alsfrs and alsfrsr) are missing,
# then renumber the rows from 0
alsfrs = (
    alsfrs
    .dropna(how='all', subset=['alsfrs', 'alsfrsr'])
    .reset_index(drop=True)
)

In [None]:
# Outer-join the ALSFRS scores onto the other dynamic variables by subject and delta
vitalsigns = pd.merge(vitalsigns, alsfrs, how='outer', on=['subject_id', 'delta'])

In [None]:
# Persist the dynamic variables, now including ALSFRS
vitalsigns.to_csv(path_or_buf='ALS_dynamicvars.csv')

# Merge static and dynamic vars

## Add and interpolate age and ALS history across deltas

In [None]:
# Reload both tables from disk; the first CSV column is the saved row index.
# NOTE(review): 'ALS_staticvars.csv' must already exist on disk — confirm that
# an earlier step of this notebook writes it.
static_path = 'ALS_staticvars.csv'
dynamic_path = 'ALS_dynamicvars.csv'
df = pd.read_csv(static_path, index_col=[0])
vitalsigns = pd.read_csv(dynamic_path, index_col=[0])

In [None]:
# Add age to dynamic vars
# The static Age is treated as the age at delta 0. `assign` builds a new frame,
# which avoids the SettingWithCopyWarning raised by adding a column to a slice.
age = df[['subject_id','Age']].assign(delta=0)
age.head()

In [None]:
# Age at each visit = age at delta 0 + delta / 365.
# One vectorised lookup replaces the per-subject loop, which re-scanned the
# whole frame for every subject. The first Age per subject is used (as with
# `.values[0]`); subjects without a static record get NaN.
baseline_age = age.drop_duplicates(subset='subject_id').set_index('subject_id')['Age']
vitalsigns['Age'] = vitalsigns['subject_id'].map(baseline_age) + vitalsigns['delta'] / 365

In [None]:
# Re-express onset and diagnosis deltas relative to each visit:
# static delta minus the visit's delta.
# Vectorised lookup instead of a per-subject loop; the first static row per
# subject is used (as with `.values[0]`), and subjects without a static record
# get NaN.
static_by_subject = df.drop_duplicates(subset='subject_id').set_index('subject_id')
for history_delta in ('Onset_Delta', 'Diagnosis_Delta'):
    vitalsigns[history_delta] = (
        vitalsigns['subject_id'].map(static_by_subject[history_delta]) - vitalsigns['delta']
    )

In [None]:
# Persist the dynamic variables with per-visit Age, Onset_Delta and Diagnosis_Delta
vitalsigns.to_csv(path_or_buf='ALS_dynamicvars.csv')

## Merge

In [None]:
# some cleaning before merging
# Fold spinal onset into the limb-onset indicator. The sum is clipped at 1 so a
# subject flagged for both sites does not end up with a value of 2.
df['site_limb'] = (df['site_limb'] + df['site_spine']).clip(upper=1)

In [None]:
# Drop static columns that are superseded or redundant before the merge:
# Race_Other is dropped because it means all the other race flags are 0, and the
# same holds for site_other.
redundant_static_columns = ['Age', 'site_spine', 'site_other', 'Race_Other',
                            'Onset_Delta', 'Diagnosis_Delta']
df = df.drop(columns=redundant_static_columns)

In [None]:
# Attach every visit row to its subject's static variables (left join on subject)
df = pd.merge(df, vitalsigns, how='left', on='subject_id')

In [None]:
# Fix the final column order. `reindex` drops any column not listed here and
# would create an all-NaN column for any listed name that is absent.
key_columns = ['subject_id', 'delta']
static_columns = [
    'Age', 'Sex', 'Race_Americ_Indian_Alaska_Native', 'Race_Asian',
    'Race_Black_African_American', 'Race_Hawaiian_Pacific_Islander',
    'Race_Caucasian', 'site_bulbar', 'site_limb', 'Onset_Delta',
    'Diagnosis_Delta', 'RiluzoleUse', 'Height',
]
vital_columns = ['Weight', 'Pulse', 'Respiratory_Rate', 'BP_Diastolic', 'BP_Systolic']
fvc_value_columns = ['FVC_abs', 'FVC_normal', 'FVC_perc']
lab_columns = [
    'ALT', 'AST', 'UricAcid', 'BUN', 'Albumin', 'AbsNeutroCount', 'Protein',
    'CK', 'TotCholesterol', 'Triglycerides', 'HbA1c', 'Hb', 'Hematocrit',
    'WBC', 'RBC', 'Creatinine', 'Sodium', 'Potassium', 'Chloride', 'Glucose',
    'Platelets', 'AbsEosinophil', 'AlkalinePhosphatase', 'Bicarbonate',
    'Calcium', 'AbsLymphocyte', 'AbsMonocyte', 'AbsBasophil',
    'BilirubinTotal', 'BilirubinDirect', 'BilirubinIndirect', 'GGT',
    'PercLymphocytes', 'PercMonocytes', 'PercBasophils', 'Phosphorus',
    'PercEosinophils',
]
alsfrs_columns = [
    'alsfrs_q1', 'alsfrs_q2', 'alsfrs_q3', 'alsfrs_q4', 'alsfrs_q5a',
    'alsfrs_q5b', 'alsfrs_q6', 'alsfrs_q7', 'alsfrs_q8', 'alsfrs_q9',
    'alsfrs_q10', 'alsfrs', 'alsfrsr', 'alsfrs_r1', 'alsfrs_r2', 'alsfrs_r3',
]
final_columns = (key_columns + static_columns + vital_columns
                 + fvc_value_columns + lab_columns + alsfrs_columns)
df = df.reindex(columns=final_columns)

In [None]:
# Drop subjects that have at most one non-null delta (a single visit or none)
deltas_per_subject = df.groupby('subject_id')['delta'].count()
single_visit_subjects = deltas_per_subject[deltas_per_subject <= 1].index
df = df[~df['subject_id'].isin(single_visit_subjects)]

In [None]:
# drop subjects with no single measurement of alsfrs or alsfrsr
# The per-subject mean is NaN only when every visit of that subject is NaN.
# `.mean()` is what `agg(np.nanmean)` currently resolves to; passing the NumPy
# callable is deprecated.
tmp = df.groupby('subject_id')[['alsfrs','alsfrsr']].mean()
subjects_without_score = tmp.index[tmp['alsfrs'].isnull() & tmp['alsfrsr'].isnull()]
df = df[~df['subject_id'].isin(subjects_without_score)]

In [None]:
# Order rows by subject and then by visit delta, and renumber the rows from 0
df = (
    df
    .sort_values(by=['subject_id', 'delta'])
    .reset_index(drop=True)
)

## Clean and add additional vars

BMI

In [None]:
# BMI = weight / height^2, with Height divided by 100 first
# (the notebook converts Height to cm and Weight to kg in the vital signs section)
height_m = df['Height'] / 100
df['BMI'] = df['Weight'] / (height_m ** 2)

In [None]:
# Write the final merged analysis table (row index is written as the first column)
df.to_csv(path_or_buf='ALSdatacleaned.csv')