In [1]:
import pandas as pd
# Load the formatted survey responses for both years with a single mapped call.
df_2015, df_2019 = map(
    pd.read_csv,
    ('../SMTO_2015/Formatted.csv', '../SMTO_2019/Formatted.csv'),
)

In [2]:
# 2015 frame: rename the home-zone column to the shared name, then tag every
# row with its survey year, its original row position and a constant school type.
df_2015 = df_2015.rename(columns={'HomeZone': 'Home_Zone'}).assign(
    Source=2015,
    Source_Index=lambda frame: frame.index,
    School_Type='University',
)

# 2019 frame: same year/position tags, plus an integer-typed home zone.
df_2019 = df_2019.assign(
    Source=2019,
    Source_Index=lambda frame: frame.index,
    Home_Zone=lambda frame: frame['Home_Zone'].astype(int),
)

# Take the same six columns from each year so the frames stack cleanly.
temp_2015, temp_2019 = (
    frame[['Source', 'Source_Index', 'School', 'School_Type', 'Mode', 'Home_Zone']]
    for frame in (df_2015, df_2019)
)

# Stack 2015 on top of 2019 under a fresh 0..N-1 index.
combined_df = pd.concat([temp_2015, temp_2019], ignore_index=True)
combined_df

Unnamed: 0,Source,Source_Index,School,School_Type,Mode,Home_Zone
0,2015,0,SC,University,Transit,261
1,2015,1,SG,University,Active,71
2,2015,2,SG,University,Transit,3714
3,2015,3,SG,University,Active,74
4,2015,4,SG,University,Active,71
...,...,...,...,...,...,...
30989,2019,16511,YK,University,,2133
30990,2019,16512,YK,University,Active,398
30991,2019,16513,YK,University,Transit,2763
30992,2019,16514,YG,University,Active,214


In [3]:
# Cast the 2015 flags to bool before stacking them with the 2019 columns.
# NOTE(review): astype(bool) turns missing values (NaN) and any non-empty
# string into True -- confirm the 2015 columns contain only clean 0/1 or
# True/False values.
df_2015['Family'] = df_2015['Family'].astype(bool)
df_2015['Licence'] = df_2015['Licence'].astype(bool)

# Respondent attributes available in both survey years.
temp_2019 = df_2019[['Family', 'Cars', 'Income', 'Level', 'Status', 'Age', 'Licence', 'Work']]
temp_2015 = df_2015[['Family', 'Cars', 'Income', 'Level', 'Status', 'Age', 'Licence', 'Work']]

# Stack in the same order (2015 then 2019) and with the same fresh index as
# combined_df, so the column-wise concat below lines rows up by position.
temp_combined_df = pd.concat((temp_2015, temp_2019), ignore_index=True)

# Remove any copies of these columns left by an earlier run of this cell;
# otherwise re-running it would append a second set of identically named
# columns. On a fresh kernel the drop is a no-op.
combined_df = pd.concat(
    (combined_df.drop(columns=temp_combined_df.columns, errors='ignore'), temp_combined_df),
    axis=1,
)
combined_df.head()

Unnamed: 0,Source,Source_Index,School,School_Type,Mode,Home_Zone,Family,Cars,Income,Level,Status,Age,Licence,Work
0,2015,0,SC,University,Transit,261,True,1.0,Unknown,UG,FT,20.0,False,Unknown
1,2015,1,SG,University,Active,71,False,0.0,High,Grad,FT,25.0,True,Unknown
2,2015,2,SG,University,Transit,3714,True,1.0,Unknown,UG,FT,23.0,True,Unknown
3,2015,3,SG,University,Active,74,False,0.0,Unknown,UG,FT,20.0,True,Unknown
4,2015,4,SG,University,Active,71,False,0.0,Low,Grad,FT,27.0,True,Unknown


In [4]:
# Distance table: `zones` keeps the distinct Origin values in order of first
# appearance, and `dists` is the Data column as a plain Python list.
dists_df = pd.read_csv(filepath_or_buffer='../../../LoS/Walk_Distances.csv')
zones = pd.unique(dists_df['Origin']).tolist()
dists = dists_df['Data'].to_list()

def get_distance(o, d, n_zones=2392, zone_list=None, dist_list=None):
    """Look up the distance between two zones in a flattened distance table.

    The table is read as a flat list in which the value for the zones at
    positions ``i`` (origin) and ``j`` (destination) of the zone list is
    stored at index ``i * n_zones + j``. The stored value is divided by 1000
    before it is returned (presumably metres to kilometres -- confirm
    against the source file).

    Parameters
    ----------
    o, d
        Origin and destination zone identifiers.
    n_zones : int, default 2392
        Number of table entries per origin zone (the row stride).
        NOTE(review): 2392 is expected to equal ``len(zones)`` -- confirm.
    zone_list : list, optional
        Ordered zone identifiers. Defaults to the notebook-level ``zones``.
    dist_list : list, optional
        Flattened distance values. Defaults to the notebook-level ``dists``.

    Returns
    -------
    float or int
        The stored value divided by 1000, or ``-1`` when either zone is not
        present in the zone list.
    """
    # Fall back to the notebook-level tables only when none are supplied.
    zone_list = zones if zone_list is None else zone_list
    dist_list = dists if dist_list is None else dist_list

    # list.index raises ValueError for a missing zone, so one scan per zone
    # both checks membership and finds the position.
    try:
        i = zone_list.index(o)
        j = zone_list.index(d)
    except ValueError:
        return -1

    return dist_list[i * n_zones + j] / 1000

In [5]:
# Campus lookup table; the file's second column (position 1) becomes the index.
campus_info = pd.read_csv(
    filepath_or_buffer='../SMTO_2019/Campus_Info.csv',
    index_col=1,
)
campus_info.head()

Unnamed: 0_level_0,Campus,Zone,Total,Source
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CST,Story Arts Centre,282,1425.0,Data
CAS,Ashtonbee Campus,526,2332.0,Data
CPR,Progress Campus,493,11871.0,Data
CMO,Morningside Campus,564,4035.776162,ASC
CDV,Downsview Campus,419,42.212872,ASC


In [6]:
# Every campus code that appears in the responses, in order of first appearance.
schools = combined_df['School'].unique().tolist()
dist_cols = ['Dist.' + school for school in schools]

# Distance from each respondent's home zone to every campus.
for school in schools:
    # Look the campus zone up once per campus instead of once per row.
    campus_zone = campus_info['Zone'].loc[school]
    combined_df['Dist.' + school] = combined_df['Home_Zone'].apply(lambda x: get_distance(x, campus_zone))

# Campus enrolment, repeated on every row.
for school in schools:
    combined_df['Enrol.' + school] = campus_info['Total'].loc[school]

# get_distance returns -1 when a zone is unknown. Mask those sentinels so that
# idxmin can never pick an unknown distance as the smallest one.
known_dists = combined_df[dist_cols].where(combined_df[dist_cols] >= 0)
# Rows with no usable distance are skipped here and end up with every
# Closest.* flag False (the reindex leaves them as NaN, which equals no code).
has_known = known_dists.notna().any(axis=1)
closest = (
    known_dists[has_known]
    .idxmin(axis=1)
    .map(lambda col: col.split('.')[1])
    .reindex(combined_df.index)
)
for school in schools:
    combined_df['Closest.' + school] = closest == school

# idxmin keeps only the first of several tied columns, so campuses that share
# a zone are given the same flag explicitly.
combined_df['Closest.OTN'] = combined_df['Closest.DOS'] #DOS and OTN same zone
combined_df['Closest.MOI'] = combined_df['Closest.MCM'] #MCM and MOI same zone

combined_df.columns

Index(['Source', 'Source_Index', 'School', 'School_Type', 'Mode', 'Home_Zone',
       'Family', 'Cars', 'Income', 'Level', 'Status', 'Age', 'Licence', 'Work',
       'Dist.SC', 'Dist.SG', 'Dist.MI', 'Dist.OC', 'Dist.RY', 'Dist.YK',
       'Dist.YG', 'Dist.CPR', 'Dist.CMO', 'Dist.CAS', 'Dist.CST', 'Dist.CDV',
       'Dist.CEG', 'Dist.CDS', 'Dist.CPI', 'Dist.DOS', 'Dist.DWH', 'Dist.MCM',
       'Dist.MCB', 'Dist.MOF', 'Dist.MOI', 'Dist.MOS', 'Dist.OTN', 'Dist.OTD',
       'Dist.SHT', 'Dist.SHD', 'Dist.SHH', 'Enrol.SC', 'Enrol.SG', 'Enrol.MI',
       'Enrol.OC', 'Enrol.RY', 'Enrol.YK', 'Enrol.YG', 'Enrol.CPR',
       'Enrol.CMO', 'Enrol.CAS', 'Enrol.CST', 'Enrol.CDV', 'Enrol.CEG',
       'Enrol.CDS', 'Enrol.CPI', 'Enrol.DOS', 'Enrol.DWH', 'Enrol.MCM',
       'Enrol.MCB', 'Enrol.MOF', 'Enrol.MOI', 'Enrol.MOS', 'Enrol.OTN',
       'Enrol.OTD', 'Enrol.SHT', 'Enrol.SHD', 'Enrol.SHH', 'Closest.SC',
       'Closest.SG', 'Closest.MI', 'Closest.OC', 'Closest.RY', 'Closest.YK',
       'Closest.YG'

In [7]:
# Write the combined feature table next to the notebook, without the row index.
combined_df.to_csv(
    path_or_buf='Formatted.csv',
    index=False,
)