In [1]:
import pandas as pd

In [2]:
hh_cols = ['hhlivingsituation', 'hhnumdependentchildren', 'hhvehiclenumber', 'familyincomelevel', 'HmTTS2006']
hh_rename = ['Liv_Arr', 'Children', 'Cars', 'Income', 'Home_Zone']
df = pd.read_csv('Households.csv', low_memory=False)[hh_cols]
df = df.rename(columns = dict(zip(hh_cols, hh_rename)))
df.head()

Unnamed: 0,Liv_Arr,Children,Cars,Income,Home_Zone
0,Live with family/parents,0.0,0.0,,3851.0
1,Live with family/parents,0.0,1.0,,181.0
2,Live with family/parents,0.0,2.0,I don't know,1039.0
3,,,,,191.0
4,Live with family/parents,1.0,1.0,,600.0


In [3]:
ps_cols = ['psinstitution', 'personstatusgrad ', 'personstatustime', 'pscollegeaffiliation', 'pscampusmain',
          'psworknumhoursperweek', 'psdrivinglicenseowner', 'psmainmodefalltypical', 'psage']
ps_rename = ['School', 'Uni_Level', 'Uni_Status', 'Col_Status', 'Campus', 'Work', 'Licence', 'Mode', 'Age']
df = df.join(pd.read_csv('Respondents.csv', low_memory=False)[ps_cols])
df = df.rename(columns = dict(zip(ps_cols, ps_rename)))
df.head()

Unnamed: 0,Liv_Arr,Children,Cars,Income,Home_Zone,School,Uni_Level,Uni_Status,Col_Status,Campus,Work,Licence,Mode,Age
0,Live with family/parents,0.0,0.0,,3851.0,Centennial College,,,Full-Time,Progress Campus,I don't work,No,GO Bus,26.0
1,Live with family/parents,0.0,1.0,,181.0,Centennial College,,,Part-Time,Morningside Campus,I don't work,No,Transit Bus,25.0
2,Live with family/parents,0.0,2.0,I don't know,1039.0,Centennial College,,,Full-Time,Progress Campus,I don't work,No,GO Bus,19.0
3,,,,,191.0,Centennial College,,,Full-Time,Progress Campus,I don't work,No,,25.0
4,Live with family/parents,1.0,1.0,,600.0,Centennial College,,,Full-Time,Progress Campus,I don't work,No,,97.0


In [4]:
df['Campus'] = df.apply(lambda x: x.School if x.School in ('Ryerson University', 'OCAD University') else x.Campus, axis=1)
df['School_Type'] = df['School'].apply(lambda x: 'College' if 'College' in x else 'University')
df['Col_Status'].replace({'Full-Time': 'FT', 'Part-Time ':'PT', 'Continuing Education': 'Other'}, inplace=True)
df['Work'].replace({"I don't work": 'NW', 'I work 11-20 hours per week': 'PT', 'I work < 11 hours per week': 'PT', 'I work 21-30 hours per week': 'PT', 'I work 31-40 hours per week': 'FT', 'I work > 40 hours per week': 'FT'}, inplace=True)
df['Licence'] = df['Licence'].apply(lambda x: True if x=="Yes" else (False if x=="No" else None))
df['Family'] = df['Liv_Arr'].apply(lambda x: (x == 'Live with family/parents') if isinstance(x, str) else None)

In [5]:
def row_to_level(x):
    return x.Uni_Level if isinstance(x.Uni_Level, str) else ("College" if x.School_Type == 'College' else None)
def row_to_status(x):
    return x.Uni_Status if isinstance(x.Uni_Status, str) else (x.Col_Status if isinstance(x.Col_Status, str) else None)

df['Level'] = df.apply(row_to_level, axis=1)
df['Status'] = df.apply(row_to_status, axis=1)
df.drop(columns=['Uni_Level', 'Uni_Status', 'Col_Status'], inplace=True)

In [6]:
active = ['Walk', 'Bicycle', 'Bikeshare']
auto = ['Drive alone', 'Auto passenger (driver is a household member)', 'Drive with passenger(s) (household members only)',
        'Accessibility adapted vehicle as driver', 'Drive with passenger(s) (including non-household members)', 'Accessibility adapted vehicle as passenger',
       'Ride-hailing alone (UberX, Lyft etc.)', 'Auto passenger (driver is a non-household member)', 'Taxi', 'Ride-hailing with other passengers (Uberpool, Lyftpool etc.)']
transit = ['Transit Bus', 'Subway/RT', 'GO Train', 'GO Bus', 'Paratransit service (e.g., bus transport for disabled persons)', 'Streetcar']
def mode_to_aggr(mode):
    if mode in active:
        return 'Active'
    elif mode in auto:
        return 'Auto'
    elif mode in transit:
        return 'Transit'
    elif mode == 'I do not travel to the university (distance learners only)':
        return 'Do not travel'
    elif isinstance(mode, str):
        return 'Other'
    else:
        return None
    
df['Mode'] = df['Mode'].apply(mode_to_aggr)

In [7]:
df = df[(df['Status'] == 'FT') | (df['Status'] == 'PT')]
df = df[df['Campus'].notnull()]
df = df[df['Mode'] != 'Do not travel']
df = df[df['Home_Zone'].notnull()]

In [8]:
income_low = ['Less than $ 14,999', '$ 15,000 - 29,999', '$ 30,000 - 39,999', '$ 40,000 - 49,999',
              '$ 50,000 - 59,999', '$ 60,000 - 69,999', '$ 70,000 - 79,999', '$ 80,000 - 89,999']
income_high = ['$ 90,000 - 99,999', '$ 100,000 - 124,999', '$ 125,000 - 149,999', '$ 150,000 - 199,999', '$ 200,000 +']

def income_to_range(inc):
    if inc in income_low:
        return 'Low'
    elif inc in income_high:
        return 'High'
    else:
        return 'Unknown'
df['Income'] = df['Income'].apply(income_to_range)

In [9]:
# Load enrollment data
campus_info = pd.read_csv('Campus_Info.csv', index_col=0)
df['School'] = df['Campus'].apply(lambda x: campus_info['Code'].loc[x] if isinstance(x, str) else -1)
df['Campus_Zone'] = df['Campus'].apply(lambda x: campus_info['Zone'].loc[x] if isinstance(x, str) else -1)
campus_info = campus_info.set_index('Code')
zones, codes = list(campus_info['Zone']), list(campus_info.index)

In [10]:
def get_enrol(code):
    return campus_info['Total'].loc[code]

for code in codes:
    df['Enrol.' + code] = get_enrol(code)

In [11]:
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

In [12]:
def get_distance(o, d):
    if o in origins and d in origins:
        i = origins.index(o)
        j = origins.index(d)
        return dists[i*2392 + j] / 1000
    else:
        return -1    

In [13]:
for i in range(len(codes)):
    df['Dist.' + codes[i]] = df['Home_Zone'].apply(lambda x: get_distance(x, zones[i]))
df = df[df['Dist.CST'] != -1]

In [14]:
df['Closest'] = df[['Dist.' + code for code in codes]].idxmin(axis=1).apply(lambda x: x.split('.')[1])
for code in codes:
    df['Closest.' + code] = (df['Closest'] == code).astype(int)
df['Closest.OTN'] = df['Closest.DOS'] #DOS and OTN same zone
df['Closest.MOI'] = df['Closest.MCM'] #MCM and MOI same zone

df[['Closest.' + code for code in codes]].sum()

Closest.CST     334
Closest.CAS     698
Closest.CPR    1068
Closest.CMO     292
Closest.CDV     458
Closest.CEG     652
Closest.CPI     449
Closest.CDS     296
Closest.DOS     866
Closest.DWH     375
Closest.MCM     199
Closest.MCB     405
Closest.MOF     997
Closest.MOS     343
Closest.MOI     199
Closest.OTD      89
Closest.OTN     866
Closest.SHD    1054
Closest.SHH     474
Closest.SHT     328
Closest.MI     1347
Closest.SC      387
Closest.SG     1283
Closest.YK     1879
Closest.YG      401
Closest.RY      831
Closest.OC     1011
dtype: int64

In [15]:
df.columns

Index(['Liv_Arr', 'Children', 'Cars', 'Income', 'Home_Zone', 'School',
       'Campus', 'Work', 'Licence', 'Mode', 'Age', 'School_Type', 'Family',
       'Level', 'Status', 'Campus_Zone', 'Enrol.CST', 'Enrol.CAS', 'Enrol.CPR',
       'Enrol.CMO', 'Enrol.CDV', 'Enrol.CEG', 'Enrol.CPI', 'Enrol.CDS',
       'Enrol.DOS', 'Enrol.DWH', 'Enrol.MCM', 'Enrol.MCB', 'Enrol.MOF',
       'Enrol.MOS', 'Enrol.MOI', 'Enrol.OTD', 'Enrol.OTN', 'Enrol.SHD',
       'Enrol.SHH', 'Enrol.SHT', 'Enrol.MI', 'Enrol.SC', 'Enrol.SG',
       'Enrol.YK', 'Enrol.YG', 'Enrol.RY', 'Enrol.OC', 'Dist.CST', 'Dist.CAS',
       'Dist.CPR', 'Dist.CMO', 'Dist.CDV', 'Dist.CEG', 'Dist.CPI', 'Dist.CDS',
       'Dist.DOS', 'Dist.DWH', 'Dist.MCM', 'Dist.MCB', 'Dist.MOF', 'Dist.MOS',
       'Dist.MOI', 'Dist.OTD', 'Dist.OTN', 'Dist.SHD', 'Dist.SHH', 'Dist.SHT',
       'Dist.MI', 'Dist.SC', 'Dist.SG', 'Dist.YK', 'Dist.YG', 'Dist.RY',
       'Dist.OC', 'Closest', 'Closest.CST', 'Closest.CAS', 'Closest.CPR',
       'Closest.CMO', 'Cl

In [16]:
df.to_csv('Formatted.csv', index = False)