## Data Loading and Transformation (Run Once)

In [1]:
import pandas as pd
import math
import csv

hh_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('../../Data/SMTO_2015/Campus_Info.csv')

# Columns taken from the respondent table and from the household table
respondent_columns = ['pscampusattend', 'personstatusgrad', 'personstatustime', 'psdrivinglicenseownerflag', 'psexpfactor']
household_columns = ['HmTTS2006', 'hhlivingsituation', 'hhcarnumber']

# Short column names used by the rest of the notebook
short_names = {
    'HmTTS2006': 'HomeZone',
    'pscampusattend': 'Campus',
    'hhlivingsituation': 'Family',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'hhcarnumber': 'Cars',
    'psdrivinglicenseownerflag': 'Licence',
    'psexpfactor': 'Exp_SMTO',
}

# NOTE(review): join() aligns the two tables on their row index, so this presumably
# relies on both CSVs listing the same records in the same order -- confirm.
df = (
    ps_df[respondent_columns]
    .join(hh_df[household_columns])
    .rename(columns=short_names)
    .dropna()  # Remove rows with missing data
)

In [2]:
# Convert Campus column to numerical column
def _campus_row_label(school):
    """Return the campus_info index label of the first row whose 'School' equals ``school``."""
    matching_labels = campus_info.index[campus_info['School'] == school].tolist()
    return matching_labels[0]  # IndexError when the school is not listed in campus_info

df['Campus'] = df['Campus'].apply(_campus_row_label)

# Convert columns to numerical
mode_name_to_num = {
    "Car - Driver alone": 0,
    "Car - Driver with passenger(s)": 0,
    "Car - Passenger": 0,
    "Taxi": 0,
    "Transit Bus": 1,
    "Streetcar": 1,
    "Subway/RT": 1,
    "GO Bus": 1,
    "GO Train": 1,
    "Walk": 2,
    "Bicycle": 2,
}
# NOTE(review): no 'Mode' column is selected into df in the cells shown here, in which
# case this replace leaves df unchanged -- confirm whether it is still needed.
df = df.replace({'Mode': mode_name_to_num})
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
# 1 when the household lives with family/parents, 0 otherwise
df['Family'] = (df['Family'] == 'Live with family/parents').astype(int)

def miller_segments(x):
    """Map one row to its numeric segment code.

    Reads ``x.Level``, ``x.Status`` and ``x.Family`` and returns:

    * 0 when Level is 'Other';
    * for Level 'UG': 3 when Status is 'PT', otherwise 1 if Family is truthy else 2;
    * for any other Level: 6 when Status is 'PT', otherwise 4 if Family is truthy else 5.
    """
    if x.Level == 'Other':
        return 0
    # First code of the level's group: 1-3 for 'UG', 4-6 for everything else
    base = 1 if x.Level == 'UG' else 4
    if x.Status == 'PT':
        return base + 2
    return base if x.Family else base + 1
# Label every row with the segment code computed by miller_segments
df = df.assign(Segment=df.apply(miller_segments, axis=1))

df

Unnamed: 0,Campus,Level,Status,Licence,Exp_SMTO,HomeZone,Family,Cars,Segment
0,1,UG,FT,0,9.70,261,1,1,1
1,0,Grad,FT,1,5.79,71,0,0,5
2,0,UG,FT,1,9.06,3714,1,1,1
3,0,UG,FT,1,14.67,74,0,0,2
4,0,Grad,FT,1,9.11,71,0,0,5
...,...,...,...,...,...,...,...,...,...
15221,3,UG,FT,1,12.60,212,0,2,2
15222,3,UG,FT,1,12.60,233,1,1,1
15223,3,UG,FT,1,12.60,95,0,0,2
15224,3,UG,FT,0,15.35,2221,1,2,1


In [3]:
# Load LoS Data
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
# Keep the distinct origins in order of first appearance. load_columns() turns a zone's
# position in this list into an offset into the flattened LoS vectors, so the order has
# to be reproducible; a set() gives no ordering guarantee.
origins = df_path['Origin'].unique().tolist()
dists = list(df_path['Data'])
AutoTravelTimes = list(pd.read_csv('../../../LoS/Auto_Travel_Times.csv')['Data'])
TransitTravelTimes = list(pd.read_csv('../../../LoS/Transit_Travel_Times.csv')['Data'])

# Function for distance lookup
def load_columns(o, d, n_zones=2392):
    """Look up auto time, transit time and distance for one origin/destination pair.

    Parameters
    ----------
    o : zone identifier of the origin, searched for in the module-level ``origins`` list.
    d : zone identifier of the destination, searched for in the same list.
    n_zones : int, optional
        Row stride of the flattened ``AutoTravelTimes``, ``TransitTravelTimes`` and
        ``dists`` vectors. Defaults to 2392, the value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(auto_time, transit_time, distance)`` read at offset ``i * n_zones + j``,
        where ``i`` and ``j`` are the positions of ``o`` and ``d`` in ``origins``.
        The distance is the ``dists`` entry divided by 1000 (presumably metres to
        kilometres -- confirm against the LoS files).
        ``(-1, -1, -1)`` when ``o`` is not present in ``origins``.

    Raises
    ------
    ValueError
        If ``d`` is not present in ``origins``.
    """
    try:
        i = origins.index(o)
    except ValueError:
        # Unknown origin: return a sentinel triple instead of raising
        return (-1, -1, -1)
    # An unknown destination is deliberately left to raise ValueError
    j = origins.index(d)
    offset = i * n_zones + j
    return (AutoTravelTimes[offset], TransitTravelTimes[offset],
            dists[offset] / 1000)

# TTS zone and school code of every campus, in campus_info row order
# (zones come from Joven's MOE data)
campus_zones = campus_info['Zone'].tolist()
school_codes = campus_info['Code'].tolist()

In [4]:
# Load times and distances into dataframe
# Reset the index once, up front, so the positionally indexed lookup frames built below
# line up row-for-row with df (the index has gaps after dropna()).
df = df.reset_index(drop=True)

los_frames = []
for i, campus_zone in enumerate(campus_zones):
    suffix = str(i)
    los = pd.DataFrame([load_columns(x, campus_zone) for x in df['HomeZone']],
                       columns=['Time.Auto.' + suffix, 'Time.Transit.' + suffix, 'Dist.' + suffix])
    # Active-mode time is 15 per unit of distance
    # (presumably minutes per km, i.e. 4 km/h -- confirm)
    los['Time.Active.' + suffix] = los['Dist.' + suffix] * 15
    los_frames.append(los)

# Attach every campus's columns in a single concat instead of once per loop iteration
df = pd.concat([df] + los_frames, axis=1, join='inner')

# load_columns returns -1 for every campus when the home zone is unknown, so testing the
# campus-0 column is enough to drop those rows
df = df[df['Time.Auto.0'] != -1]
# NOTE(review): values of 1000 or more are presumably an "unreachable" marker in the LoS data -- confirm
df = df[df['Time.Auto.0'] < 1000]

df.head()

Unnamed: 0,Campus,Level,Status,Licence,Exp_SMTO,HomeZone,Family,Cars,Segment,Time.Auto.0,...,Dist.4,Time.Active.4,Time.Auto.5,Time.Transit.5,Dist.5,Time.Active.5,Time.Auto.6,Time.Transit.6,Dist.6,Time.Active.6
0,1,UG,FT,0,9.7,261,1,1,1,27.32717,...,9.218413,138.276195,24.89705,75.379996,9.580635,143.709525,27.11227,81.104413,11.24173,168.62595
1,0,Grad,FT,1,5.79,71,0,0,5,2.924953,...,11.21115,168.16725,5.748302,31.74921,2.675173,40.127595,5.553657,27.804764,2.723838,40.85757
2,0,UG,FT,1,9.06,3714,1,1,1,50.17188,...,32.5552,488.328,51.37231,147.452825,24.964,374.46,48.72094,146.732427,23.68615,355.29225
3,0,UG,FT,1,14.67,74,0,0,2,1.049121,...,12.83041,192.45615,4.76811,21.273948,2.314008,34.71012,3.258073,19.897948,1.541276,23.11914
4,0,Grad,FT,1,9.11,71,0,0,5,2.924953,...,11.21115,168.16725,5.748302,31.74921,2.675173,40.127595,5.553657,27.804764,2.723838,40.85757


In [5]:
# Load enrollment data, keyed by the 'School' column
enrollment_raw = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv')
enrollment_df = enrollment_raw.set_index('School')

def get_enrollment(level, school):
    """Return the enrollment_df entry for ``school`` (row label) and ``level`` (column).

    ``level`` is one of the enrolment columns: UG, Grad or Total.
    """
    school_row = enrollment_df.loc[school]
    return school_row[level]

In [6]:
def load_exp_factors(x):
    """Return the campus weighting ratio for one row of ``df``.

    The result is (share of df rows whose Campus equals ``x.Campus``) divided by
    (that campus's share of the summed enrolment column for the row's level).
    Rows whose Level is 'Other' use the 'Total' enrolment column.

    NOTE(review): the numerator counts rows of every level at the campus while the
    denominator is level-specific, and the ratio is sample share over enrolment
    share -- confirm both are intended.
    """
    enrolment_column = 'Total' if x.Level == 'Other' else x.Level
    sample_share = df['Campus'].value_counts()[x.Campus] / len(df)
    campus_enrolment = get_enrollment(enrolment_column, school_codes[x.Campus])
    enrolment_share = campus_enrolment / enrollment_df[enrolment_column].sum()
    return sample_share / enrolment_share
# Compute the per-row ratio and show how many rows share each distinct value
df = df.assign(Exp_Level=df.apply(load_exp_factors, axis=1))
df['Exp_Level'].value_counts()

1.364827    3573
0.750083    2466
0.988646    2419
0.603213    2345
0.843011    1020
0.661200     858
1.186308     620
1.234995     403
1.257391     300
1.987047     288
1.064238     169
1.158119      76
2.336416      73
5.779972      56
3.899651      52
0.791212      38
4.177767      18
1.395268      11
0.735777       8
0.958780       7
1.367044       3
Name: Exp_Level, dtype: int64

In [7]:
# Level, Status and HomeZone have already been used to derive Segment, the LoS columns
# and Exp_Level above, so they are dropped from the table
columns_to_drop = ['Level', 'Status', 'HomeZone']
df = df.drop(columns=columns_to_drop)
df

Unnamed: 0,Campus,Licence,Exp_SMTO,Family,Cars,Segment,Time.Auto.0,Time.Transit.0,Dist.0,Time.Active.0,...,Time.Active.4,Time.Auto.5,Time.Transit.5,Dist.5,Time.Active.5,Time.Auto.6,Time.Transit.6,Dist.6,Time.Active.6,Exp_Level
0,1,0,9.70,1,1,1,27.327170,73.276483,10.256060,153.840900,...,138.276195,24.897050,75.379996,9.580635,143.709525,27.112270,81.104413,11.241730,168.625950,0.843011
1,0,1,5.79,0,0,5,2.924953,24.128386,1.132351,16.985265,...,168.167250,5.748302,31.749210,2.675173,40.127595,5.553657,27.804764,2.723838,40.857570,0.603213
2,0,1,9.06,1,1,1,50.171880,155.551170,23.319230,349.788450,...,488.328000,51.372310,147.452825,24.964000,374.460000,48.720940,146.732427,23.686150,355.292250,1.364827
3,0,1,14.67,0,0,2,1.049121,16.675728,0.699414,10.491210,...,192.456150,4.768110,21.273948,2.314008,34.710120,3.258073,19.897948,1.541276,23.119140,1.364827
4,0,1,9.11,0,0,5,2.924953,24.128386,1.132351,16.985265,...,168.167250,5.748302,31.749210,2.675173,40.127595,5.553657,27.804764,2.723838,40.857570,0.603213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14983,3,1,12.60,0,2,2,16.111250,57.865490,7.120260,106.803900,...,85.994340,14.111770,56.139620,6.553092,98.296380,17.578440,67.146402,8.193741,122.906115,0.750083
14984,3,1,12.60,1,1,1,35.121280,94.497599,15.917590,238.763850,...,90.287700,32.658680,94.849368,15.242170,228.632550,34.026930,103.801969,16.903260,253.548900,0.750083
14985,3,1,12.60,0,0,2,6.632627,33.425859,2.783940,41.759100,...,192.444000,10.525700,44.089351,4.733398,71.000970,9.009337,41.253790,3.979057,59.685855,0.750083
14986,3,0,15.35,1,2,1,54.619020,132.043987,23.379880,350.698200,...,230.685600,52.447490,128.691955,23.250580,348.758700,54.426780,135.237541,24.518920,367.783800,0.750083


In [9]:
# Write the assembled table to disk without the row index
output_path = 'Combined_Input.csv'
df.to_csv(output_path, index=False)