In [1]:
import pandas as pd

# Person-level survey fields kept from Respondents.csv.
ps_cols = [
    'pscampusattend',
    'personstatusgrad',
    'personstatustime',
    'psmainmodefalltypical',
    'pscmpgender',
    'psdrivinglicenseownerflag',
    'psworknumhoursperweek',
    'psage',
]
# Household-level survey fields kept from Households.csv.
hh_cols = [
    'HmTTS2006',
    'hhlivingsituation',
    'hhcarnumber',
    'hhnumyoungerthan18',
    'hhnumolderorequalto18',
    'hhincomelevel',
]

ps_df = pd.read_csv('Respondents.csv').loc[:, ps_cols]
hh_df = pd.read_csv('Households.csv').loc[:, hh_cols]

# Sanity check: print the (rows, columns) shape of each extract.
for frame in (ps_df, hh_df):
    print(frame.shape)

(15226, 8)
(15226, 6)


In [2]:
# Short, readable names for the raw survey columns.
rename_cols = {
    # Person-level fields
    'pscampusattend': 'Campus',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'psmainmodefalltypical': 'Mode_Actual',
    'psuniversityinvolvednumyears': 'Years',
    'pscmpgender': 'Gender',
    'psdrivinglicenseownerflag': 'Licence',
    'psworknumhoursperweek': 'Work',
    'psage': 'Age',
    # Household-level fields
    'HmTTS2006': 'HomeZone',
    'hhlivingsituation': 'Family',
    'hhcarnumber': 'Cars',
    'hhincomelevel': 'Income',
    'hhnumyoungerthan18': 'Children',
    'hhnumolderorequalto18': 'Adults',
}

# Put the person and household columns side by side (rows are matched on the
# row index), then apply the new column names.
df = pd.concat([ps_df, hh_df], axis='columns').rename(columns=rename_cols)
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,Cars,Children,Adults,Income
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,,20,261.0,Live with family/parents,1,3,4,Unknown
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,,25,71.0,Live with partner,0,0,2,"$ 90,000 - 119,999"
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,,23,3714.0,Live with family/parents,1,0,4,Unknown
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,,20,74.0,Live with roommates,0,0,4,Unknown
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,,27,71.0,Live with partner,0,0,2,"$ 30,000 - 59,999"


In [3]:
print(df.shape)

# Rows to keep: respondents who actually travel to campus and whose level is
# not 'Other'.
keep_rows = (
    (df['Mode_Actual'] != 'I do not travel to the university (distance learners only)')
    & (df['Level'] != 'Other')
)

df = (
    df[keep_rows]
    # Missing work answers become an explicit 'Unknown' category so they are
    # not removed by dropna() below. assign() builds the column on a new frame,
    # which avoids the SettingWithCopyWarning raised when assigning a column
    # on a filtered frame.
    .assign(Work=lambda d: d['Work'].fillna('Unknown'))
    # Drop every row that still has a missing value in any column
    # (per the original note: missing home zone, campus, or living arrangement).
    .dropna()
)
print(df.shape)

(15226, 14)
(14653, 14)


In [4]:
# Detailed survey modes that make up each aggregate mode.
auto_modes = ("Car - Driver alone", "Car - Driver with passenger(s)", "Car - Passenger", "Taxi")
transit_modes = ("Transit Bus", "Streetcar", "Subway/RT", "GO Bus", "GO Train")
active_modes = ("Walk", "Bicycle")

def aggregate_mode(m):
    """
    Collapse a detailed travel mode into an aggregate mode.

    Returns "Auto", "Transit" or "Active" when `m` appears in the matching
    tuple above, and "Other" for anything else.
    """
    mode_groups = (
        ("Auto", auto_modes),
        ("Transit", transit_modes),
        ("Active", active_modes),
    )
    for label, members in mode_groups:
        if m in members:
            return label
    return "Other"
    
# Aggregate mode for every respondent, followed by the count per category.
df['Mode'] = df['Mode_Actual'].map(aggregate_mode)
df['Mode'].value_counts()

Transit    9335
Active     3810
Auto       1384
Other       124
Name: Mode, dtype: int64

In [5]:
# Survey income brackets grouped into two coarse levels.
income_low = ("Less than $ 30,000", "$ 30,000 - 59,999")
income_high = ("$ 60,000 - 89,999", "$ 90,000 - 119,999", "$ 120,000 - 149,999", "$ 150,000 - 179,999", "$ 180,000 - 209,999", "$ 210,000 - 239,999", "$ 240,000 +")

def income_level(x):
    """
    Classify an income bracket label as "Low" or "High".

    Any value that is in neither bracket tuple is reported as "Unknown".
    """
    if x in income_high:
        return "High"
    return "Low" if x in income_low else "Unknown"

# Replace the raw bracket labels with the coarse level, then count each level.
df['Income'] = df['Income'].map(income_level)
df['Income'].value_counts()

Unknown    8745
Low        3414
High       2494
Name: Income, dtype: int64

In [6]:
# Survey answers that describe full-time work.
# NOTE(review): not referenced by empl_status in the visible code, which
# classifies answers by their prefix instead.
work_ft = ("Yes, I work > 40 hours per week", "Yes, I work 31-40 hours per week")

def empl_status(x):
    """
    Classify a work-hours survey answer.

    Returns:
        "PT"      if the answer starts with "Yes, I work part time"
        "FT"      for any other answer starting with "Yes"
        "NW"      if the answer starts with "No"
        "Unknown" for every other answer, including non-string values
                  such as NaN or None (missing answers)
    """
    # Missing values arrive as float NaN / None and have no .startswith().
    if not isinstance(x, str):
        return "Unknown"
    if x.startswith("Yes, I work part time"):
        return "PT"
    elif x.startswith("Yes"):
        return "FT"
    elif x.startswith("No"):
        return "NW"
    else:
        return "Unknown"

# Preview the employment classification counts.
# NOTE(review): the result is only displayed, not assigned back to df['Work'] —
# confirm that keeping the raw answers in the column is intended.
df["Work"].map(empl_status).value_counts()

Unknown    11271
PT          1582
NW          1563
FT           237
Name: Work, dtype: int64

In [7]:
# Turn the living-situation answer into a 0/1 flag: 1 = lives with family/parents.
df['Family'] = df['Family'].eq("Live with family/parents").astype(int)
df['Family'].value_counts()

1    8327
0    6326
Name: Family, dtype: int64

In [8]:
def row_to_segment(x):
    """
    Assign a respondent row to a numbered segment.

    Reads `x.Level` ('UG' / 'Grad'), `x.Status` ('FT' / 'PT') and `x.Family`
    (expected to be a 0/1 or boolean flag):

        1: UG,   FT, Family set        4: Grad, FT, Family set
        2: UG,   FT, Family not set    5: Grad, FT, Family not set
        3: UG,   PT                    6: Grad, PT

    Returns 0 when the row matches none of these combinations.
    """
    # Undergraduate segments are 1-3, graduate segments are 4-6.
    if x.Level == 'UG':
        base = 0
    elif x.Level == 'Grad':
        base = 3
    else:
        return 0

    if x.Status == 'PT':
        return base + 3
    if x.Status == 'FT':
        return base + 1 if x.Family else base + 2
    return 0

# Evaluate row_to_segment once per row (each row is passed in as a Series).
df['Segment'] = df.apply(row_to_segment, axis='columns')
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,Cars,Children,Adults,Income,Mode,Segment
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261.0,1,1,3,4,Unknown,Transit,1
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71.0,0,0,0,2,High,Active,5
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714.0,1,1,0,4,Unknown,Transit,1
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74.0,0,0,0,4,Unknown,Active,2
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71.0,0,0,0,2,Low,Active,5


In [9]:
# Zone table, indexed by its first column; zone_to_PD reads its 'PD' column.
PD_df = pd.read_csv('../Zones.csv', index_col=0)

def zone_to_PD(x):
    """
    Look up the 'PD' value stored in PD_df for zone `x`.

    Returns the matching entry of PD_df['PD'], or -1 when `x` is not a label
    of the zone table.
    """
    try:
        return PD_df['PD'][x]
    except (LookupError, TypeError):
        # Only lookup failures mean "unknown zone". A bare `except:` would
        # also hide genuine errors (e.g. PD_df not loaded) and swallow
        # KeyboardInterrupt.
        return -1

# PD of each respondent's home zone (-1 where the zone is not in the zone table).
df['PD'] = df['HomeZone'].apply(zone_to_PD)
df['PD'].value_counts().head()

1     3095
36    1427
2      962
35     720
31     644
Name: PD, dtype: int64

In [10]:
# Per-campus attributes, indexed by the first column of the file (campus name).
campus_info = pd.read_csv('Campus_Info.csv', index_col=0)

# Parallel lists in campus_info row order: zone number and short code of each campus.
campus_zones = campus_info['Zone'].tolist()
school_codes = campus_info['Code'].tolist()
campus_info

Unnamed: 0_level_0,Code,Zone,Tuition,Domestic_UG,Domestic_Grad,Admission,Total,UG,Grad
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Downtown Toronto (St. George),SG,69,7519,0.808,0.8369,0.893,53930,39293,14637
Scarborough (UTSC),SC,566,7813,0.8378,0.8406,0.841,11770,11494,276
Mississauga (UTM),MI,3631,7670,0.8282,0.6981,0.83,13298,12706,592
Keele,YK,391,7339,0.8921,0.8077,0.817,41142,37263,3879
Glendon,YG,225,7339,0.8921,0.8077,0.817,2457,2341,116
RyersonU,RY,38,7026,0.9673,0.8837,0.84,28159,26027,2132
OCADu,OC,67,7052,0.8998,0.6786,0.824,3491,3318,173


In [11]:
# Downcast home zones to the smallest signed integer dtype that can hold them.
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
# School codes: translate each campus name through campus_info's 'Code' column.
df['School'] = df['Campus'].apply(lambda campus: campus_info.loc[campus, 'Code'])
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,Cars,Children,Adults,Income,Mode,Segment,PD,School
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,1,3,4,Unknown,Transit,1,6,SC
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,0,0,2,High,Active,5,1,SG
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,1,0,4,Unknown,Transit,1,36,SG
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,0,0,4,Unknown,Active,2,1,SG
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,0,0,2,Low,Active,5,1,SG


In [12]:
# Level-of-service (LoS) tables. Each file's 'Data' column is flattened into a
# plain list that get_distance indexes by position.

# Walk distances; the distinct 'Origin' values (in order of first appearance)
# define the zone ordering.
dists_df = pd.read_csv('../../../LoS/Walk_Distances.csv')
zones = pd.unique(dists_df['Origin']).tolist()
WalkingDistances = dists_df['Data'].tolist()

# Auto travel times
df_att = pd.read_csv('../../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = df_att['Data'].tolist()

# Transit travel times
df_ttt = pd.read_csv('../../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = df_ttt['Data'].tolist()

# Function for distance lookup
def get_distance(o, d, mode, n_zones=2392):
    """
    Get distance (in m) or travel time (in min) between
    origin zone o and destination zone d
    from EMME LoS matrices.

    mode: 0 for network distance, 1 for auto travel time, 2 for transit travel time
    n_zones: row length of the flattened origin-destination lists, i.e. the
        entry for (o, d) sits at position index(o) * n_zones + index(d).
        Defaults to 2392, the value previously hard-coded here
        (presumably the number of zones in the LoS files — TODO confirm it
        equals len(zones)).

    Returns -1 if either zone is not present in `zones`.
    """
    lookup_list = (WalkingDistances, AutoTravelTimes, TransitTravelTimes)[mode]
    try:
        i = zones.index(o)
        j = zones.index(d)
    except ValueError:
        # list.index raises ValueError for a zone outside the LoS tables.
        return -1
    return lookup_list[i * n_zones + j]

# Load distances into dataframe: one column per (measure, campus), measured
# from each respondent's home zone to the campus zone.
# Each entry is (column prefix, get_distance mode, convert m -> km?).
los_measures = (
    ("Dist.", 0, True),
    ("AIVTT.", 1, False),
    ("TPTT.", 2, False),
)
for prefix, mode, to_km in los_measures:
    for code, campus_zone in zip(school_codes, campus_zones):
        if to_km:
            lookup = lambda x: get_distance(x, campus_zone, mode) / 1000
        else:
            lookup = lambda x: get_distance(x, campus_zone, mode)
        df[prefix + code] = df['HomeZone'].apply(lookup)

# Zones outside LoS limits: get_distance returned -1, which became -0.001
# after the division by 1000 above.
outside_los = df['Dist.SG'] == -0.001
df = df[~outside_los]
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,AIVTT.YG,AIVTT.RY,AIVTT.OC,TPTT.SG,TPTT.SC,TPTT.MI,TPTT.YK,TPTT.YG,TPTT.RY,TPTT.OC
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,15.3011,24.89705,27.11227,73.276483,75.468478,166.103953,197.858689,64.076936,75.379996,81.104413
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,19.49373,5.748302,5.553657,24.128386,78.020223,113.869667,144.674281,66.918003,31.74921,27.804764
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,66.09152,68.81667,66.16529,124.64318,157.56192,74.324184,118.92711,164.67029,113.95266,115.35252
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,22.30349,4.76811,3.258073,16.675728,88.246135,120.463453,152.561525,77.003957,21.273948,19.897948
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,19.49373,5.748302,5.553657,24.128386,78.020223,113.869667,144.674281,66.918003,31.74921,27.804764


In [13]:
# Load enrollment data
def get_stat(col, school):
    """Return the value in column `col` of campus_info for the row labelled `school`."""
    school_row = campus_info.loc[school]
    return school_row[col]

schoolnames = campus_info.index.tolist()

# Copy every campus-level statistic onto each row as a constant column named
# "<stat>.<school code>".
campus_stats = ('Total', 'UG', 'Grad', 'Tuition', 'Domestic_UG', 'Domestic_Grad', 'Admission')
for stat in campus_stats:
    for code, school in zip(school_codes, schoolnames):
        df[stat + "." + code] = get_stat(stat, school)

df.columns

Index(['Campus', 'Level', 'Status', 'Mode_Actual', 'Gender', 'Licence', 'Work',
       'Age', 'HomeZone', 'Family', 'Cars', 'Children', 'Adults', 'Income',
       'Mode', 'Segment', 'PD', 'School', 'Dist.SG', 'Dist.SC', 'Dist.MI',
       'Dist.YK', 'Dist.YG', 'Dist.RY', 'Dist.OC', 'AIVTT.SG', 'AIVTT.SC',
       'AIVTT.MI', 'AIVTT.YK', 'AIVTT.YG', 'AIVTT.RY', 'AIVTT.OC', 'TPTT.SG',
       'TPTT.SC', 'TPTT.MI', 'TPTT.YK', 'TPTT.YG', 'TPTT.RY', 'TPTT.OC',
       'Total.SG', 'Total.SC', 'Total.MI', 'Total.YK', 'Total.YG', 'Total.RY',
       'Total.OC', 'UG.SG', 'UG.SC', 'UG.MI', 'UG.YK', 'UG.YG', 'UG.RY',
       'UG.OC', 'Grad.SG', 'Grad.SC', 'Grad.MI', 'Grad.YK', 'Grad.YG',
       'Grad.RY', 'Grad.OC', 'Tuition.SG', 'Tuition.SC', 'Tuition.MI',
       'Tuition.YK', 'Tuition.YG', 'Tuition.RY', 'Tuition.OC',
       'Domestic_UG.SG', 'Domestic_UG.SC', 'Domestic_UG.MI', 'Domestic_UG.YK',
       'Domestic_UG.YG', 'Domestic_UG.RY', 'Domestic_UG.OC',
       'Domestic_Grad.SG', 'Domestic_Grad.SC',

In [14]:
# For each respondent, find the 'Dist.<code>' column holding the smallest
# distance and keep its two-letter school code.
dist_cols = ['Dist.' + code for code in school_codes]
closest_code = df[dist_cols].idxmin(axis=1).str[-2:]

# One indicator column per campus, named 'Closest.<code>', appended to df.
temp = pd.get_dummies('Closest.' + closest_code)
df = pd.concat((df, temp), axis=1)

In [15]:
# Travel times for mode choice
def wtt(x):
    """
    Active-mode time for row `x`: the 'Dist.<School>' value of the row's own
    school multiplied by 15 (presumably minutes per km — TODO confirm).
    """
    own_dist_col = 'Dist.' + x.School
    return x[own_dist_col] * 15
def aivtt(x):
    """Auto time for row `x`: the 'AIVTT.<School>' value of the row's own school."""
    own_auto_col = 'AIVTT.' + x.School
    return x[own_auto_col]
def tptt(x):
    """Transit time for row `x`: the 'TPTT.<School>' value of the row's own school."""
    own_transit_col = 'TPTT.' + x.School
    return x[own_transit_col]

# One travel-time column per mode, each computed row by row for the
# respondent's own school.
time_columns = (
    ('Time.Active', wtt),
    ('Time.Auto', aivtt),
    ('Time.Transit', tptt),
)
for column, time_fn in time_columns:
    df[column] = df.apply(time_fn, axis=1)

In [16]:
# Expansion factor per school code:
#   (campus share of total 'UG' enrolment) / (campus share of the rows in df).
# Each factor is stored wrapped in a one-element list.
ug_total = campus_info['UG'].sum()
sample_share = df['Campus'].value_counts(normalize=True)

exp_factors = {
    code: [campus_info.loc[school]['UG'] / ug_total / sample_share[school]]
    for code, school in zip(school_codes, schoolnames)
}
exp_factors

{'SG': [0.7265468174707322],
 'SC': [1.1699027271049984],
 'MI': [1.4935120961043935],
 'YK': [1.320828067342245],
 'YG': [0.812407084215121],
 'RY': [1.050650554272892],
 'OC': [0.7971642739405231]}

In [17]:
def get_exp_factor(code):
    """Return the entry stored in exp_factors for school code `code`."""
    entry = exp_factors[code]
    return entry

# exp_factors stores each factor wrapped in a one-element list. Unwrap it so
# the column holds plain floats; a column of lists would be written to the CSV
# as text such as "[0.73]".
df['Exp_Factor'] = df['School'].apply(lambda code: get_exp_factor(code)[0])

In [18]:
# Save the formatted dataset; the row index is not written.
df.to_csv('Formatted.csv', index=False)