In [1]:
import pandas as pd
import math
import csv

# Raw survey tables (households, respondents) and the campus lookup table.
hh_df = pd.read_csv('SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('SMTO_2015/Campus_Info.csv')

# Parallel lists, one entry per campus: its TTS zone (from Joven's MOE data)
# and its short school code.
campus_zones = campus_info['Zone'].tolist()
school_codes = campus_info['Code'].tolist()

# University information from OUAC, keyed by university/campus name.
uni_info = pd.read_csv('Uni_Info.csv').set_index('University')

# Load relevant columns: respondent attributes first, household attributes after.
respondent_cols = ['pscampusattend', 'personstatusgrad', 'personstatustime', 'psmainmodefalltypical',
                   'pscmpgender', 'psdrivinglicenseownerflag', 'psworknumhoursperweek', 'psage', 'psexpfactor']
household_cols = ['HmTTS2006', 'hhlivingsituation', 'hhcarnumber', 'hhnumyoungerthan18',
                  'hhnumolderorequalto18', 'hhincomelevel']

# Survey column name -> short name used in the rest of the notebook.
# 'psuniversityinvolvednumyears' is not among the selected columns, so its
# entry currently renames nothing.
short_names = {
    'HmTTS2006': 'HomeZone',
    'pscampusattend': 'Campus',
    'hhlivingsituation': 'Family',
    'psmainmodefalltypical': 'Mode_Actual',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'psuniversityinvolvednumyears': 'Years',
    'hhcarnumber': 'Cars',
    'hhincomelevel': 'Income',
    'pscmpgender': 'Gender',
    'psdrivinglicenseownerflag': 'Licence',
    'psworknumhoursperweek': 'Work',
    'hhnumyoungerthan18': 'Children',
    'hhnumolderorequalto18': 'Adults',
    'psage': 'Age',
    'psexpfactor': 'Exp_SMTO',
}

# NOTE(review): join() aligns on the row index, so this presumes the two CSVs
# list their records in the same order - confirm.
df = ps_df[respondent_cols].join(hh_df[household_cols]).rename(columns=short_names)


# Reported survey modes, grouped under the aggregate mode they are recoded to.
mode_groups = (
    ("Auto", ("Car - Driver alone", "Car - Driver with passenger(s)", "Car - Passenger", "Taxi")),
    ("Transit", ("Transit Bus", "Streetcar", "Subway/RT", "GO Bus", "GO Train")),
    ("Active", ("Walk", "Bicycle")),
    ("Other", ("Intercampus Shuttle", "Other", "Motorcycle, moped or scooter", "Paratransit")),
)
mode_name_to_num = {name: group for group, names in mode_groups for name in names}
print(df.shape)

# Make Aggregate Mode Column, then keep only rows that landed in one of the
# four aggregate modes (unmapped or missing answers are dropped).
df['Mode'] = df['Mode_Actual'].replace(mode_name_to_num)
df = df[df['Mode'].isin([group for group, _ in mode_groups])]
print(df.shape)

# Missing work-hours answers (about 11,000 rows) become an explicit "Unknown"
# category so that the dropna() that follows does not discard them. The column
# is reassigned instead of calling fillna(inplace=True) on df['Work']: a chained
# in-place fill is not guaranteed to write back into df.
df = df.assign(Work=df['Work'].fillna('Unknown')).dropna()  # then remove rows with any other missing data

# Convert Campus column to numerical column (zone of the first campus_info row
# whose School matches the campus name)
df['CampusZone'] = df["Campus"].apply(lambda x: campus_info.Zone[campus_info['School'] == x].tolist()[0])

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')

# Add column with school codes (uni_info is indexed by campus name)
df['School_Codes'] = df['Campus'].apply(lambda x: uni_info['Code'][x])

df.head()

(15226, 15)
(15197, 16)


Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,Family,Cars,Children,Adults,Income,Mode,CampusZone,School_Codes
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,Live with family/parents,1,3,4,Unknown,Transit,566,SC
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,Live with partner,0,0,2,"$ 90,000 - 119,999",Active,69,SG
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,Live with family/parents,1,0,4,Unknown,Transit,69,SG
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,Live with roommates,0,0,4,Unknown,Active,69,SG
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,Live with partner,0,0,2,"$ 30,000 - 59,999",Active,69,SG


In [2]:
# Number of respondents per reported mode (Mode_Actual) in the cleaned table.
df['Mode_Actual'].value_counts()

Subway/RT                         4101
Transit Bus                       3059
Walk                              2856
GO Train                          1162
Bicycle                           1032
Car - Driver alone                 896
GO Bus                             817
Streetcar                          394
Car - Passenger                    316
Car - Driver with passenger(s)     193
Intercampus Shuttle                 74
Other                               33
Motorcycle, moped or scooter        16
Taxi                                13
Paratransit                          3
Name: Mode_Actual, dtype: int64

In [3]:
# Column Transformation functions

def inc_transform(x):
    """Collapse a household income bracket label into "Low", "High" or "Unknown".

    Brackets below $90,000 map to "Low", brackets from $90,000 upward map to
    "High", and any other value (including missing answers) maps to "Unknown".
    """
    low_brackets = (
        "Less than $ 30,000",
        "$ 30,000 - 59,999",
        "$ 60,000 - 89,999",
    )
    high_brackets = (
        "$ 90,000 - 119,999",
        "$ 120,000 - 149,999",
        "$ 150,000 - 179,999",
        "$ 180,000 - 209,999",
        "$ 210,000 - 239,999",
        "$ 240,000 +",
    )
    if x in low_brackets:
        return "Low"
    if x in high_brackets:
        return "High"
    return "Unknown"
   
    
def work_transform(x):
    """Collapse a weekly work-hours answer into "FT", "PT", "NW" or "Unknown".

    Answers of 31 hours or more map to "FT", the three part-time answers map
    to "PT", "No, I don't work" maps to "NW", and anything else maps to
    "Unknown".
    """
    categories = (
        ("FT", ("Yes, I work > 40 hours per week",
                "Yes, I work 31-40 hours per week")),
        ("PT", ("Yes, I work part time (21-30 hours per week)",
                "Yes, I work part time (11-20 hours per week)",
                "Yes, I work part time (<10 hours per week)")),
        ("NW", ("No, I don't work",)),
    )
    for code, answers in categories:
        if x in answers:
            return code
    return "Unknown"
    
def fam_transform(x):
    """Return 1 if the living situation is "Live with family/parents", else 0."""
    return 1 if x == "Live with family/parents" else 0


# Disabled car-ownership recode, kept for reference. It is written as real
# comments rather than a bare triple-quoted string, because a string literal at
# the end of a cell is evaluated and echoed as the cell's output.
#
# def car_transform(x):
#     if x == 0:
#         return 0
#     elif x == 1:
#         return 1
#     else:
#         return 2

'\ndef car_transform(x):\n    if x == 0:\n        return 0\n    elif x == 1: \n        return 1\n    else:\n        return 2\n'

In [4]:
# Recode the raw survey answers in place, one column per transform function.
for column, recode in (('Work', work_transform), ('Family', fam_transform)):
    df[column] = df[column].apply(recode)
#df['Cars'] = df["Cars"].apply(lambda x: car_transform(x))
df['Income'] = df['Income'].apply(inc_transform)
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,Family,Cars,Children,Adults,Income,Mode,CampusZone,School_Codes
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,1,1,3,4,Unknown,Transit,566,SC
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,0,0,0,2,High,Active,69,SG
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,1,1,0,4,Unknown,Transit,69,SG
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,0,0,0,4,Unknown,Active,69,SG
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,0,0,0,2,Low,Active,69,SG


In [5]:
def row_to_segment(x):
    """Return the market segment (0-6) for one respondent row.

    Reads ``x.Level``, ``x.Status`` and ``x.Family``:

        1  UG,   FT, lives with family
        2  UG,   FT, does not live with family
        3  UG,   PT
        4  Grad, FT, lives with family
        5  Grad, FT, does not live with family
        6  Grad, PT
        0  anything else (other level or other status)

    ``Family`` is interpreted by truthiness, so any non-zero flag counts as
    living with family.
    """
    if x.Level == 'UG':
        base = 0
    elif x.Level == 'Grad':
        base = 3  # graduate segments are the undergraduate ones shifted by 3
    else:
        return 0

    if x.Status == 'PT':
        return base + 3
    if x.Status == 'FT':
        return base + (1 if x.Family else 2)
    return 0

# Label every respondent with a segment number (0-6) computed row by row by
# row_to_segment.
df['Segment'] = df.apply(row_to_segment, axis=1)

df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,Family,Cars,Children,Adults,Income,Mode,CampusZone,School_Codes,Segment
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,1,1,3,4,Unknown,Transit,566,SC,1
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,0,0,0,2,High,Active,69,SG,5
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,1,1,0,4,Unknown,Transit,69,SG,1
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,0,0,0,4,Unknown,Active,69,SG,2
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,0,0,0,2,Low,Active,69,SG,5


In [6]:
# Dataframe with walk distances
df_path = pd.read_csv('../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

# Function for distance lookup
not_found = set()
def find_distance(origin, destination, info_num):
    # mode_num: 0 - to find Walking Distance
    #           1 - to find Walking Time
    #           2 - to find Auto Time  
    #           3 - to find Transit Time
    
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return -1
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return -1
    
    if info_num == 0:
        return dists[i*2392 + j] / 1000
    elif info_num == 1:
        return dists[i*2392 + j] * 15 / 1000 
    elif info_num == 2:
        return AutoTravelTimes[i*2392 + j]
    elif info_num == 3:
        return TransitTravelTimes[i*2392 + j]
    else:
        print("ERROR: Enter correct info_num!")
        return False

# Load distances into dataframe
# One column per (quantity, campus), created quantity by quantity. The position
# of each prefix in the tuple is the info_num passed to find_distance.
for info_num, prefix in enumerate(("Dist.", "WTT.", "AIVTT.", "TPTT.")):
    for zone, code in zip(campus_zones, school_codes):
        df[prefix + code] = df['HomeZone'].apply(lambda home: find_distance(home, zone, info_num))

print("# of zones not found:", len(not_found))

df.head()

# of zones not found: 127


Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,...,AIVTT.YG,AIVTT.RY,AIVTT.OC,TPTT.SG,TPTT.SC,TPTT.MI,TPTT.YK,TPTT.YG,TPTT.RY,TPTT.OC
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,...,15.3011,24.89705,27.11227,73.276483,75.468478,129.369386,197.858689,64.076936,75.379996,81.104413
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,...,19.49373,5.748302,5.553657,24.128386,78.020223,77.13505,144.674281,66.918003,31.74921,27.804764
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,...,54.99243,51.37231,48.72094,155.55117,124.359872,42.439563,85.084262,186.874439,147.452825,146.732427
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,...,22.30349,4.76811,3.258073,16.675728,88.246135,83.728866,152.561525,77.003957,21.273948,19.897948
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,...,19.49373,5.748302,5.553657,24.128386,78.020223,77.13505,144.674281,66.918003,31.74921,27.804764


In [7]:
# Load enrollment data
enrollment_df = pd.read_csv('Enrolment/Joven_Enrollment.csv').set_index('School')


def get_log_enrollment(level, school):
    """Return log(1 + enrolment) for a school code and a level ('UG', 'Grad' or 'Total')."""
    headcount = enrollment_df.loc[school][level]
    return math.log1p(headcount)


# The same value is broadcast to every row: one constant column per
# (level, campus) pair.
for level in ('Total', 'UG', 'Grad'):
    for code in school_codes:
        df[level + "." + code] = get_log_enrollment(level, code)

In [8]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,...,UG.YG,UG.RY,UG.OC,Grad.SG,Grad.SC,Grad.MI,Grad.YK,Grad.YG,Grad.RY,Grad.OC
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055


In [9]:
# One column per campus tuition
# Look tuition up by campus code rather than by row position: an integer key on
# the name-indexed uni_info relies on a deprecated positional fallback and on
# Uni_Info.csv and Campus_Info.csv sharing the same row order.
tuition_by_code = uni_info.set_index('Code')['Tuition']
for code in school_codes:
    df["Tuition." + code] = tuition_by_code[code]

# One single tuition col
#df['Tuition'] = df['Campus'].apply(lambda x: uni_info.loc[x]['Tuition'])

In [10]:
# Domestic Percentage Column
# Look the shares up by campus code rather than by row position: an integer key
# on the name-indexed uni_info relies on a deprecated positional fallback and on
# Uni_Info.csv and Campus_Info.csv sharing the same row order.
domestic_by_code = uni_info.set_index('Code')[['Domestic%_UG', 'Domestic%_Grad']]
for code in school_codes:
    ug_share = domestic_by_code.loc[code, 'Domestic%_UG']
    grad_share = domestic_by_code.loc[code, 'Domestic%_Grad']
    # Graduate respondents get the graduate share; every other level gets the UG share.
    df["Domestic." + code] = df['Level'].apply(lambda lvl: grad_share if lvl == 'Grad' else ug_share)

In [11]:
# One constant column per campus holding uni_info's 'Admission' value. Looked up
# by campus code rather than by row position: an integer key on the name-indexed
# uni_info relies on a deprecated positional fallback and on Uni_Info.csv and
# Campus_Info.csv sharing the same row order.
admission_by_code = uni_info.set_index('Code')['Admission']
for code in school_codes:
    df["Admission_Avg." + code] = admission_by_code[code]

In [12]:
list(uni_info.index)

['Downtown Toronto (St. George)',
 'Scarborough (UTSC)',
 'Mississauga (UTM)',
 'Keele',
 'Glendon',
 'RyersonU',
 'OCADu']

In [13]:
enrollment_df

Unnamed: 0_level_0,Total,UG,Grad
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SG,53930,39293,14637
SC,11770,11494,276
MI,13298,12706,592
YK,41142,37263,3879
YG,2457,2341,116
RY,28159,26027,2132
OC,3491,3318,173


In [14]:
# Expansion factors per campus: [UG, Grad, Other]. Each factor is the campus's
# share of enrolment divided by its share of the sample.
exp_factors = {}
schoolnames = list(uni_info.index)

# Sample share of every campus within each level. These do not depend on the
# campus being processed, so they are computed once instead of once per campus.
# NOTE(review): the third factor pairs 'Total' enrolment with the sample share
# of 'Other'-level respondents - confirm this is intended.
sample_share = {
    'UG': df[df['Level'] == 'UG']['Campus'].value_counts(normalize=True),
    'Grad': df[df['Level'] == 'Grad']['Campus'].value_counts(normalize=True),
    'Total': df[df['Level'] == 'Other']['Campus'].value_counts(normalize=True),
}
enrollment_sum = {column: enrollment_df[column].sum() for column in sample_share}

for name in schoolnames:
    # Pair each campus name with its own code instead of relying on uni_info
    # and campus_info sharing the same row order.
    code = uni_info['Code'][name]
    exp_factors[name] = [
        enrollment_df.loc[code][column] / enrollment_sum[column] / sample_share[column][name]
        for column in ('UG', 'Grad', 'Total')
    ]


def load_exp_factors_segment(x):
    """Return the row's expansion factor, keeping the survey's own factor for 'Other' level rows."""
    if x.Level == 'Other':
        return x.Exp_SMTO
    return exp_factors[x.Campus][0 if x.Level == 'UG' else 1]


def load_exp_factors_level(x):
    """Return the campus expansion factor for the row's level (UG, Grad, otherwise the third factor)."""
    position = {'UG': 0, 'Grad': 1}.get(x.Level, 2)
    return exp_factors[x.Campus][position]


df['Exp_Segment'] = df.apply(load_exp_factors_segment, axis = 1)
df['Exp_Level'] = df.apply(load_exp_factors_level, axis = 1)
exp_factors

{'Downtown Toronto (St. George)': [0.9193065870406183,
  0.9895197769705163,
  1.4166989208319023],
 'Scarborough (UTSC)': [0.9497822284464283,
  0.7640417809898078,
  3.4010756402764026],
 'Mississauga (UTM)': [1.2368578048299124,
  1.3020715997650407,
  2.9886956202281616],
 'Keele': [1.247079700772638, 0.9839025572420779, 2.080478712714024],
 'Glendon': [0.6441796744570789, 1.0347168080715432, 0.45180427849782845],
 'RyersonU': [0.8989125445522926, 1.184473577930842, 0.345200289736002],
 'OCADu': [0.683093423300042, 0.5050326238769256, 2.353783217825954]}

In [15]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,...,Domestic.OC,Admission_Avg.SG,Admission_Avg.SC,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC,Exp_Segment,Exp_Level
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.949782,0.949782
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.98952,0.98952
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.919307,0.919307
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.919307,0.919307
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.98952,0.98952


In [16]:
# Remove columns we don't need anymore:
# CampusZone and the survey's own expansion factor are not needed past this point.
df = df.drop(columns=['CampusZone', 'Exp_SMTO'])

In [17]:
# find_distance returns -1 when a zone could not be found; keep only the rows
# whose walk time resolved for every campus.
walk_time_cols = ['WTT.' + code for code in school_codes]
df = df[(df[walk_time_cols] != -1).all(axis=1)]

In [18]:
def wtt(x):
    """Return the row's WTT.<code> value for the campus code in x.School_Codes."""
    column = 'WTT.' + x.School_Codes
    return x[column]


def aivtt(x):
    """Return the row's AIVTT.<code> value for the campus code in x.School_Codes."""
    column = 'AIVTT.' + x.School_Codes
    return x[column]


def tptt(x):
    """Return the row's TPTT.<code> value for the campus code in x.School_Codes."""
    column = 'TPTT.' + x.School_Codes
    return x[column]

# Travel time by each mode to the campus the respondent actually attends.
for column, lookup in (('Time.Active', wtt), ('Time.Auto', aivtt), ('Time.Transit', tptt)):
    df[column] = df.apply(lookup, axis=1)

In [19]:
df.shape

(14782, 92)

In [20]:
df.columns

Index(['Campus', 'Level', 'Status', 'Mode_Actual', 'Gender', 'Licence', 'Work',
       'Age', 'HomeZone', 'Family', 'Cars', 'Children', 'Adults', 'Income',
       'Mode', 'School_Codes', 'Segment', 'Dist.SG', 'Dist.SC', 'Dist.MI',
       'Dist.YK', 'Dist.YG', 'Dist.RY', 'Dist.OC', 'WTT.SG', 'WTT.SC',
       'WTT.MI', 'WTT.YK', 'WTT.YG', 'WTT.RY', 'WTT.OC', 'AIVTT.SG',
       'AIVTT.SC', 'AIVTT.MI', 'AIVTT.YK', 'AIVTT.YG', 'AIVTT.RY', 'AIVTT.OC',
       'TPTT.SG', 'TPTT.SC', 'TPTT.MI', 'TPTT.YK', 'TPTT.YG', 'TPTT.RY',
       'TPTT.OC', 'Total.SG', 'Total.SC', 'Total.MI', 'Total.YK', 'Total.YG',
       'Total.RY', 'Total.OC', 'UG.SG', 'UG.SC', 'UG.MI', 'UG.YK', 'UG.YG',
       'UG.RY', 'UG.OC', 'Grad.SG', 'Grad.SC', 'Grad.MI', 'Grad.YK', 'Grad.YG',
       'Grad.RY', 'Grad.OC', 'Tuition.SG', 'Tuition.SC', 'Tuition.MI',
       'Tuition.YK', 'Tuition.YG', 'Tuition.RY', 'Tuition.OC', 'Domestic.SG',
       'Domestic.SC', 'Domestic.MI', 'Domestic.YK', 'Domestic.YG',
       'Domestic.RY', 'Dome

In [21]:
# Number of remaining respondents in each segment (0 = no segment matched).
df['Segment'].value_counts()

1    6879
2    3461
5    2250
4     856
3     692
6     340
0     304
Name: Segment, dtype: int64

In [22]:
# Write the assembled table to CSV, leaving out the row index.
df.to_csv('SMTO_2015/SMTO_2015_Complete_Input.csv', index = False)