In [27]:
import pandas as pd
import math
import csv

hh_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('Campus_Info.csv')

# Load relevant columns
df = ps_df[['pscampusattend', 'personstatusgrad', 'personstatustime','psmainmodefalltypical', 'pscmpgender','psdrivinglicenseownerflag','psworknumhoursperweek']]
df = df.join(hh_df[['HmTTS2006', 'hhlivingsituation', 'hhcarnumber']])

#df = ps_df[['pscampusattend','psmainmodefalltypical']]
#df = df.join(hh_df[['HmTTS2006','hhincomelevel']])

df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'hhlivingsituation': 'Family', 'psmainmodefalltypical': 'Mode', 
                       'personstatusgrad': 'Level', 'personstatustime': 'Status', 'psuniversityinvolvednumyears': 'Years',
                       'hhincomelevel': 'Income', 'pscmpgender': 'Gender', 'psdrivinglicenseownerflag': 'Licence', 'psworknumhoursperweek'})

#df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'psmainmodefalltypical': 'Mode', 'hhincomelevel': 'Income'})

mode_name_to_num = {"Car - Driver alone": "Auto", "Car - Driver with passenger(s)": "Auto", "Car - Passenger": "Auto", "Taxi": "Auto", 
                    "Transit Bus": "Transit", "Streetcar": "Transit", "Subway/RT": "Transit", "GO Bus": "Transit", "GO Train": "Transit", 
                    "Walk": "Active", "Bicycle": "Active"}

df.replace({'Mode': mode_name_to_num}, inplace=True)
df = df[(df['Mode'] == "Auto") | (df['Mode'] == "Transit") | (df['Mode'] == "Active")]

df = df.dropna() # Remove rows with missing data

# Convert Campus column to numerical column
df['Campus'] = df["Campus"].apply(lambda x: campus_info.Zone[campus_info['School'] == x].tolist()[0])

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
#df['Family'] = (df['Family'] == 'Live with family/parents')*1

df.head()

Unnamed: 0,Campus,personstatusgrad,personstatustime,Mode,pscmpgender,psdrivinglicenseownerflag,psworknumhoursperweek,HomeZone,hhlivingsituation,hhcarnumber
5,69,UG,FT,Active,Female,0,"Yes, I work part time (<10 hours per week)",72,Live with roommates,0
13,69,Grad,FT,Active,Female,1,"No, I don't work",50,Live with partner,1
19,69,UG,FT,Transit,Female,0,"Yes, I work part time (11-20 hours per week)",38,Live with roommates,0
25,69,Grad,PT,Active,Male,1,"Yes, I work > 40 hours per week",413,Live with family/parents,1
30,69,UG,FT,Active,Male,1,"No, I don't work",67,Live with roommates,0


In [2]:
def row_to_segment(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return 3 if x.Status == 'PT' else (not x.Family) + 1
    else:
        return 6 if x.Status == 'PT' else (4 + (not x.Family))

df['Segment'] = df.apply(row_to_segment, axis=1)

In [15]:
# Dataframe with walk distances
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

# Dataframe with TransitTravelTimes
df_ac = pd.read_csv('../../../LoS/Auto_Cost.csv')
AutoCosts = list(df_ac['Data'])

# Dataframe with TransitTravelTimes
df_tc = pd.read_csv('../../../LoS/Transit_Cost.csv')
TransitCosts = list(df_tc['Data'])

not_found = set()

# Function for distance/AutoTravelTime/TransitTravelTime lookup
def find_value(origin, destination, mode_num):
    # mode_num: 0 - to find Auto Travel Time
    #           1 - to find Transit Travel Time
    #           2 - to find Walking Distance
    #           3 - to find Auto Cost
    #           4 - to find Transit Cost
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return 0
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return 0
    
    if mode_num == 0:
        return AutoTravelTimes[i*2392 + j]
    elif mode_num == 1:
        return TransitTravelTimes[i*2392 + j]
    elif mode_num == 2:
        return dists[i*2392 + j] * 15/1000 # This was changed from walking distance to walking time
    elif mode_num == 3:
        return AutoCosts[i*2392 + j]
    elif mode_num == 4:
        return TransitCosts[i*2392 + j]
    else:
        print("ERROR: Enter correct mode_num!")
        return 0

'''
# List of campus' TTS zones from Joven's MOE data
campus_zones = list(campus_info['Zone'])
school_codes = list(campus_info['Code'])

# Load distances into dataframe
for i in range(len(campus_zones)):
    df["Dist." + school_codes[i]] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i]))
print("# of zones not found:", len(not_found))
'''

# Add columns for Distance, Transit Travel Time, Auto Travel Time, Auto Cost, Transit Cost
df['Time.Auto'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 0), axis=1)
df['Time.Transit'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 1), axis=1)
df['Time.Active'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 2), axis=1)
#df['Cost.Auto'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 3), axis=1)
#df['Cost.Transit'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 4), axis=1)

df.head()

Unnamed: 0,Campus,Mode,HomeZone,Income,Time.Auto,Time.Transit,Time.Active
0,566,Transit,261,Unknown,17.47422,75.468478,223.2147
1,69,Active,71,"$ 90,000 - 119,999",2.924953,24.128386,16.985265
2,69,Transit,3714,Unknown,50.17188,155.55117,349.78845
3,69,Active,74,Unknown,1.049121,16.675728,10.49121
4,69,Active,71,"$ 30,000 - 59,999",2.924953,24.128386,16.985265


In [4]:
# Load enrollment data
enrollment_df = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv').set_index('School')
def get_log_enrollment(level, school):
    return math.log1p(enrollment_df.loc[school][level]) # Level: UG, Grad, Total
    
for level in ('Total', 'UG', 'Grad'):
    for i in range(len(campus_zones)):
        df[level + "." + school_codes[i]] = get_log_enrollment(level, school_codes[i])

In [5]:
df = df[(df['Dist.SG'] > 0) | (df['Dist.SC'] > 0)]
df = df.drop(columns=['Family', 'Level', 'Status', 'Years', 'HomeZone'])
for i in range(7):
    for campus, count in df[df['Segment'] == i]['Campus'].value_counts().iteritems():
        df["S" + str(i) + "." + campus] = count

In [16]:
df

Unnamed: 0,Campus,Mode,HomeZone,Income,Time.Auto,Time.Transit,Time.Active
0,566,Transit,261,Unknown,17.474220,75.468478,223.214700
1,69,Active,71,"$ 90,000 - 119,999",2.924953,24.128386,16.985265
2,69,Transit,3714,Unknown,50.171880,155.551170,349.788450
3,69,Active,74,Unknown,1.049121,16.675728,10.491210
4,69,Active,71,"$ 30,000 - 59,999",2.924953,24.128386,16.985265
5,69,Active,72,Unknown,3.068723,25.742024,23.927640
6,566,Transit,600,Unknown,14.542720,53.745301,172.493400
7,566,Auto,3420,"$ 30,000 - 59,999",60.657420,213.667410,694.200900
8,69,Active,113,Unknown,15.759180,42.637287,93.900495
9,69,Transit,1031,Unknown,66.121790,117.968371,446.012700


In [24]:
# Remove columns we don't need anymore:
del df['Campus']
del df['HomeZone']

In [25]:
df.head()

Unnamed: 0,Mode,Income,Time.Auto,Time.Transit,Time.Active
0,Transit,Unknown,17.47422,75.468478,223.2147
1,Active,"$ 90,000 - 119,999",2.924953,24.128386,16.985265
2,Transit,Unknown,50.17188,155.55117,349.78845
3,Active,Unknown,1.049121,16.675728,10.49121
4,Active,"$ 30,000 - 59,999",2.924953,24.128386,16.985265


In [26]:
# Dataframe to .csv file:
#df.to_csv('../../R_Logit_Models/Mode Choice Models/MChInput_2015_withIncome.csv')
df.to_csv('../../../../../../../MChInput_2015_withIncome.csv')