In [1]:
import pandas as pd
import csv

hh_df = pd.read_csv('../../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../../Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('Campus_Info.csv')

# Load relevant columns
df = ps_df[['pscampusattend', 'personstatusgrad', 'personstatustime','psmainmodefalltypical', 'pscmpgender','psdrivinglicenseownerflag','psworknumhoursperweek','psborrowcar']]
df = df.join(hh_df[['HmTTS2006', 'hhlivingsituation', 'hhcarnumber','hhnumyoungerthan18','hhnumolderorequalto18','hhincomelevel']])

# Rename columns
df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'hhlivingsituation': 'Family', 'psmainmodefalltypical': 'Mode', 
                       'personstatusgrad': 'Level', 'personstatustime': 'Status', 'psuniversityinvolvednumyears': 'Years', 'hhcarnumber': 'Cars',
                       'hhincomelevel': 'Income', 'pscmpgender': 'Gender', 'psdrivinglicenseownerflag': 'Licence', 'psworknumhoursperweek': 'Work',
                       'hhnumyoungerthan18': 'Children', 'hhnumolderorequalto18':'Adults', 'psborrowcar': 'Car_Avail'})

# Convert modes to three choices
mode_name_to_num = {"Car - Driver alone": "Auto", "Car - Driver with passenger(s)": "Auto", "Car - Passenger": "Auto", "Taxi": "Auto", 
                    "Transit Bus": "Transit", "Streetcar": "Transit", "Subway/RT": "Transit", "GO Bus": "Transit", "GO Train": "Transit", 
                    "Walk": "Active", "Bicycle": "Active"}
df.replace({'Mode': mode_name_to_num}, inplace=True)

# Handle NaNs
df = df[(df['Mode'] == "Auto") | (df['Mode'] == "Transit") | (df['Mode'] == "Active")]
df['Work'].fillna('Unknown', inplace=True) # Fill 11,000 Work NaNs with "Unknowns"
df['Car_Avail'].fillna(0, inplace = True)
df = df.dropna() # Remove rows with missing data

# Convert columns to numerical
df['Campus'] = df['Campus'].apply(lambda x: campus_info.Zone[campus_info['School'] == x].tolist()[0])
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Car_Avail'] = pd.to_numeric(df['Car_Avail'], downcast='signed')

df.head()

Unnamed: 0,Campus,Level,Status,Mode,Gender,Licence,Work,Car_Avail,HomeZone,Family,Cars,Children,Adults,Income
0,566,UG,FT,Transit,Female,0,Unknown,0,261,Live with family/parents,1,3,4,Unknown
1,69,Grad,FT,Active,Female,1,Unknown,0,71,Live with partner,0,0,2,"$ 90,000 - 119,999"
2,69,UG,FT,Transit,Female,1,Unknown,1,3714,Live with family/parents,1,0,4,Unknown
3,69,UG,FT,Active,Male,1,Unknown,1,74,Live with roommates,0,0,4,Unknown
4,69,Grad,FT,Active,Male,1,Unknown,0,71,Live with partner,0,0,2,"$ 30,000 - 59,999"


In [2]:
# Column Transformation functions

def inc_transform(x):
    if x == "Less than $ 30,000":
        return "Low"
    elif x == "$ 30,000 - 59,999":
        return "Low"
    elif x == "$ 60,000 - 89,999":
        return "Low"
    elif x == "$ 90,000 - 119,999":
        return "High"
    elif x == "$ 120,000 - 149,999":
        return "High"
    elif x == "$ 150,000 - 179,999":
        return "High"
    elif x == "$ 180,000 - 209,999":
        return "High"
    elif x == "$ 210,000 - 239,999":
        return "High"
    elif x == "$ 240,000 +":
        return "High"
    else:
        return "Unknown"
   
    
def work_transform(x):
    if x == "Yes, I work > 40 hours per week":
        return "FT"
    elif x == "Yes, I work 31-40 hours per week":
        return "FT"
    elif x == "Yes, I work part time (21-30 hours per week)":
        return "PT"
    elif x == "Yes, I work part time (11-20 hours per week)":
        return "PT"
    elif x == "Yes, I work part time (<10 hours per week)":
        return "PT"
    else:
        return "Other"
    
def fam_transform(x):
    return "Family" if x == "Live with family/parents" else "Other"

df['Work'] = df["Work"].apply(lambda x: work_transform(x))
df['Family'] = df["Family"].apply(lambda x: fam_transform(x))
df['Income'] = df["Income"].apply(lambda x: inc_transform(x))

In [3]:
# Segmentation function
def row_to_segment(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return 3 if x.Status == 'PT' else (not x.Family) + 1
    else:
        return 6 if x.Status == 'PT' else (4 + (not x.Family))

df['Segment'] = df.apply(row_to_segment, axis=1)

In [4]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode,Gender,Licence,Work,Car_Avail,HomeZone,Family,Cars,Children,Adults,Income,Segment
0,566,UG,FT,Transit,Female,0,Other,0,261,Family,1,3,4,Unknown,1
1,69,Grad,FT,Active,Female,1,Other,0,71,Other,0,0,2,High,4
2,69,UG,FT,Transit,Female,1,Other,1,3714,Family,1,0,4,Unknown,1
3,69,UG,FT,Active,Male,1,Other,1,74,Other,0,0,4,Unknown,1
4,69,Grad,FT,Active,Male,1,Other,0,71,Other,0,0,2,Low,4


In [5]:
# Load LoS Data
df_path = pd.read_csv('../../../../LoS/Walk_Distances.csv')
origins, dists = list(set(list(df_path['Origin']))), list(df_path['Data'])
AutoTravelTimes = list(pd.read_csv('../../../../LoS/Auto_Travel_Times.csv')['Data'])
TransitTravelTimes = list(pd.read_csv('../../../../LoS/Transit_Travel_Times.csv')['Data'])
AutoCosts = list(pd.read_csv('../../../../LoS/Auto_Cost.csv')['Data'])
TransitCosts = list(pd.read_csv('../../../../LoS/Transit_Cost.csv')['Data'])

In [6]:
def load_columns(o, d):
    try:
        i = origins.index(o)
    except ValueError:
        return (-1, -1, -1, -1, -1)
    j = origins.index(d)
    return (AutoTravelTimes[i*2392 + j], TransitTravelTimes[i*2392 + j],
           dists[i*2392 + j] * 15/1000, AutoCosts[i*2392 + j], TransitCosts[i*2392 + j])

In [7]:
df = df.reset_index()
df = pd.concat([df, pd.DataFrame([load_columns(x, y) for x, y in zip(df['HomeZone'], df['Campus'])], columns = ['Time.Auto', 'Time.Transit', 'Time.Active', 'Cost.Auto', 'Cost.Transit'])], axis=1, join='inner')
df = df[df['Time.Active'] != -1]
df = df[df['Time.Auto'] < 1000]
df = df.reset_index()
df = df.drop(columns=['level_0', 'index', 'Gender', 'Campus', 'Status', 'Level', 'HomeZone', 'Work', 'Car_Avail', 'Income', 'Segment', 'Children', 'Adults'])
df.head()

Unnamed: 0,Mode,Licence,Family,Cars,Time.Auto,Time.Transit,Time.Active,Cost.Auto,Cost.Transit
0,Transit,0,Family,1,17.47422,75.468478,223.2147,1.00664,2.22
1,Active,1,Other,0,2.924953,24.128386,16.985265,0.105459,2.22
2,Transit,1,Family,1,50.17188,155.55117,349.78845,1.751424,4.415403
3,Active,1,Other,0,1.049121,16.675728,10.49121,0.047282,2.22
4,Active,1,Other,0,2.924953,24.128386,16.985265,0.105459,2.22


In [8]:
# Dataframe to .csv file:
df.to_csv('Mode_Choice_Cost_Input.csv')