In [163]:
import pandas as pd
import math
import csv

hh_df = pd.read_csv('../../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../../Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('../../../Data/SMTO_2015/Campus_Info.csv')

# Load relevant columns
df = ps_df[['pscampusattend', 'personstatusgrad', 'personstatustime','psmainmodefalltypical', 'pscmpgender','psdrivinglicenseownerflag','psworknumhoursperweek','psborrowcar']]
df = df.join(hh_df[['HmTTS2006', 'hhlivingsituation', 'hhcarnumber','hhnumyoungerthan18','hhnumolderorequalto18','hhincomelevel']])

#df = ps_df[['pscampusattend','psmainmodefalltypical']]
#df = df.join(hh_df[['HmTTS2006','hhincomelevel']])

df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'hhlivingsituation': 'Family', 'psmainmodefalltypical': 'Mode', 
                       'personstatusgrad': 'Level', 'personstatustime': 'Status', 'psuniversityinvolvednumyears': 'Years', 'hhcarnumber': 'Cars',
                       'hhincomelevel': 'Income', 'pscmpgender': 'Gender', 'psdrivinglicenseownerflag': 'Licence', 'psworknumhoursperweek': 'Work',
                       'hhnumyoungerthan18': 'Children', 'hhnumolderorequalto18':'Adults', 'psborrowcar': 'Car_Avail'})

#df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'psmainmodefalltypical': 'Mode', 'hhincomelevel': 'Income'})

mode_name_to_num = {"Car - Driver alone": "Auto", "Car - Driver with passenger(s)": "Auto", "Car - Passenger": "Auto", "Taxi": "Auto", 
                    "Transit Bus": "Transit", "Streetcar": "Transit", "Subway/RT": "Transit", "GO Bus": "Transit", "GO Train": "Transit", 
                    "Bicycle": "Active", 'Walk': 'Active'}
print(df.shape)

df.replace({'Mode': mode_name_to_num}, inplace=True)
df = df[(df['Mode'] == "Auto") | (df['Mode'] == "Transit") | (df['Mode'] == "Active")]
print(df.shape)

df['Work'].fillna('Unknown', inplace=True) # Fill 11,000 Work NaNs with "Unknowns"
df['Car_Avail'].fillna(0, inplace = True)
df = df.dropna() # Remove rows with missing data
print(df.shape)


# Convert Campus column to numerical column
df['Campus'] = df["Campus"].apply(lambda x: campus_info.Zone[campus_info['School'] == x].tolist()[0])
print(df.shape)

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Car_Avail'] = pd.to_numeric(df['Car_Avail'], downcast='signed')

print(df.shape)
df.head()


(15226, 14)
(15066, 14)
(14839, 14)
(14839, 14)
(14839, 14)


Unnamed: 0,Campus,Level,Status,Mode,Gender,Licence,Work,Car_Avail,HomeZone,Family,Cars,Children,Adults,Income
0,566,UG,FT,Transit,Female,0,Unknown,0,261,Live with family/parents,1,3,4,Unknown
1,69,Grad,FT,Active,Female,1,Unknown,0,71,Live with partner,0,0,2,"$ 90,000 - 119,999"
2,69,UG,FT,Transit,Female,1,Unknown,1,3714,Live with family/parents,1,0,4,Unknown
3,69,UG,FT,Active,Male,1,Unknown,1,74,Live with roommates,0,0,4,Unknown
4,69,Grad,FT,Active,Male,1,Unknown,0,71,Live with partner,0,0,2,"$ 30,000 - 59,999"


In [164]:
df['Mode'].value_counts()

Transit    9533
Active     3888
Auto       1418
Name: Mode, dtype: int64

In [165]:
# Column Transformation functions

def inc_transform(x):
    if x == "Less than $ 30,000":
        return "Low"
    elif x == "$ 30,000 - 59,999":
        return "Low"
    elif x == "$ 60,000 - 89,999":
        return "Low"
    elif x == "$ 90,000 - 119,999":
        return "High"
    elif x == "$ 120,000 - 149,999":
        return "High"
    elif x == "$ 150,000 - 179,999":
        return "High"
    elif x == "$ 180,000 - 209,999":
        return "High"
    elif x == "$ 210,000 - 239,999":
        return "High"
    elif x == "$ 240,000 +":
        return "High"
    else:
        return "Unknown"
   
    
def work_transform(x):
    if x == "Yes, I work > 40 hours per week":
        return "FT"
    elif x == "Yes, I work 31-40 hours per week":
        return "FT"
    elif x == "Yes, I work part time (21-30 hours per week)":
        return "PT"
    elif x == "Yes, I work part time (11-20 hours per week)":
        return "PT"
    elif x == "Yes, I work part time (<10 hours per week)":
        return "PT"
    else:
        return "Other"
    
def fam_transform(x):
    if x == "Live with family/parents":
        return "Family"
    else:
        return "Other"

def car_transform(x):
    if x == 0:
        return 0
    elif x == 1: 
        return 1
    else:
        return 2

In [174]:
df['Work'] = df["Work"].apply(lambda x: work_transform(x))
df['Family'] = df["Family"].apply(lambda x: fam_transform(x))
df['Cars'] = df["Cars"].apply(lambda x: car_transform(x))
df['Income'] = df["Income"].apply(lambda x: inc_transform(x))

In [166]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode,Gender,Licence,Work,Car_Avail,HomeZone,Family,Cars,Children,Adults,Income
0,566,UG,FT,Transit,Female,0,Unknown,0,261,Live with family/parents,1,3,4,Unknown
1,69,Grad,FT,Active,Female,1,Unknown,0,71,Live with partner,0,0,2,"$ 90,000 - 119,999"
2,69,UG,FT,Transit,Female,1,Unknown,1,3714,Live with family/parents,1,0,4,Unknown
3,69,UG,FT,Active,Male,1,Unknown,1,74,Live with roommates,0,0,4,Unknown
4,69,Grad,FT,Active,Male,1,Unknown,0,71,Live with partner,0,0,2,"$ 30,000 - 59,999"


In [2]:
def row_to_segment(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return 3 if x.Status == 'PT' else (not x.Family) + 1
    else:
        return 6 if x.Status == 'PT' else (4 + (not x.Family))

df['Segment'] = df.apply(row_to_segment, axis=1)

In [167]:
# Dataframe with walk distances
df_path = pd.read_csv('../../../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../../../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../../../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

# Dataframe with TransitTravelTimes
df_ac = pd.read_csv('../../../../LoS/Auto_Cost.csv')
AutoCosts = list(df_ac['Data'])

# Dataframe with TransitTravelTimes
df_tc = pd.read_csv('../../../../LoS/Transit_Cost.csv')
TransitCosts = list(df_tc['Data'])

not_found = set()

# Function for distance/AutoTravelTime/TransitTravelTime lookup
def find_value(origin, destination, mode_num):
    # mode_num: 0 - to find Auto Travel Time
    #           1 - to find Transit Travel Time
    #           2 - to find Walking Time
    #           3 - to find Auto Cost
    #           4 - to find Transit Cost
    #           5 - to find Biking Time
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return -1
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return -1
    
    if mode_num == 0:
        return AutoTravelTimes[i*2392 + j]
    elif mode_num == 1:
        return TransitTravelTimes[i*2392 + j]
    elif mode_num == 2:
        return dists[i*2392 + j] * 15/1000 # This was changed from walking distance to walking time
    elif mode_num == 3:
        return AutoCosts[i*2392 + j]
    elif mode_num == 4:
        return TransitCosts[i*2392 + j]
    elif mode_num == 5:
        return dists[i*2392 + j] * 4/1000 # This was changed from walking distance to biking time
    else:
        print("ERROR: Enter correct mode_num!")
        return 0

'''
# List of campus' TTS zones from Joven's MOE data
campus_zones = list(campus_info['Zone'])
school_codes = list(campus_info['Code'])

# Load distances into dataframe
for i in range(len(campus_zones)):
    df["Dist." + school_codes[i]] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i]))
print("# of zones not found:", len(not_found))
'''

'\n# List of campus\' TTS zones from Joven\'s MOE data\ncampus_zones = list(campus_info[\'Zone\'])\nschool_codes = list(campus_info[\'Code\'])\n\n# Load distances into dataframe\nfor i in range(len(campus_zones)):\n    df["Dist." + school_codes[i]] = df[\'HomeZone\'].apply(lambda x: find_distance(x, campus_zones[i]))\nprint("# of zones not found:", len(not_found))\n'

In [168]:
# Add columns for Distance, Transit Travel Time, Auto Travel Time, Auto Cost, Transit Cost
df['Time.Auto'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 0), axis=1)
df['Time.Transit'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 1), axis=1)

'''
active = []
homes = list(df['HomeZone'])
schools = list(df['Campus'])
for i in range(len(df['Mode'])):
    if list(df['Mode'])[i] == 'Bike':
        active.append(find_value(homes[i], schools[i], 5))
    else:
        active.append(find_value(homes[i], schools[i], 2))
df['Time.Active'] = active
'''

df['Time.Active'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 2), axis=1)

#df['Time.Bike'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 5), axis=1)
#df['Time.Active'] = df.apply(lambda x: find_value(x.HomeZone, x.Campus, 2), axis=1)
#df['Cost.Auto'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 3), axis=1)
#df['Cost.Transit'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 4), axis=1)

df.head()

Unnamed: 0,Campus,Level,Status,Mode,Gender,Licence,Work,Car_Avail,HomeZone,Family,Cars,Children,Adults,Income,Time.Auto,Time.Transit,Time.Active
0,566,UG,FT,Transit,Female,0,Unknown,0,261,Live with family/parents,1,3,4,Unknown,17.47422,75.468478,223.2147
1,69,Grad,FT,Active,Female,1,Unknown,0,71,Live with partner,0,0,2,"$ 90,000 - 119,999",2.924953,24.128386,16.985265
2,69,UG,FT,Transit,Female,1,Unknown,1,3714,Live with family/parents,1,0,4,Unknown,50.17188,155.55117,349.78845
3,69,UG,FT,Active,Male,1,Unknown,1,74,Live with roommates,0,0,4,Unknown,1.049121,16.675728,10.49121
4,69,Grad,FT,Active,Male,1,Unknown,0,71,Live with partner,0,0,2,"$ 30,000 - 59,999",2.924953,24.128386,16.985265


In [113]:
'''
# Load enrollment data
enrollment_df = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv').set_index('School')
def get_log_enrollment(level, school):
    return math.log1p(enrollment_df.loc[school][level]) # Level: UG, Grad, Total
    
for level in ('Total', 'UG', 'Grad'):
    for i in range(len(campus_zones)):
        df[level + "." + school_codes[i]] = get_log_enrollment(level, school_codes[i])
'''

'\n# Load enrollment data\nenrollment_df = pd.read_csv(\'../../Data/Enrolment/Joven_Enrollment.csv\').set_index(\'School\')\ndef get_log_enrollment(level, school):\n    return math.log1p(enrollment_df.loc[school][level]) # Level: UG, Grad, Total\n    \nfor level in (\'Total\', \'UG\', \'Grad\'):\n    for i in range(len(campus_zones)):\n        df[level + "." + school_codes[i]] = get_log_enrollment(level, school_codes[i])\n'

In [5]:
df = df[(df['Dist.SG'] > 0) | (df['Dist.SC'] > 0)]
df = df.drop(columns=['Family', 'Level', 'Status', 'Years', 'HomeZone'])
for i in range(7):
    for campus, count in df[df['Segment'] == i]['Campus'].value_counts().iteritems():
        df["S" + str(i) + "." + campus] = count

In [101]:
df

Unnamed: 0,Level,Status,Mode,Gender,Licence,Work,Car_Avail,Family,Cars,Children,Adults,Income,Time.Auto,Time.Transit,Time.Walk,Time.Bike
0,UG,FT,Transit,Female,0,Other,0,Family,1,3,4,Unknown,17.474220,75.468478,223.214700,59.523920
1,Grad,FT,Active,Female,1,Other,0,Other,0,0,2,High,2.924953,24.128386,16.985265,4.529404
2,UG,FT,Transit,Female,1,Other,1,Family,1,0,4,Unknown,50.171880,155.551170,349.788450,93.276920
3,UG,FT,Active,Male,1,Other,1,Other,0,0,4,Unknown,1.049121,16.675728,10.491210,2.797656
4,Grad,FT,Active,Male,1,Other,0,Other,0,0,2,Low,2.924953,24.128386,16.985265,4.529404
5,UG,FT,Active,Female,0,PT,0,Other,0,0,3,Unknown,3.068723,25.742024,23.927640,6.380704
6,UG,FT,Transit,Female,0,Other,0,Family,2,1,4,Unknown,14.542720,53.745301,172.493400,45.998240
7,UG,FT,Auto,Female,1,Other,0,Family,2,0,5,Low,60.657420,213.667410,694.200900,185.120240
8,Grad,FT,Bike,Female,1,Other,0,Other,1,0,2,Unknown,15.759180,42.637287,93.900495,25.040132
9,Grad,FT,Transit,Female,1,Other,0,Other,1,0,2,Unknown,66.121790,117.968371,446.012700,118.936720


In [86]:
#mode_name_to_num = {"Bike": "Active"}
#df.replace({'Mode': mode_name_to_num}, inplace=True)

In [169]:
df["Mode"].value_counts()

Transit    9533
Active     3888
Auto       1418
Name: Mode, dtype: int64

In [170]:
# Remove columns we don't need anymore:
del df['Campus']
del df['HomeZone']

In [171]:
df = df[df['Time.Active'] != -1]

In [172]:
df = df[((df['Mode'] == 'Active') & (df['Time.Active'] <= 75)) | (df['Mode'] == 'Auto') | (df['Mode'] == 'Transit') ]

In [175]:
df.head()

Unnamed: 0,Level,Status,Mode,Gender,Licence,Work,Car_Avail,Family,Cars,Children,Adults,Income,Time.Auto,Time.Transit,Time.Active
0,UG,FT,Transit,Female,0,Other,0,Family,1,3,4,Unknown,17.47422,75.468478,223.2147
1,Grad,FT,Active,Female,1,Other,0,Other,0,0,2,High,2.924953,24.128386,16.985265
2,UG,FT,Transit,Female,1,Other,1,Family,1,0,4,Unknown,50.17188,155.55117,349.78845
3,UG,FT,Active,Male,1,Other,1,Other,0,0,4,Unknown,1.049121,16.675728,10.49121
4,Grad,FT,Active,Male,1,Other,0,Other,0,0,2,Low,2.924953,24.128386,16.985265


In [176]:
df.shape

(14344, 15)

In [177]:
# Dataframe to .csv file:
df.to_csv('Input_Final_Filtered.csv', index = False)