In [None]:
import pandas as pd
import math
import csv

# Read the household table, the respondent table and the campus lookup table
hh_df, ps_df, campus_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        'SMTO_2015/SMTO_2015_Households.csv',
        'SMTO_2015/SMTO_2015_Respondents.csv',
        'SMTO_2015/Campus_Info.csv',
    )
)

# List of campus' TTS zones from Joven's MOE data
# (campus_zones[k] and school_codes[k] come from the same campus_info row)
campus_zones = campus_info['Zone'].tolist()
school_codes = campus_info['Code'].tolist()

# List of University Information from OUAC, indexed by university name
uni_info = pd.read_csv('Uni_Info.csv').set_index('University')

# Load relevant columns
person_columns = [
    'pscampusattend', 'personstatusgrad', 'personstatustime',
    'psmainmodefalltypical', 'pscmpgender', 'psdrivinglicenseownerflag',
    'psworknumhoursperweek', 'psage', 'psexpfactor',
]
household_columns = [
    'HmTTS2006', 'hhlivingsituation', 'hhcarnumber',
    'hhnumyoungerthan18', 'hhnumolderorequalto18', 'hhincomelevel',
]
# NOTE(review): join() aligns the two tables on their row index, so this assumes
# row k of the respondents file belongs to row k of the households file - confirm.
df = ps_df[person_columns].join(hh_df[household_columns])

# Short, model-friendly column names
column_names = {
    # respondent attributes
    'pscampusattend': 'Campus',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'psmainmodefalltypical': 'Mode_Actual',
    'pscmpgender': 'Gender',
    'psdrivinglicenseownerflag': 'Licence',
    'psworknumhoursperweek': 'Work',
    'psage': 'Age',
    'psexpfactor': 'Exp_SMTO',
    # not among the selected columns above, so this entry currently renames nothing
    'psuniversityinvolvednumyears': 'Years',
    # household attributes
    'HmTTS2006': 'HomeZone',
    'hhlivingsituation': 'Family',
    'hhcarnumber': 'Cars',
    'hhnumyoungerthan18': 'Children',
    'hhnumolderorequalto18': 'Adults',
    'hhincomelevel': 'Income',
}
df = df.rename(columns=column_names)


# Maps each detailed survey mode label to its aggregate mode (Auto / Transit / Active)
mode_name_to_num = {"Car - Driver alone": "Auto", "Car - Driver with passenger(s)": "Auto", "Car - Passenger": "Auto", "Taxi": "Auto", 
                    "Transit Bus": "Transit", "Streetcar": "Transit", "Subway/RT": "Transit", "GO Bus": "Transit", "GO Train": "Transit", 
                    "Walk": "Active", "Bicycle": "Active"}
print(df.shape)

# Make Aggregate Mode Column
df['Mode'] = df['Mode_Actual'].replace(mode_name_to_num)
# Keep only respondents whose mode maps to one of the three aggregate modes.
# (The former "recode everything else as 'Other'" statement ran after this filter,
# so it could never match a row and has been removed.)
# .copy() makes the filtered frame independent, so later column assignments do not
# act on a slice of the unfiltered frame.
df = df[df['Mode'].isin(('Auto', 'Transit', 'Active'))].copy()
print(df.shape)

# Fill 11,000 Work NaNs with "Unknowns".
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form does not update df under pandas Copy-on-Write,
# and the dropna() below would then discard every respondent with a missing Work value.
df['Work'] = df['Work'].fillna('Unknown')
df = df.dropna() # Remove rows with missing data
print(df.shape)

# Convert Campus column to numerical column
# Build the school -> zone lookup once instead of filtering campus_info for every row.
# drop_duplicates keeps the first row per school, matching the previous "first match" rule.
zone_by_school = campus_info.drop_duplicates(subset='School').set_index('School')['Zone'].to_dict()
# Dictionary indexing raises a KeyError naming the campus if it is missing from campus_info
df['CampusZone'] = df['Campus'].apply(lambda school: zone_by_school[school])
print(df.shape)

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')

print(df.shape)

# Add column with school codes (uni_info is indexed by university name)
code_by_university = uni_info['Code'].to_dict()
df['School_Codes'] = df['Campus'].apply(lambda school: code_by_university[school])

df.head()

In [None]:
# Count how many respondents fall under each distinct value of the Work column
df['Work'].value_counts()

In [None]:
# Column Transformation functions

def inc_transform(x):
    """Collapse a household-income bracket label into a coarse income class.

    Returns "Low" for the three brackets below $90,000, "High" for the six
    brackets from $90,000 upward, and "Unknown" for any other value.
    """
    low_brackets = (
        "Less than $ 30,000",
        "$ 30,000 - 59,999",
        "$ 60,000 - 89,999",
    )
    high_brackets = (
        "$ 90,000 - 119,999",
        "$ 120,000 - 149,999",
        "$ 150,000 - 179,999",
        "$ 180,000 - 209,999",
        "$ 210,000 - 239,999",
        "$ 240,000 +",
    )
    if x in low_brackets:
        return "Low"
    if x in high_brackets:
        return "High"
    return "Unknown"

def log_inc(x):
    """Return log(1 + v) for the dollar value v assigned to an income bracket label.

    Each recognised bracket label is paired with one fixed dollar value
    (15,000 for the lowest bracket up to 255,000 for "$ 240,000 +").
    Any other input (unknown income) returns the integer 0.
    """
    bracket_values = (
        ("Less than $ 30,000", 15000),
        ("$ 30,000 - 59,999", 45000),
        ("$ 60,000 - 89,999", 75000),
        ("$ 90,000 - 119,999", 105000),
        ("$ 120,000 - 149,999", 135000),
        ("$ 150,000 - 179,999", 165000),
        ("$ 180,000 - 209,999", 195000),
        ("$ 210,000 - 239,999", 225000),
        ("$ 240,000 +", 255000),
    )
    for label, dollars in bracket_values:
        if x == label:
            return math.log1p(dollars)
    return 0  # Unknown

    
def work_transform(x):
    """Collapse a weekly-work-hours survey answer into an employment class.

    Returns "FT" for the two answers above 30 hours per week, "PT" for the
    three part-time answers, "NW" for "No, I don't work", and "Unknown" for
    any other value.
    """
    full_time_answers = (
        "Yes, I work > 40 hours per week",
        "Yes, I work 31-40 hours per week",
    )
    part_time_answers = (
        "Yes, I work part time (21-30 hours per week)",
        "Yes, I work part time (11-20 hours per week)",
        "Yes, I work part time (<10 hours per week)",
    )
    if x in full_time_answers:
        return "FT"
    if x in part_time_answers:
        return "PT"
    if x == "No, I don't work":
        return "NW"
    return "Unknown"
    
def fam_transform(x):
    """Return 1 when the living situation is "Live with family/parents", otherwise 0."""
    return 1 if x == "Live with family/parents" else 0
# Disabled helper: the car_transform definition below sits inside a bare
# triple-quoted string, so the function is never defined or executed.
# It would cap the car count at 2 (0 -> 0, 1 -> 1, anything else -> 2).
# NOTE(review): as the last expression of this cell the string is presumably
# echoed as the cell's output - confirm, and consider removing it.
'''
def car_transform(x):
    if x == 0:
        return 0
    elif x == 1: 
        return 1
    else:
        return 2
'''

In [None]:
# Every right-hand side is evaluated from the current df before assign() runs, so
# LogIncome is computed from the raw income bracket labels even though Income is
# collapsed to Low/High/Unknown in the same call.
df = df.assign(
    Work=df["Work"].apply(work_transform),
    Family=df["Family"].apply(fam_transform),
    #df['Cars'] = df["Cars"].apply(lambda x: car_transform(x))
    LogIncome=df["Income"].apply(log_inc),
    Income=df["Income"].apply(inc_transform),
)
df.head()

In [None]:
def row_to_segment(x):
    """Return the market-segment code (0-6) for one respondent row.

    Reads x.Level, x.Status and x.Family (the 0/1 lives-with-family flag):

        1  UG,   FT, lives with family      4  Grad, FT, lives with family
        2  UG,   FT, does not               5  Grad, FT, does not
        3  UG,   PT                         6  Grad, PT

    Rows matching none of the rules get 0.
    """
    undergrad = x.Level == 'UG'
    graduate = x.Level == 'Grad'
    full_time = x.Status == 'FT'
    part_time = x.Status == 'PT'
    at_home = x.Family
    away = not x.Family

    # The rules are mutually exclusive, so at most one of them matches
    rules = (
        (1, undergrad & full_time & at_home),
        (2, undergrad & full_time & away),
        (3, undergrad & part_time),
        (4, graduate & full_time & at_home),
        (5, graduate & full_time & away),
        (6, graduate & part_time),
    )
    for segment, matched in rules:
        if matched:
            return segment
    return 0

# axis=1 hands row_to_segment one respondent row at a time; the returned
# code is stored in the new Segment column
df['Segment'] = df.apply(row_to_segment, axis=1)

# Preview the first rows, now including Segment
df.head()

In [None]:
# Dataframe with walk distances
df_path = pd.read_csv('../../LoS/Walk_Distances.csv')
# find_distance() uses a zone's position in `origins` as the row/column offset into
# the flattened matrices, so the list must follow the order in which origins appear
# in the file. A set has no guaranteed iteration order; unique() de-duplicates
# while preserving order of first appearance.
origins = df_path['Origin'].unique().tolist()
dists = df_path['Data'].tolist()

# Dataframe with AutoTravelTimes
# NOTE(review): indexed with the same offsets as the walk distances, so it is
# assumed to list origin/destination pairs in the same order - confirm.
df_att = pd.read_csv('../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = df_att['Data'].tolist()

# Dataframe with TransitTravelTimes (same ordering assumption as above)
df_ttt = pd.read_csv('../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = df_ttt['Data'].tolist()

# Function for distance lookup
not_found = set()  # zones that were requested but are absent from `origins`
ZONE_COUNT = 2392  # row stride of the flattened origin x destination tables

def find_distance(origin, destination, info_num):
    """Look up one level-of-service value between two zones.

    Parameters
    ----------
    origin, destination : zone identifiers, looked up in the global `origins` list.
    info_num : which quantity to return
        0 - walking distance: stored walk distance / 1000
            (presumably metres -> km; confirm against the LoS export)
        1 - walking time: stored walk distance * 15 / 1000
            (presumably 15 minutes per km; confirm)
        2 - auto travel time, as stored in AutoTravelTimes
        3 - transit travel time, as stored in TransitTravelTimes

    Returns
    -------
    The requested value, or -1 when either zone is not in `origins`; the missing
    zone is then added to the global `not_found` set.

    Raises
    ------
    ValueError
        If info_num is not 0, 1, 2 or 3. The previous behaviour of printing a
        message and returning False would silently store 0 in a numeric column.
    """
    if info_num not in (0, 1, 2, 3):
        raise ValueError("ERROR: Enter correct info_num! Got %r" % (info_num,))

    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return -1
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return -1

    # The tables are flat lists laid out row by row: one row of ZONE_COUNT
    # destinations per origin
    cell = i * ZONE_COUNT + j
    if info_num == 0:
        return dists[cell] / 1000
    if info_num == 1:
        return dists[cell] * 15 / 1000
    if info_num == 2:
        return AutoTravelTimes[cell]
    return TransitTravelTimes[cell]

# Load distances into dataframe
# One column per (quantity, campus): "<prefix><school code>" holds the value from
# each respondent's home zone to that campus' zone. The outer loop runs over the
# quantity so that columns are added in the order Dist.*, WTT.*, AIVTT.*, TPTT.*
los_quantities = (
    ("Dist.", 0),    # walking distance
    ("WTT.", 1),     # walking time
    ("AIVTT.", 2),   # auto travel time
    ("TPTT.", 3),    # transit travel time
)
for column_prefix, info_num in los_quantities:
    for code, campus_zone in zip(school_codes, campus_zones):
        df[column_prefix + code] = df['HomeZone'].apply(
            lambda home_zone: find_distance(home_zone, campus_zone, info_num)
        )

print("# of zones not found:", len(not_found))

df.head()

In [None]:
# Load enrollment data (indexed by school code)
enrollment_df = pd.read_csv('Enrolment/Joven_Enrollment.csv').set_index('School')

def get_log_enrollment(level, school):
    """Return log(1 + enrollment) for `school` at `level` ('UG', 'Grad' or 'Total')."""
    return math.log1p(enrollment_df.loc[school, level])

# One constant column per (level, campus), e.g. "UG.<code>": every row receives the
# same log-enrollment value for that campus
for level in ('Total', 'UG', 'Grad'):
    for code in school_codes:
        df[level + "." + code] = get_log_enrollment(level, code)

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# One column per campus tuition
# uni_info is indexed by university name, so the i-th row has to be taken with
# .iloc; plain [i] on a label-indexed Series relies on a deprecated positional fallback.
# NOTE(review): this pairs school_codes[i] with the i-th row of uni_info, i.e. it
# assumes Campus_Info.csv and Uni_Info.csv list the schools in the same order - confirm.
for i in range(len(campus_zones)):
    df["Tuition." + school_codes[i]] = uni_info['Tuition'].iloc[i]
    
# One single tuition col
#df['Tuition'] = df['Campus'].apply(lambda x: uni_info.loc[x]['Tuition'])

In [None]:
# Domestic Percentage Column
# For each campus, graduate respondents get the campus' graduate domestic share and
# everyone else the undergraduate share.
# The two values are read once per campus with .iloc (uni_info is indexed by
# university name, so plain [i] would rely on a deprecated positional fallback).
for i in range(len(campus_zones)):
    domestic_grad = uni_info['Domestic%_Grad'].iloc[i]
    domestic_ug = uni_info['Domestic%_UG'].iloc[i]
    df["Domestic." + school_codes[i]] = df['Level'].apply(
        lambda level: domestic_grad if level == 'Grad' else domestic_ug
    )

In [None]:
# One constant column per campus with its admission average.
# .iloc takes the i-th row explicitly; plain [i] on the name-indexed uni_info
# relies on a deprecated positional fallback.
for i in range(len(campus_zones)):
    df["Admission_Avg." + school_codes[i]] = uni_info['Admission'].iloc[i]

In [None]:
# Show the university names that form the index of uni_info
list(uni_info.index)

In [None]:
# Display the enrollment table (indexed by School)
enrollment_df

In [None]:
# Expansion factors per school: [UG factor, Grad factor, Total/Other factor], where
#   factor = (school's share of enrollment) / (school's share of survey respondents)
exp_factors = {}
schoolnames = list(uni_info.index)

# Share of respondents per campus, computed once per level. The third entry pairs
# the 'Total' enrollment column with respondents whose Level is 'Other'.
sample_shares = {
    'UG': df.loc[df['Level'] == 'UG', 'Campus'].value_counts(normalize=True),
    'Grad': df.loc[df['Level'] == 'Grad', 'Campus'].value_counts(normalize=True),
    'Total': df.loc[df['Level'] == 'Other', 'Campus'].value_counts(normalize=True),
}
enrollment_totals = {level: enrollment_df[level].sum() for level in sample_shares}

# NOTE(review): school_codes[i] and schoolnames[i] are paired by position, i.e.
# Campus_Info.csv and Uni_Info.csv are assumed to list schools in the same order.
for i in range(len(campus_zones)):
    name = schoolnames[i]
    code = school_codes[i]
    factors = []
    for level in ('UG', 'Grad', 'Total'):
        enrollment_share = enrollment_df.loc[code, level] / enrollment_totals[level]
        # value_counts() is sorted by frequency, so the share must be looked up by
        # campus name; position i would return the i-th most common campus instead.
        # A campus with no respondents at this level yields NaN.
        respondent_share = sample_shares[level].get(name, float('nan'))
        factors.append(enrollment_share / respondent_share)
    exp_factors[name] = factors
    
def load_exp_factors_segment(x):
    """Return the expansion factor for one respondent row.

    Rows with Level 'Other' keep their own Exp_SMTO value. Otherwise the campus'
    entry in exp_factors is used: position 0 for 'UG', position 1 for any other level.
    """
    level = x.Level
    if level == 'Other':
        return x.Exp_SMTO
    position = 0 if level == 'UG' else 1
    return exp_factors[x.Campus][position]

def load_exp_factors_level(x):
    """Return the campus' expansion factor matching the row's Level.

    Uses position 0 of exp_factors[x.Campus] for 'UG', position 1 for 'Grad'
    and position 2 for any other level.
    """
    campus_factors = exp_factors[x.Campus]
    ug_factor = campus_factors[0]
    grad_factor = campus_factors[1]
    other_factor = campus_factors[2]
    if x.Level == 'UG':
        return ug_factor
    if x.Level == 'Grad':
        return grad_factor
    return other_factor

# Compute both expansion-factor variants row by row (axis=1 passes one row per call)
df['Exp_Segment'] = df.apply(load_exp_factors_segment, axis = 1)
df['Exp_Level'] = df.apply(load_exp_factors_level, axis = 1)

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# Remove columns we don't need anymore:
df = df.drop(columns=['CampusZone', 'HomeZone', 'Exp_SMTO'])

In [None]:
# Keep only respondents that have a usable level-of-service value to every campus
walk_time_columns = ['WTT.' + code for code in school_codes]
auto_time_columns = ['AIVTT.' + code for code in school_codes]
# -1 is the value find_distance returns when a zone could not be found
zones_found = (df[walk_time_columns] != -1).all(axis=1)
# Auto travel times of 1000 or more are rejected
auto_time_ok = (df[auto_time_columns] < 1000).all(axis=1)
df = df[zones_found & auto_time_ok]

In [None]:
# Number of rows and columns currently in df
df.shape

In [None]:
# List the column names currently in df
df.columns

In [None]:
# Count how many rows carry each Segment code
df['Segment'].value_counts()

In [None]:
# Write df to CSV; index=False leaves the row index out of the file
df.to_csv('SMTO_2015/SMTO_2015_Complete_Input.csv', index = False)