In [2]:
import pandas as pd
import math
import csv

In [116]:
hh_df = pd.read_csv('../GitHub_PORPOS/PORPOS-TMG/Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../GitHub_PORPOS/PORPOS-TMG/Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('../GitHub_PORPOS/PORPOS-TMG/Data/SMTO_2015/Campus_Info.csv')

# List of campus' TTS zones from Joven's MOE data
campus_zones = list(campus_info['Zone'])
school_codes = list(campus_info['Code'])

# List of University Information from OUAC
uni_info = pd.read_csv('../GitHub_PORPOS/PORPOS-TMG/Data/Uni_Info.csv')
uni_info = uni_info.set_index('University')

# Load relevant columns
df = ps_df[['pscampusattend', 'personstatusgrad', 'personstatustime','psmainmodefalltypical', 'pscmpgender','psdrivinglicenseownerflag','psworknumhoursperweek','psage','psexpfactor']]
df = df.join(hh_df[['HmTTS2006', 'hhlivingsituation', 'hhcarnumber','hhnumyoungerthan18','hhnumolderorequalto18','hhincomelevel']])

df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'hhlivingsituation': 'Family', 'psmainmodefalltypical': 'Mode_Actual', 
                       'personstatusgrad': 'Level', 'personstatustime': 'Status', 'psuniversityinvolvednumyears': 'Years', 'hhcarnumber': 'Cars',
                       'hhincomelevel': 'Income', 'pscmpgender': 'Gender', 'psdrivinglicenseownerflag': 'Licence', 'psworknumhoursperweek': 'Work',
                       'hhnumyoungerthan18': 'Children', 'hhnumolderorequalto18':'Adults', 'psage': 'Age','psexpfactor': 'Exp_SMTO'})


mode_name_to_num = {"Car - Driver alone": "Auto", "Car - Driver with passenger(s)": "Auto", "Car - Passenger": "Auto", "Taxi": "Auto", 
                    "Transit Bus": "Transit", "Streetcar": "Transit", "Subway/RT": "Transit", "GO Bus": "Transit", "GO Train": "Transit", 
                    "Walk": "Active", "Bicycle": "Active"}
print(df.shape)

# Make Aggregate Mode Column
df['Mode'] = df['Mode_Actual'].replace(mode_name_to_num)
df = df[(df['Mode'] == "Auto") | (df['Mode'] == "Transit") | (df['Mode'] == "Active")]
print(df.shape)

df['Work'].fillna('Unknown', inplace=True) # Fill 11,000 Work NaNs with "Unknowns"
df = df.dropna() # Remove rows with missing data
print(df.shape)

# Convert Campus column to numerical column
df['CampusZone'] = df["Campus"].apply(lambda x: campus_info.Zone[campus_info['School'] == x].tolist()[0])
print(df.shape)

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')

print(df.shape)

# Add column with school codes
df['School_Codes'] = df['Campus'].apply(lambda x: uni_info['Code'][x])

df.head()

(15226, 15)
(15066, 16)
(14839, 16)
(14839, 17)
(14839, 17)


Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,Family,Cars,Children,Adults,Income,Mode,CampusZone,School_Codes
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,Live with family/parents,1,3,4,Unknown,Transit,566,SC
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,Live with partner,0,0,2,"$ 90,000 - 119,999",Active,69,SG
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,Live with family/parents,1,0,4,Unknown,Transit,69,SG
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,Live with roommates,0,0,4,Unknown,Active,69,SG
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,Live with partner,0,0,2,"$ 30,000 - 59,999",Active,69,SG


In [117]:
# Column Transformation functions

def inc_transform(x):
    if x == "Less than $ 30,000":
        return "Low"
    elif x == "$ 30,000 - 59,999":
        return "Low"
    elif x == "$ 60,000 - 89,999":
        return "Low"
    elif x == "$ 90,000 - 119,999":
        return "High"
    elif x == "$ 120,000 - 149,999":
        return "High"
    elif x == "$ 150,000 - 179,999":
        return "High"
    elif x == "$ 180,000 - 209,999":
        return "High"
    elif x == "$ 210,000 - 239,999":
        return "High"
    elif x == "$ 240,000 +":
        return "High"
    else:
        return "Unknown"
   
    
def work_transform(x):
    if x == "Yes, I work > 40 hours per week":
        return "FT"
    elif x == "Yes, I work 31-40 hours per week":
        return "FT"
    elif x == "Yes, I work part time (21-30 hours per week)":
        return "PT"
    elif x == "Yes, I work part time (11-20 hours per week)":
        return "PT"
    elif x == "Yes, I work part time (<10 hours per week)":
        return "PT"
    elif x == "No, I don't work":
        return "NW"
    else:
        return "Unknown"
    
def fam_transform(x):
    if x == "Live with family/parents":
        return 1
    else:
        return 0
'''
def car_transform(x):
    if x == 0:
        return 0
    elif x == 1: 
        return 1
    else:
        return 2
'''

'\ndef car_transform(x):\n    if x == 0:\n        return 0\n    elif x == 1: \n        return 1\n    else:\n        return 2\n'

In [118]:
df['Work'] = df["Work"].apply(lambda x: work_transform(x))
df['Family'] = df["Family"].apply(lambda x: fam_transform(x))
#df['Cars'] = df["Cars"].apply(lambda x: car_transform(x))
df['Income'] = df["Income"].apply(lambda x: inc_transform(x))
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,Family,Cars,Children,Adults,Income,Mode,CampusZone,School_Codes
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,1,1,3,4,Unknown,Transit,566,SC
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,0,0,0,2,High,Active,69,SG
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,1,1,0,4,Unknown,Transit,69,SG
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,0,0,0,4,Unknown,Active,69,SG
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,0,0,0,2,Low,Active,69,SG


In [119]:
# Remove others for Level & Status
df = df[df['Level'] != 'Other']
df = df[df['Status'] != 'Other']

In [120]:
def row_to_segment(x):
    
    return {((x.Level == 'UG') & (x.Status == 'FT') & x.Family): 1,
            ((x.Level == 'UG') & (x.Status == 'FT') & (not x.Family)): 2,
            ((x.Level == 'UG') & (x.Status == 'PT')): 3,
            ((x.Level == 'Grad') & (x.Status == 'FT') & x.Family): 4,
            ((x.Level == 'Grad') & (x.Status == 'FT') & (not x.Family)): 5,
            ((x.Level == 'Grad') & (x.Status == 'PT')): 6}.get(True, 0)

df['Segment'] = df.apply(row_to_segment, axis=1)

df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,Family,Cars,Children,Adults,Income,Mode,CampusZone,School_Codes,Segment
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,1,1,3,4,Unknown,Transit,566,SC,1
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,0,0,0,2,High,Active,69,SG,5
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,1,1,0,4,Unknown,Transit,69,SG,1
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,0,0,0,4,Unknown,Active,69,SG,2
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,0,0,0,2,Low,Active,69,SG,5


In [121]:
# Dataframe with walk distances
df_path = pd.read_csv('../GitHub_PORPOS/LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../GitHub_PORPOS/LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../GitHub_PORPOS/LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

# Function for distance lookup
not_found = set()
def find_distance(origin, destination, info_num):
    # mode_num: 0 - to find Walking Distance
    #           1 - to find Walking Time
    #           2 - to find Auto Time  
    #           3 - to find Transit Time
    
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return -1
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return -1
    
    if info_num == 0:
        return dists[i*2392 + j] / 1000
    elif info_num == 1:
        return dists[i*2392 + j] * 15 / 1000 
    elif info_num == 2:
        return AutoTravelTimes[i*2392 + j]
    elif info_num == 3:
        return TransitTravelTimes[i*2392 + j]
    else:
        print("ERROR: Enter correct info_num!")
        return False

# Load distances into dataframe
for i in range(len(campus_zones)):
    df["Dist." + school_codes[i]] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i],0))
for i in range(len(campus_zones)):
    df["WTT." + school_codes[i]] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i],1))
for i in range(len(campus_zones)):
    df["AIVTT." + school_codes[i]] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i],2))
for i in range(len(campus_zones)):
    df["TPTT." + school_codes[i]] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i],3))
    
print("# of zones not found:", len(not_found))

df.head()

# of zones not found: 117


Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,...,AIVTT.YG,AIVTT.RY,AIVTT.OC,TPTT.SG,TPTT.SC,TPTT.MI,TPTT.YK,TPTT.YG,TPTT.RY,TPTT.OC
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,...,15.3011,24.89705,27.11227,73.276483,75.468478,129.369386,197.858689,64.076936,75.379996,81.104413
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,...,19.49373,5.748302,5.553657,24.128386,78.020223,77.13505,144.674281,66.918003,31.74921,27.804764
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,...,54.99243,51.37231,48.72094,155.55117,124.359872,42.439563,85.084262,186.874439,147.452825,146.732427
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,...,22.30349,4.76811,3.258073,16.675728,88.246135,83.728866,152.561525,77.003957,21.273948,19.897948
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,...,19.49373,5.748302,5.553657,24.128386,78.020223,77.13505,144.674281,66.918003,31.74921,27.804764


In [122]:
# Load enrollment data
enrollment_df = pd.read_csv('../GitHub_PORPOS/PORPOS-TMG/Data/Enrolment/Joven_Enrollment.csv').set_index('School')
def get_log_enrollment(level, school):
    return math.log1p(enrollment_df.loc[school][level]) # Level: UG, Grad, Total
    
for level in ('Total', 'UG', 'Grad'):
    for i in range(len(campus_zones)):
        df[level + "." + school_codes[i]] = get_log_enrollment(level, school_codes[i])

In [123]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,...,UG.YG,UG.RY,UG.OC,Grad.SG,Grad.SC,Grad.MI,Grad.YK,Grad.YG,Grad.RY,Grad.OC
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,...,7.758761,10.166928,8.107419,9.591376,5.624018,6.385194,8.26359,4.762174,7.665285,5.159055


In [124]:
# One column per campus tuition
for i in range(len(campus_zones)):
    df["Tuition." + school_codes[i]] = uni_info['Tuition'][i]
    
# One single tuition col
#df['Tuition'] = df['Campus'].apply(lambda x: uni_info.loc[x]['Tuition'])

In [125]:
# Domestic Percentage Column
for i in range(len(campus_zones)):
    df["Domestic." + school_codes[i]] = df['Level'].apply(lambda x: uni_info['Domestic%_' + ('Grad' if (x == 'Grad') else 'UG')][i])

In [126]:
for i in range(len(campus_zones)):
    df["Admission_Avg." + school_codes[i]] = uni_info['Admission'][i]

In [18]:
list(uni_info.index)

['Downtown Toronto (St. George)',
 'Scarborough (UTSC)',
 'Mississauga (UTM)',
 'Keele',
 'Glendon',
 'RyersonU',
 'OCADu']

In [19]:
enrollment_df

Unnamed: 0_level_0,Total,UG,Grad
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SG,53930,39293,14637
SC,11770,11494,276
MI,13298,12706,592
YK,41142,37263,3879
YG,2457,2341,116
RY,28159,26027,2132
OC,3491,3318,173


In [20]:
exp_factors = {}
schoolnames = list(uni_info.index)
for i in range(len(campus_zones)):
    exp_factors[schoolnames[i]] = [enrollment_df.loc[school_codes[i]]['UG'] / enrollment_df['UG'].sum() / df[df['Level'] == 'UG']['Campus'].value_counts(normalize=True)[i]]
    exp_factors[schoolnames[i]].append(enrollment_df.loc[school_codes[i]]['Grad'] / enrollment_df['Grad'].sum() / df[df['Level'] == 'Grad']['Campus'].value_counts(normalize=True)[i]) 
    exp_factors[schoolnames[i]].append(enrollment_df.loc[school_codes[i]]['Total'] / enrollment_df['Total'].sum() / df[df['Level'] == 'Other']['Campus'].value_counts(normalize=True)[i])
    
def load_exp_factors_segment(x):
    if x.Level == 'Other':
        return x.Exp_SMTO
    return exp_factors[x.Campus][0 if x.Level == 'UG' else 1]

def load_exp_factors_level(x):
    return {x.Level == 'UG': exp_factors[x.Campus][0], x.Level == 'Grad': exp_factors[x.Campus][1]}.get(True, exp_factors[x.Campus][2])

df['Exp_Segment'] = df.apply(load_exp_factors_segment, axis = 1)
df['Exp_Level'] = df.apply(load_exp_factors_level, axis = 1)

IndexError: index out of bounds

In [21]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,Exp_SMTO,HomeZone,...,Domestic.YG,Domestic.RY,Domestic.OC,Admission_Avg.SG,Admission_Avg.SC,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,9.7,261,...,0.8921,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,5.79,71,...,0.8077,0.8837,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,9.06,3714,...,0.8921,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,14.67,74,...,0.8921,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,9.11,71,...,0.8077,0.8837,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824


In [127]:
# Remove columns we don't need anymore:
del df['CampusZone']
del df['Exp_SMTO']

In [128]:
for i in school_codes:
    df = df[df['WTT.' + i] != -1]
    df = df[df['AIVTT.' + i] < 1000]

In [129]:
df.shape

(14357, 87)

In [19]:
df.columns

Index(['Campus', 'Level', 'Status', 'Mode_Actual', 'Gender', 'Licence', 'Work',
       'Age', 'HomeZone', 'Family', 'Cars', 'Children', 'Adults', 'Income',
       'Mode', 'School_Codes', 'Segment', 'Dist.SG', 'Dist.SC', 'Dist.MI',
       'Dist.YK', 'Dist.YG', 'Dist.RY', 'Dist.OC', 'WTT.SG', 'WTT.SC',
       'WTT.MI', 'WTT.YK', 'WTT.YG', 'WTT.RY', 'WTT.OC', 'AIVTT.SG',
       'AIVTT.SC', 'AIVTT.MI', 'AIVTT.YK', 'AIVTT.YG', 'AIVTT.RY', 'AIVTT.OC',
       'TPTT.SG', 'TPTT.SC', 'TPTT.MI', 'TPTT.YK', 'TPTT.YG', 'TPTT.RY',
       'TPTT.OC', 'Total.SG', 'Total.SC', 'Total.MI', 'Total.YK', 'Total.YG',
       'Total.RY', 'Total.OC', 'UG.SG', 'UG.SC', 'UG.MI', 'UG.YK', 'UG.YG',
       'UG.RY', 'UG.OC', 'Grad.SG', 'Grad.SC', 'Grad.MI', 'Grad.YK', 'Grad.YG',
       'Grad.RY', 'Grad.OC', 'Tuition.SG', 'Tuition.SC', 'Tuition.MI',
       'Tuition.YK', 'Tuition.YG', 'Tuition.RY', 'Tuition.OC', 'Domestic.SG',
       'Domestic.SC', 'Domestic.MI', 'Domestic.YK', 'Domestic.YG',
       'Domestic.RY', 'Dome

In [130]:
df['Segment'].value_counts()

1    6830
2    3433
5    2222
4     846
3     689
6     337
Name: Segment, dtype: int64

In [26]:
#df.to_csv('SMTO_2015_Complete_Input_NoOthers.csv', index = False)

In [53]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,Domestic.YG,Domestic.RY,Domestic.OC,Admission_Avg.SG,Admission_Avg.SC,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,0.8921,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,0.8077,0.8837,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,0.8921,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,0.8921,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,0.8077,0.8837,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824


In [3]:
pred_df = pd.read_csv('mlogit_prediction.csv')
pred_df

Unnamed: 0,SG,MI,OC,RY,SC,YG,YK
1,0.511608,0.013675,0.045430,0.288467,0.048938,0.022725,0.069158
2,0.515304,0.013940,0.046389,0.291351,0.042024,0.021334,0.069658
3,0.516145,0.013962,0.046465,0.291827,0.040137,0.021691,0.069772
4,0.519701,0.014099,0.046918,0.294671,0.035088,0.019607,0.069917
5,0.522102,0.014164,0.047135,0.296032,0.031581,0.018747,0.070240
6,0.520899,0.014131,0.047026,0.295350,0.032955,0.019562,0.070078
7,0.520711,0.014130,0.047009,0.295243,0.033299,0.019555,0.070052
8,0.518246,0.014063,0.046787,0.293846,0.036784,0.020554,0.069721
9,0.526735,0.014684,0.047553,0.298659,0.026149,0.015357,0.070863
10,0.526212,0.014670,0.047506,0.298362,0.026749,0.015709,0.070793


In [132]:
df = df.set_index(pred_df.index)
df['Pred_School'] = pred_df.idxmax(axis = 1)

In [136]:
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,Domestic.RY,Domestic.OC,Admission_Avg.SG,Admission_Avg.SC,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC,Pred_School
1,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,SG
2,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,0.8837,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,SG
3,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,SG
4,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,0.9673,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,SG
5,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,0.8837,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,SG


In [137]:
homezones = set(df['HomeZone'].value_counts().index.tolist())
zone_to_campus = {}

for i in homezones:
    SG = ((df['Pred_School'] == 'SG') & (df['HomeZone'] == i)).sum()
    YK = ((df['Pred_School'] == 'YK') & (df['HomeZone'] == i)).sum()
    RY = ((df['Pred_School'] == 'RY') & (df['HomeZone'] == i)).sum()
    SC = ((df['Pred_School'] == 'SC') & (df['HomeZone'] == i)).sum()
    MI = ((df['Pred_School'] == 'MI') & (df['HomeZone'] == i)).sum()
    OC = ((df['Pred_School'] == 'OC') & (df['HomeZone'] == i)).sum()
    YG = ((df['Pred_School'] == 'YG') & (df['HomeZone'] == i)).sum()

    x_names = ['SG','YK','RY','SC','MI','OC','YG']
    x = [SG,YK,RY,SC,MI,OC,YG]
    x_num = [1,2,3,4,5,6,7] 
    
    zone_to_campus[i] = x_names[x.index(max(x))]

In [138]:
top_school_df = df.from_dict(zone_to_campus,orient='index')
print(top_school_df.shape)
top_school_df.head()

(1430, 1)


Unnamed: 0,0
1,SG
10,SG
14,SG
15,SG
16,SG


In [139]:
top_school_df = top_school_df.rename(columns = {0:'code'})

In [140]:
def code_to_num(x):
    if x == 'SG':
        return 1
    elif x == 'YK':
        return 2
    elif x == 'RY':
        return 3
    elif x == 'SC':
        return 4
    elif x == 'MI':
        return 5
    elif x == 'OC':
        return 6
    elif x == 'YG':
        return 7

top_school_df['num'] = top_school_df['code'].apply(lambda x: code_to_num(x))

In [141]:
top_school_df.head()

Unnamed: 0,code,num
1,SG,1
10,SG,1
14,SG,1
15,SG,1
16,SG,1


In [142]:
arc_df = pd.read_csv('arcGis_data.csv')
print(arc_df.shape)
arc_df.head()

(3764, 11)


Unnamed: 0,FID,Shape,NUM,GTA06,PD,REGION,GTA01,AREA_M,AREA_H,UTMX_CENT,UTMY_CENT
0,2151,Polygon,1,1,1,1,359,0,0,0,0
1,1826,Polygon,2,2,1,1,360,0,0,0,0
2,2153,Polygon,3,3,1,1,359,0,0,0,0
3,1961,Polygon,4,4,1,1,478,0,0,0,0
4,1960,Polygon,5,5,1,1,478,0,0,0,0


In [143]:
import numpy as np
arc_df['PredSchool'] = np.nan
arc_df.head()

Unnamed: 0,FID,Shape,NUM,GTA06,PD,REGION,GTA01,AREA_M,AREA_H,UTMX_CENT,UTMY_CENT,PredSchool
0,2151,Polygon,1,1,1,1,359,0,0,0,0,
1,1826,Polygon,2,2,1,1,360,0,0,0,0,
2,2153,Polygon,3,3,1,1,359,0,0,0,0,
3,1961,Polygon,4,4,1,1,478,0,0,0,0,
4,1960,Polygon,5,5,1,1,478,0,0,0,0,


In [144]:
my_zones = list(top_school_df.index)
all_zones = list(arc_df['GTA06'])

for i in range(len(my_zones)):
    arc_df.at[all_zones.index(my_zones[i]), 'PredSchool'] = list(top_school_df['num'])[i]   

In [145]:
arc_df.head()

Unnamed: 0,FID,Shape,NUM,GTA06,PD,REGION,GTA01,AREA_M,AREA_H,UTMX_CENT,UTMY_CENT,PredSchool
0,2151,Polygon,1,1,1,1,359,0,0,0,0,1.0
1,1826,Polygon,2,2,1,1,360,0,0,0,0,
2,2153,Polygon,3,3,1,1,359,0,0,0,0,
3,1961,Polygon,4,4,1,1,478,0,0,0,0,
4,1960,Polygon,5,5,1,1,478,0,0,0,0,


In [149]:
df['Pred_School'].value_counts()

SG    9504
YK    3516
SC    1263
MI      74
Name: Pred_School, dtype: int64

In [146]:
#arc_df.to_csv('PredSchool_col.csv', index = False)