In [1]:
# Load packages
import pandas as pd
import math
import csv
import numpy as np

In [2]:
# Read the SMTO 2015 input CSV into `df` and preview its first rows
smto_path = '../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv'
df = pd.read_csv(smto_path)
df.head()

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC,Exp_Segment,Exp_Level,Time.Active,Time.Auto,Time.Transit,Closest_School
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,0.817,0.817,0.84,0.824,0.949782,0.949782,223.2147,17.47422,75.468478,YG
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,0.817,0.817,0.84,0.824,0.98952,0.98952,16.985265,2.924953,24.128386,SG
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,0.817,0.817,0.84,0.824,0.919307,0.919307,349.78845,50.17188,155.55117,MI
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,0.817,0.817,0.84,0.824,0.919307,0.919307,10.49121,1.049121,16.675728,SG
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,0.817,0.817,0.84,0.824,0.98952,0.98952,16.985265,2.924953,24.128386,SG


In [3]:
# Mapping frame with one row per zone listed in Needed_Zones.csv.
# 'Origin' becomes the index and is also kept as a regular column.
zones = pd.read_csv('../Needed_Zones.csv')
map_df = zones[['Origin']].set_index('Origin', drop=False)

In [4]:
# Code adapted from https://intellipaat.com/community/20470/groupby-pandas-dataframe-and-select-most-common-value
def _most_common(values):
    """Return the value that occurs most often in `values`."""
    return values.value_counts().index[0]

# Most frequent school among the rows of df for each home zone. The assignment
# aligns on map_df's zone index, so zones with no rows in df end up as NaN.
map_df['Sample_TopSchool'] = df.groupby('HomeZone')['School'].agg(_most_common)

In [5]:
# School in-sample proportions
def get_school_prop(zone, school, grouped):
    """Look up the share of `school` within `zone`.

    Parameters:
        zone: key identifying the zone in `grouped`.
        school: key identifying the school inside that zone's entry.
        grouped: two-level mapping (zone -> school -> share) that supports
            `in` and `[]`.

    Returns:
        The stored share when both keys are present, 0 when the zone is
        present but the school is not, and None when the zone itself is absent.
    """
    # Unknown zone: signal "no data" rather than a zero share.
    if zone not in grouped:
        return None
    zone_shares = grouped[zone]
    return zone_shares[school] if school in zone_shares else 0

codes = df['School'].unique()

# The three populations for which the zone-level shares are computed:
# every row, rows with Family == 1, and rows with Family == 0.
subsets = (
    (df, 'Sample'),
    (df[df['Family'] == 1], 'Family'),
    (df[df['Family'] == 0], 'NonFam'),
)

for temp_df, name in subsets:
    # Share of each school over the whole subset ...
    counts = temp_df['School'].value_counts(normalize = True)
    # ... and within each home zone of the subset.
    grouped = temp_df.groupby('HomeZone')['School'].value_counts(normalize=True)
    for code in codes:
        # Per-zone share of this school (NaN for zones absent from the subset),
        # expressed relative to the school's share over the whole subset.
        zone_share = map_df['Origin'].apply(lambda x: get_school_prop(x, code, grouped))
        map_df[code + '_' + name + '_Prob'] = zone_share / counts[code]

map_df.columns

Index(['Origin', 'Sample_TopSchool', 'SC_Sample_Prob', 'SG_Sample_Prob',
       'MI_Sample_Prob', 'OC_Sample_Prob', 'RY_Sample_Prob', 'YK_Sample_Prob',
       'YG_Sample_Prob', 'SC_Family_Prob', 'SG_Family_Prob', 'MI_Family_Prob',
       'OC_Family_Prob', 'RY_Family_Prob', 'YK_Family_Prob', 'YG_Family_Prob',
       'SC_NonFam_Prob', 'SG_NonFam_Prob', 'MI_NonFam_Prob', 'OC_NonFam_Prob',
       'RY_NonFam_Prob', 'YK_NonFam_Prob', 'YG_NonFam_Prob'],
      dtype='object')

In [6]:
# Share of each school over all rows of df (normalize = True yields fractions
# rather than counts); used further down to normalise the predicted probabilities.
sample_props = df['School'].value_counts(normalize = True)
sample_props

SG    0.404952
YK    0.211203
RY    0.194223
SC    0.073129
MI    0.063456
OC    0.030984
YG    0.022054
Name: School, dtype: float64

In [7]:
# For each prediction file, add to map_df the school with the highest predicted
# value per zone and every school's predicted value divided by its sample share.
pred_dir = '../Map Projects/SMTO_2015_Maps - Files/'
pred_sources = (
    ('mlogit_prediction.csv', 'Grav'),
    ('RF_Probabilities_HZ.csv', 'HZ'),
    ('RF_Probabilities_dist.csv', 'Dist'),
)

for fname, name in pred_sources:
    file = pred_dir + fname
    preds = pd.read_csv(file)
    # Remove the rows at positions 2272..2391 before relabelling.
    # NOTE(review): presumably these rows have no counterpart in map_df — confirm.
    preds = preds.drop(preds.index[2272:2392])
    # Passing a Series relabels the remaining rows positionally with the zone
    # numbers of map_df, so the row counts must match.
    preds = preds.set_index(map_df['Origin'])
    if 'Origin' in preds.columns:
        preds = preds.drop(columns='Origin')
    # Column label with the largest value in each row.
    map_df[name + '_Pred'] = preds.idxmax(axis=1)
    for col in preds.columns:
        map_df[col + '_Norm_' + name] = preds[col] / sample_props[col]

map_df.columns

Index(['Origin', 'Sample_TopSchool', 'SC_Sample_Prob', 'SG_Sample_Prob',
       'MI_Sample_Prob', 'OC_Sample_Prob', 'RY_Sample_Prob', 'YK_Sample_Prob',
       'YG_Sample_Prob', 'SC_Family_Prob', 'SG_Family_Prob', 'MI_Family_Prob',
       'OC_Family_Prob', 'RY_Family_Prob', 'YK_Family_Prob', 'YG_Family_Prob',
       'SC_NonFam_Prob', 'SG_NonFam_Prob', 'MI_NonFam_Prob', 'OC_NonFam_Prob',
       'RY_NonFam_Prob', 'YK_NonFam_Prob', 'YG_NonFam_Prob', 'Grav_Pred',
       'SG_Norm_Grav', 'MI_Norm_Grav', 'OC_Norm_Grav', 'RY_Norm_Grav',
       'SC_Norm_Grav', 'YG_Norm_Grav', 'YK_Norm_Grav', 'HZ_Pred', 'MI_Norm_HZ',
       'OC_Norm_HZ', 'RY_Norm_HZ', 'SC_Norm_HZ', 'SG_Norm_HZ', 'YG_Norm_HZ',
       'YK_Norm_HZ', 'Dist_Pred', 'MI_Norm_Dist', 'OC_Norm_Dist',
       'RY_Norm_Dist', 'SC_Norm_Dist', 'SG_Norm_Dist', 'YG_Norm_Dist',
       'YK_Norm_Dist'],
      dtype='object')

In [8]:
# Display the assembled mapping table for a visual check
map_df

Unnamed: 0_level_0,Origin,Sample_TopSchool,SC_Sample_Prob,SG_Sample_Prob,MI_Sample_Prob,OC_Sample_Prob,RY_Sample_Prob,YK_Sample_Prob,YG_Sample_Prob,SC_Family_Prob,...,YG_Norm_HZ,YK_Norm_HZ,Dist_Pred,MI_Norm_Dist,OC_Norm_Dist,RY_Norm_Dist,SC_Norm_Dist,SG_Norm_Dist,YG_Norm_Dist,YK_Norm_Dist
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,RY,4.558125,0.000000,0.000000,0.000000,3.432486,0.000000,0.000000,3.496017,...,0.000000,0.000000,RY,0.000000,0.024827,3.524268,3.960233,0.036725,0.344921,0.012545
2,2,,,,,,,,,,...,0.000000,0.000000,RY,0.153335,1.447731,1.949286,1.192623,0.752907,2.768427,0.538141
3,3,,,,,,,,,,...,0.000000,0.000000,RY,0.141970,1.967291,1.948093,0.494896,0.767306,1.715604,0.790350
4,4,,,,,,,,,,...,0.000000,0.000000,RY,0.041803,0.561950,2.433308,2.775931,0.618914,0.659849,0.185354
5,5,,,,,,,,,,...,0.000000,0.000000,SG,0.448040,1.262942,1.359179,0.635336,1.428505,0.065529,0.199205
6,6,,,,,,,,,,...,0.000000,0.000000,RY,0.112551,1.090345,2.083060,1.848332,0.766293,1.853152,0.322671
7,7,,,,,,,,,,...,0.000000,0.000000,RY,0.150537,1.456699,1.693877,1.717918,0.788480,2.187199,0.583143
8,8,,,,,,,,,,...,0.000000,0.000000,RY,0.198940,0.949402,1.851404,1.648777,0.784562,2.745026,0.471363
9,9,,,,,,,,,,...,0.000000,0.024676,SG,0.061427,0.558819,1.756864,1.976008,1.014281,0.971770,0.288320
10,10,SG,0.000000,2.469429,0.000000,0.000000,0.000000,0.000000,0.000000,,...,0.000000,0.024676,SG,0.037658,0.660139,0.365606,1.725008,1.893661,0.136031,0.048092


In [9]:
# Read the residuals table and index it by zone number; 'Zone' is also kept
# as a regular column.
residuals_path = '../../R_Logit_Models/Location_Choice/Proposed/Residuals_All_Zones.csv'
residuals = pd.read_csv(residuals_path).set_index('Zone', drop=False)
residuals.head()

Unnamed: 0_level_0,Zone,SG,MI,OC,RY,SC,YG,YK
Zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0.511415,0.028685,0.033681,-0.414492,-0.285557,0.018145,0.108124
10,10,-0.2212,0.003294,0.015571,0.170445,0.002252,0.003341,0.026297
14,14,0.45466,0.012651,0.055237,0.400845,-0.98517,0.012164,0.049613
15,15,-0.128943,0.017085,-0.063417,0.284203,-0.083101,0.009921,-0.035749
16,16,0.117226,0.007814,-0.064694,-0.114059,0.008111,0.007423,0.03818


In [10]:
# Put the mapping columns and the residual columns side by side, matching rows
# on the index (zone number in both frames). Index labels found in only one of
# the frames are kept, with NaN in the other frame's columns.
result = pd.concat([map_df, residuals], axis=1, sort=False)
result

Unnamed: 0,Origin,Sample_TopSchool,SC_Sample_Prob,SG_Sample_Prob,MI_Sample_Prob,OC_Sample_Prob,RY_Sample_Prob,YK_Sample_Prob,YG_Sample_Prob,SC_Family_Prob,...,YG_Norm_Dist,YK_Norm_Dist,Zone,SG,MI,OC,RY,SC,YG,YK
1,1.0,RY,4.558125,0.000000,0.000000,0.000000,3.432486,0.000000,0.000000,3.496017,...,0.344921,0.012545,1.0,0.511415,0.028685,0.033681,-0.414492,-0.285557,0.018145,0.108124
2,2.0,,,,,,,,,,...,2.768427,0.538141,,,,,,,,
3,3.0,,,,,,,,,,...,1.715604,0.790350,,,,,,,,
4,4.0,,,,,,,,,,...,0.659849,0.185354,,,,,,,,
5,5.0,,,,,,,,,,...,0.065529,0.199205,,,,,,,,
6,6.0,,,,,,,,,,...,1.853152,0.322671,,,,,,,,
7,7.0,,,,,,,,,,...,2.187199,0.583143,,,,,,,,
8,8.0,,,,,,,,,,...,2.745026,0.471363,,,,,,,,
9,9.0,,,,,,,,,,...,0.971770,0.288320,,,,,,,,
10,10.0,SG,0.000000,2.469429,0.000000,0.000000,0.000000,0.000000,0.000000,,...,0.136031,0.048092,10.0,-0.221200,0.003294,0.015571,0.170445,0.002252,0.003341,0.026297


In [11]:
# Keep only the rows whose index label (zone number) is at most 5253.
# NOTE(review): 5253 looks like the upper bound of the zone range to be mapped —
# confirm, and consider giving the constant a name.
result = result[result.index <= 5253]
result

Unnamed: 0,Origin,Sample_TopSchool,SC_Sample_Prob,SG_Sample_Prob,MI_Sample_Prob,OC_Sample_Prob,RY_Sample_Prob,YK_Sample_Prob,YG_Sample_Prob,SC_Family_Prob,...,YG_Norm_Dist,YK_Norm_Dist,Zone,SG,MI,OC,RY,SC,YG,YK
1,1.0,RY,4.558125,0.000000,0.000000,0.000000,3.432486,0.000000,0.000000,3.496017,...,0.344921,0.012545,1.0,0.511415,0.028685,0.033681,-0.414492,-0.285557,0.018145,0.108124
2,2.0,,,,,,,,,,...,2.768427,0.538141,,,,,,,,
3,3.0,,,,,,,,,,...,1.715604,0.790350,,,,,,,,
4,4.0,,,,,,,,,,...,0.659849,0.185354,,,,,,,,
5,5.0,,,,,,,,,,...,0.065529,0.199205,,,,,,,,
6,6.0,,,,,,,,,,...,1.853152,0.322671,,,,,,,,
7,7.0,,,,,,,,,,...,2.187199,0.583143,,,,,,,,
8,8.0,,,,,,,,,,...,2.745026,0.471363,,,,,,,,
9,9.0,,,,,,,,,,...,0.971770,0.288320,,,,,,,,
10,10.0,SG,0.000000,2.469429,0.000000,0.000000,0.000000,0.000000,0.000000,,...,0.136031,0.048092,10.0,-0.221200,0.003294,0.015571,0.170445,0.002252,0.003341,0.026297


In [12]:
# Export the map inputs. The index is not written (index = False); the zone
# numbers remain available through the 'Origin' and 'Zone' columns.
result.to_csv('../Map Projects/2015 SMTO Residuals Map - Files/Map_Inputs.csv', index = False)