In [1]:
# Load packages
import pandas as pd
import math
import csv
import numpy as np

In [2]:
# Read the complete SMTO 2015 input table into `df` and preview its first rows.
# NOTE(review): the rendered preview contains an 'Unnamed: 0' column, which looks
# like a row index that was written into the CSV -- confirm whether it is needed.
df = pd.read_csv(filepath_or_buffer='../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df.head(5)

Unnamed: 0,Campus,Level,Status,Mode_Actual,Gender,Licence,Work,Age,HomeZone,Family,...,Domestic.OC,Admission_Avg.SG,Admission_Avg.SC,Admission_Avg.MI,Admission_Avg.YK,Admission_Avg.YG,Admission_Avg.RY,Admission_Avg.OC,Exp_Segment,Exp_Level
0,Scarborough (UTSC),UG,FT,Transit Bus,Female,0,Unknown,20,261,1,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.944738,0.944738
1,Downtown Toronto (St. George),Grad,FT,Walk,Female,1,Unknown,25,71,0,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.986085,0.986085
2,Downtown Toronto (St. George),UG,FT,Transit Bus,Female,1,Unknown,23,3714,1,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.91927,0.91927
3,Downtown Toronto (St. George),UG,FT,Walk,Male,1,Unknown,20,74,0,...,0.8998,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.91927,0.91927
4,Downtown Toronto (St. George),Grad,FT,Walk,Male,1,Unknown,27,71,0,...,0.6786,0.893,0.841,0.83,0.817,0.817,0.84,0.824,0.986085,0.986085


In [3]:
# Mapping frame: one row per zone listed in Needed_Zones.csv.
# The zone id serves as the index AND stays available as the 'Origin' column
# (drop=False), so later cells can both align on it and read it as data.
map_df = (
    pd.read_csv('../Needed_Zones.csv')[['Origin']]
    .set_index('Origin', drop=False)
)

In [4]:
# Most frequent school code among the sampled respondents of each home zone.
# Code adapted from https://intellipaat.com/community/20470/groupby-pandas-dataframe-and-select-most-common-value
# The first label of value_counts() is taken, i.e. the highest count; how ties
# are ordered is left to value_counts(). Assignment aligns on the zone index of
# map_df, so zones with no sampled respondent are left missing.
schools_by_zone = df.groupby('HomeZone')['School_Codes']
map_df['Sample_TopSchool'] = schools_by_zone.agg(
    lambda zone_codes: zone_codes.value_counts().index[0]
)

In [5]:
# School in-sample proportions
def get_school_prop(zone, school, grouped):
    """Look up the share of `school` within `zone`.

    Parameters
    ----------
    zone : hashable
        Key of the zone to look up in `grouped`.
    school : hashable
        Key of the school to look up inside that zone's entry.
    grouped : mapping-like
        Two-level lookup supporting `in` and `[]`, e.g. a dict of dicts or a
        Series whose index is (zone, school).

    Returns
    -------
    The stored value for (zone, school); 0 when the zone is present but has no
    entry for the school; None when the zone itself is absent.
    """
    # Unknown zone: signal "no data" rather than a zero share.
    if zone not in grouped:
        return None
    # Known zone: a missing school means nobody from the zone attends it.
    return grouped[zone][school] if school in grouped[zone] else 0

# For the whole sample and for the Family == 1 / Family == 0 subsets, add one
# column per school code holding the zone-level share of that school divided by
# the school's share in the same subset (so 1.0 means "as common as overall").
codes = df['School_Codes'].unique()
subsets = {
    'Sample': df,
    'Family': df[df['Family'] == 1],
    'NonFam': df[df['Family'] == 0],
}
for name, temp_df in subsets.items():
    # Subset-wide share of each school (the normalising denominator).
    counts = temp_df['School_Codes'].value_counts(normalize=True)
    # Share of each school within each home zone of the subset.
    grouped = temp_df.groupby('HomeZone')['School_Codes'].value_counts(normalize=True)
    for code in codes:
        zone_share = map_df['Origin'].apply(
            lambda zone: get_school_prop(zone, code, grouped)
        )
        map_df[code + '_' + name + '_Prob'] = zone_share / counts[code]

map_df.columns

Index(['Origin', 'Sample_TopSchool', 'SC_Sample_Prob', 'SG_Sample_Prob',
       'MI_Sample_Prob', 'OC_Sample_Prob', 'RY_Sample_Prob', 'YK_Sample_Prob',
       'YG_Sample_Prob', 'SC_Family_Prob', 'SG_Family_Prob', 'MI_Family_Prob',
       'OC_Family_Prob', 'RY_Family_Prob', 'YK_Family_Prob', 'YG_Family_Prob',
       'SC_NonFam_Prob', 'SG_NonFam_Prob', 'MI_NonFam_Prob', 'OC_NonFam_Prob',
       'RY_NonFam_Prob', 'YK_NonFam_Prob', 'YG_NonFam_Prob'],
      dtype='object')

In [6]:
# Share of the whole sample enrolled at each school; the prediction cell below
# divides model probabilities by these values.
school_codes = df['School_Codes']
sample_props = school_codes.value_counts(normalize=True)
sample_props

SG    0.405321
YK    0.211937
RY    0.195157
SC    0.073602
MI    0.061050
OC    0.031105
YG    0.021828
Name: School_Codes, dtype: float64

In [7]:
# Attach each model's output to the map frame: the predicted (highest-probability)
# school per zone plus every school's probability divided by its sample share.
PRED_DIR = '../Map Projects/SMTO_2015_Maps - Files/'
model_outputs = (
    ('mlogit_prediction.csv', 'Grav'),
    ('RF_Probabilities_HZ.csv', 'HZ'),
    ('RF_Probabilities_dist.csv', 'Dist'),
)
for file, name in model_outputs:
    file = PRED_DIR + file
    preds = pd.read_csv(file)
    # NOTE(review): positional rows 2272..2391 are discarded from every file;
    # presumably these are zones absent from Needed_Zones.csv -- confirm, and
    # consider selecting by zone id instead of by hard-coded position.
    preds = preds.drop(preds.index[2272:2392])
    # The remaining rows take the zone ids of map_df in order, i.e. the match is
    # by row position, so both frames must list zones in the same order.
    preds = preds.set_index(map_df['Origin'])
    # Some files carry their own 'Origin' column; it must not count as a school.
    preds = preds.drop(columns='Origin', errors='ignore')
    map_df[name + '_Pred'] = preds.idxmax(axis=1)
    for col in preds.columns:
        map_df[col + '_Norm_' + name] = preds[col] / sample_props[col]

map_df.columns

Index(['Origin', 'Sample_TopSchool', 'SC_Sample_Prob', 'SG_Sample_Prob',
       'MI_Sample_Prob', 'OC_Sample_Prob', 'RY_Sample_Prob', 'YK_Sample_Prob',
       'YG_Sample_Prob', 'SC_Family_Prob', 'SG_Family_Prob', 'MI_Family_Prob',
       'OC_Family_Prob', 'RY_Family_Prob', 'YK_Family_Prob', 'YG_Family_Prob',
       'SC_NonFam_Prob', 'SG_NonFam_Prob', 'MI_NonFam_Prob', 'OC_NonFam_Prob',
       'RY_NonFam_Prob', 'YK_NonFam_Prob', 'YG_NonFam_Prob', 'Grav_Pred',
       'SG_Norm_Grav', 'MI_Norm_Grav', 'OC_Norm_Grav', 'RY_Norm_Grav',
       'SC_Norm_Grav', 'YG_Norm_Grav', 'YK_Norm_Grav', 'HZ_Pred', 'MI_Norm_HZ',
       'OC_Norm_HZ', 'RY_Norm_HZ', 'SC_Norm_HZ', 'SG_Norm_HZ', 'YG_Norm_HZ',
       'YK_Norm_HZ', 'Dist_Pred', 'MI_Norm_Dist', 'OC_Norm_Dist',
       'RY_Norm_Dist', 'SC_Norm_Dist', 'SG_Norm_Dist', 'YG_Norm_Dist',
       'YK_Norm_Dist'],
      dtype='object')

In [8]:
# Write the assembled map inputs; the index only repeats the 'Origin' column,
# so it is left out of the file.
map_df.to_csv(path_or_buf='../Map_Inputs.csv', index=False)