In [1]:
import pandas as pd
import math
import csv

# Load the SMTO 2015 household and respondent tables and the campus lookup table.
hh_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('Campus_Info.csv')

# Load relevant columns
df = ps_df[['pscampusattend', 'personstatusgrad', 'personstatustime', 'psuniversityinvolvednumyears']]
# NOTE(review): join() aligns on the row index, so this assumes row k of the
# respondents file and row k of the households file describe the same record -- confirm.
df = df.join(hh_df[['HmTTS2006', 'hhlivingsituation']])
df = df.rename(columns={
    'HmTTS2006': 'HomeZone',
    'pscampusattend': 'Campus',
    'hhlivingsituation': 'Family',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'psuniversityinvolvednumyears': 'Years',
})
df = df.dropna() # Remove rows with missing data

# Convert Campus column from school name to campus code.
# Build the School -> Code lookup once (first match wins when a school is listed
# more than once) instead of re-scanning campus_info for every row.
school_to_code = campus_info.drop_duplicates(subset='School').set_index('School')['Code']
unknown_campuses = df.loc[~df['Campus'].isin(school_to_code.index), 'Campus'].unique().tolist()
if unknown_campuses:
    # Fail loudly with the offending names rather than an opaque IndexError.
    raise ValueError("Campus names missing from Campus_Info.csv: " + repr(unknown_campuses))
df['Campus'] = df['Campus'].map(school_to_code)

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
# Family becomes 1 for 'Live with family/parents' and 0 for every other answer.
df['Family'] = (df['Family'] == 'Live with family/parents')*1

In [2]:
def row_to_segment(x):
    """Map one respondent row to an integer segment id in 0..6.

    Reads the ``Level``, ``Status`` and ``Family`` attributes of ``x``.

    Segment ids:
        0 -- Level is 'Other'
        1 -- Level 'UG', Status not 'PT', Family truthy
        2 -- Level 'UG', Status not 'PT', Family falsy
        3 -- Level 'UG', Status 'PT'
        4 -- any other Level, Status not 'PT', Family truthy
        5 -- any other Level, Status not 'PT', Family falsy
        6 -- any other Level, Status 'PT'
    """
    if x.Level == 'Other':
        return 0

    # First id of the three-segment group this Level belongs to.
    group_start = 1 if x.Level == 'UG' else 4

    if x.Status == 'PT':
        return group_start + 2

    # A falsy Family shifts the id up by one within the group.
    return group_start + (not x.Family)

# Compute the segment id row by row (axis=1 passes each row to row_to_segment).
df['Segment'] = df.apply(row_to_segment, axis=1)

In [3]:
# Dataframe with walk distances
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
# Unique zone ids in order of first appearance in the file. find_distance uses a
# zone's position in this list as its row/column index into `dists`, so the order
# must be deterministic and follow the file; list(set(...)) gives no such guarantee.
# NOTE(review): this presumes the file is laid out origin by origin, with
# destinations in the same zone order -- confirm against Walk_Distances.csv.
origins = list(dict.fromkeys(df_path['Origin']))
# Flat list of the 'Data' column, in file order.
dists = list(df_path['Data'])

# Function for distance lookup
not_found = set()  # zone ids that were requested but are absent from `origins`
def find_distance(origin, destination, n_zones=2392):
    """Look up the walk distance between two zones.

    Both zone ids are located in the module-level ``origins`` list; their
    positions are used as row and column of the flattened matrix ``dists``.

    Parameters
    ----------
    origin, destination
        Zone ids to look up in ``origins``.
    n_zones : int, default 2392
        Row width of the flattened matrix, i.e. the number of entries stored
        per origin. The default keeps the previously hard-coded value.

    Returns
    -------
    The matching ``dists`` entry divided by 1000 (presumably metres to
    kilometres -- confirm units), or 0 when either zone is unknown. In that
    case the first unknown zone is added to ``not_found`` and the other zone
    is not checked.
    """
    positions = []
    for zone in (origin, destination):
        try:
            positions.append(origins.index(zone))
        except ValueError:
            # 0 doubles as the "no distance available" marker for callers.
            not_found.add(zone)
            return 0
    row, col = positions
    return dists[row * n_zones + col] / 1000

# Campus TTS zones (noted as coming from Joven's MOE data) and the matching school codes
campus_zones = campus_info['Zone'].tolist()
school_codes = campus_info['Code'].tolist()

# Add one "Dist.<code>" column per campus: distance from each home zone to that campus zone
for code, zone in zip(school_codes, campus_zones):
    df["Dist." + code] = df['HomeZone'].apply(lambda home: find_distance(home, zone))
print("# of zones not found:", len(not_found))

# of zones not found: 127


In [4]:
# Load enrollment data, indexed by the 'School' column
enrollment_df = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv')
enrollment_df = enrollment_df.set_index('School')

def get_log_enrollment(level, school):
    """Return log(1 + enrollment) for one school.

    ``level`` names the enrollment column to read (UG, Grad or Total) and
    ``school`` is the row label in ``enrollment_df``.
    """
    school_row = enrollment_df.loc[school]
    return math.log1p(school_row[level])
    
# Add one constant "<level>.<code>" column per enrollment level and school code
for level in ('Total', 'UG', 'Grad'):
    for code in school_codes:
        column_name = level + "." + code
        df[column_name] = get_log_enrollment(level, code)

In [5]:
# Keep only rows with a positive distance to SG or to SC.
# NOTE(review): find_distance returns 0 for zones it cannot find, so this looks
# like the filter for unmatched home zones -- confirm.
df = df[(df['Dist.SG'] > 0) | (df['Dist.SC'] > 0)]
# Drop the raw attributes; Campus, Segment and the derived columns remain.
df = df.drop(columns=['Family', 'Level', 'Status', 'Years', 'HomeZone'])
# For every segment id 0..6, add a constant "S<i>.<campus>" column holding the
# number of rows in that segment attending that campus.
# NOTE(review): value_counts lists only the campuses it observes, so a campus
# with no rows in segment i gets no column -- confirm downstream code copes.
for i in range(7):
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for campus, count in df[df['Segment'] == i]['Campus'].value_counts().items():
        df["S" + str(i) + "." + campus] = count

In [6]:
# Display the assembled table (pandas truncates long/wide frames in the output).
df

Unnamed: 0,Campus,Segment,Dist.SG,Dist.SC,Dist.MI,Dist.YK,Dist.YG,Dist.RY,Dist.OC,Total.SG,...,S5.SC,S5.OC,S5.YG,S6.SG,S6.YK,S6.RY,S6.OC,S6.YG,S6.SC,S6.MI
0,SC,1,10.256060,14.88098,29.20657,22.59214,9.218413,9.580635,11.241730,10.895461,...,41,28,11,206,83,33,13,3,1,1
1,SG,5,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838,10.895461,...,41,28,11,206,83,33,13,3,1,1
2,SG,1,23.319230,45.63271,4.51742,28.58045,32.555200,24.964000,23.686150,10.895461,...,41,28,11,206,83,33,13,3,1,1
3,SG,2,0.699414,24.11954,19.43932,16.81186,12.830410,2.314008,1.541276,10.895461,...,41,28,11,206,83,33,13,3,1,1
4,SG,5,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838,10.895461,...,41,28,11,206,83,33,13,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15221,YK,2,7.120260,19.01731,23.76046,14.19530,5.732956,6.553092,8.193741,10.895461,...,41,28,11,206,83,33,13,3,1,1
15222,YK,1,15.917590,12.03644,32.96591,17.68772,6.019180,15.242170,16.903260,10.895461,...,41,28,11,206,83,33,13,3,1,1
15223,YK,2,2.783940,25.09743,17.84462,15.53600,12.829600,4.733398,3.979057,10.895461,...,41,28,11,206,83,33,13,3,1,1
15224,YK,1,23.379880,26.15476,37.35434,13.30458,15.379040,23.250580,24.518920,10.895461,...,41,28,11,206,83,33,13,3,1,1


In [7]:
# Write the table to CSV; the row index is written as the first column by default.
df.to_csv('R_Segmented_Input.csv')