In [1]:
import pandas as pd
import math
import csv

# Raw SMTO 2015 survey tables and the campus lookup table
hh_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('Campus_Info.csv')

# Survey column name -> working column name, grouped by the table it comes from
respondent_cols = {
    'pscampusattend': 'Campus',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'psuniversityinvolvednumyears': 'Years',
    'psexpfactor': 'Exp_SMTO',
}
household_cols = {
    'HmTTS2006': 'HomeZone',
    'hhlivingsituation': 'Family',
}

# Select the needed columns, attach the household fields, rename, and discard
# any row with a missing value.
# NOTE(review): join() aligns on the DataFrame index, so this presumably relies
# on the two CSV files listing records in the same row order - confirm.
df = (
    ps_df[list(respondent_cols)]
    .join(hh_df[list(household_cols)])
    .rename(columns={**respondent_cols, **household_cols})
    .dropna()
)


def _campus_code(school_name):
    """Return the Code of the first campus_info row whose School equals `school_name`."""
    matches = campus_info.loc[campus_info['School'] == school_name, 'Code']
    return matches.tolist()[0]


# Replace campus names with their codes from campus_info
df['Campus'] = df['Campus'].apply(_campus_code)

# Numeric encodings: integer home zone, and a 0/1 flag for living with family
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Family'] = df['Family'].eq('Live with family/parents') * 1

In [2]:
def row_to_segment(x):
    """Map one respondent row to an integer segment code.

    Reads ``x.Level``, ``x.Status`` and ``x.Family`` (a truthy/falsy flag).

    Returns:
        0            if Level is 'Other'
        1 / 2 / 3    for Level 'UG':  Family truthy / Family falsy / Status 'PT'
        4 / 5 / 6    for any other Level, with the same sub-ordering
    """
    if x.Level == 'Other':
        return 0

    # First code of the level's range, and the code used for Status 'PT'
    first_code, part_time_code = (1, 3) if x.Level == 'UG' else (4, 6)
    if x.Status == 'PT':
        return part_time_code

    # Remaining rows: a falsy Family flag moves to the next code
    return first_code + (0 if x.Family else 1)

# Add the per-respondent segment code as a new 'Segment' column
df = df.assign(Segment=df.apply(row_to_segment, axis=1))

In [3]:
# Dataframe with walk distances
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')

# Zones in order of first appearance in the file. A set() must not be used
# here: its iteration order is arbitrary, while the positions in this list are
# used as row/column offsets into the flat `dists` list below.
# NOTE(review): assumes the file is origin-major, with destinations listed in
# the same zone order within every origin - confirm against the LoS export.
origins = list(df_path['Origin'].unique())
dists = list(df_path['Data'])

# Matrix dimension derived from the data instead of a hard-coded zone count
n_zones = len(origins)
if len(dists) != n_zones * n_zones:
    raise ValueError(
        "Walk_Distances.csv is not a square origin x destination matrix: "
        "%d zones but %d rows" % (n_zones, len(dists))
    )

# Zone -> position, so each lookup is O(1) instead of a linear list scan
origin_index = {zone: position for position, zone in enumerate(origins)}

# Function for distance lookup
not_found = set()
def find_distance(origin, destination):
    """Look up the walk distance between two zones.

    Args:
        origin: zone identifier of the trip start.
        destination: zone identifier of the trip end.

    Returns:
        The 'Data' value for the zone pair divided by 1000 (presumably
        metres -> kilometres; confirm units), or 0 when either zone does not
        appear in the distance file. Unknown zones are recorded in the
        module-level ``not_found`` set; the origin is checked first.
    """
    i = origin_index.get(origin)
    if i is None:
        not_found.add(origin)
        return 0
    j = origin_index.get(destination)
    if j is None:
        not_found.add(destination)
        return 0
    return dists[i * n_zones + j] / 1000

# Campus TTS zones and the matching school codes, taken row by row from campus_info
campus_zones = list(campus_info['Zone'])
school_codes = list(campus_info['Code'])

# One "Dist.<code>" column per campus: distance from each home zone to that campus zone
for code, zone in zip(school_codes, campus_zones):
    df["Dist." + code] = df['HomeZone'].apply(find_distance, args=(zone,))

# Report how many distinct zones were missing from the distance data
print("# of zones not found:", len(not_found))

# of zones not found: 127


In [4]:
# Enrollment table, indexed by school
enrollment_df = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv').set_index('School')


def get_log_enrollment(level, school):
    """Return log(1 + enrollment) for `school`, reading column `level` ('UG', 'Grad' or 'Total')."""
    headcount = enrollment_df.loc[school, level]
    return math.log1p(headcount)


# One constant column "<level>.<code>" per enrollment level and campus
for level in ('Total', 'UG', 'Grad'):
    for code in school_codes:
        df[level + "." + code] = get_log_enrollment(level, code)

In [5]:
# Keep only rows with a positive distance to at least one of the SG / SC campuses
has_positive_distance = df[['Dist.SG', 'Dist.SC']].gt(0).any(axis=1)
df = df[has_positive_distance]

In [6]:
def _campus_expansion_factors(campuses, enrollment_level):
    """Compute one expansion factor per campus.

    Args:
        campuses: Series of campus codes, one entry per sampled respondent.
        enrollment_level: enrollment_df column to expand to
            ('Total', 'UG' or 'Grad').

    Returns:
        dict mapping campus code -> (campus share of enrollment in that
        column) / (campus share of the rows in `campuses`).
    """
    enrolled = enrollment_df[enrollment_level]
    enrolled_sum = enrolled.sum()
    sample_shares = campuses.value_counts(normalize=True)
    # .items() rather than .iteritems(): the latter was removed in pandas 2.0
    return {
        campus: enrolled.loc[campus] / enrolled_sum / share
        for campus, share in sample_shares.items()
    }


# Factors against total enrollment, using every respondent
total_factors = _campus_expansion_factors(df['Campus'], 'Total')
df['Exp_Total'] = df['Campus'].map(total_factors)

# Level-specific factors. The enrollment column is named explicitly for each
# group instead of reading a loop variable left over from an earlier cell,
# which made both groups use the same column.
ug_factors = _campus_expansion_factors(df.loc[df['Level'] == 'UG', 'Campus'], 'UG')
grad_factors = _campus_expansion_factors(df.loc[df['Level'] == 'Grad', 'Campus'], 'Grad')


def row_to_factor(row):
    """Return the expansion factor matching the row's Level.

    'UG' and 'Grad' rows use their level-specific factor; any other Level
    falls back to the total-enrollment factor for the row's campus.
    """
    if row['Level'] == 'UG':
        return ug_factors[row['Campus']]
    elif row['Level'] == 'Grad':
        return grad_factors[row['Campus']]
    else:
        return total_factors[row['Campus']]


df['Exp_Level'] = df.apply(row_to_factor, axis=1)

In [7]:
# Remove the intermediate attribute columns, then write the remaining columns to CSV
intermediate_cols = ['Family', 'Level', 'Status', 'Years', 'HomeZone']
df = df.drop(intermediate_cols, axis=1)
df.to_csv('Segment_Exp_Input.csv', index=False)