In [1]:
import pandas as pd

# Read the campus lookup table and the two survey extracts, keeping only the
# household and respondent fields that later cells use.
campus_info = pd.read_csv('../Data/SMTO_2015/Campus_Info.csv')

household_cols = ['HmTTS2006', 'hhlivingsituation', 'hhcarnumber',
                  'hhnumyoungerthan18', 'hhnumolderorequalto18', 'hhincomelevel']
respondent_cols = ['pscampusattend', 'personstatusgrad', 'personstatustime',
                   'pscmpgender', 'psdrivinglicenseownerflag', 'psworknumhoursperweek']

households = pd.read_csv('../Data/SMTO_2015/SMTO_2015_Households.csv')[household_cols]
respondents = pd.read_csv('../Data/SMTO_2015/SMTO_2015_Respondents.csv')[respondent_cols]

# join() aligns on the row index, so row k of the respondents file is attached
# to row k of the households file.
df = households.join(respondents)
df.head()

Unnamed: 0,HmTTS2006,hhlivingsituation,hhcarnumber,hhnumyoungerthan18,hhnumolderorequalto18,hhincomelevel,pscampusattend,personstatusgrad,personstatustime,pscmpgender,psdrivinglicenseownerflag,psworknumhoursperweek
0,261.0,Live with family/parents,1,3,4,Unknown,Scarborough (UTSC),UG,FT,Female,0,
1,71.0,Live with partner,0,0,2,"$ 90,000 - 119,999",Downtown Toronto (St. George),Grad,FT,Female,1,
2,3714.0,Live with family/parents,1,0,4,Unknown,Downtown Toronto (St. George),UG,FT,Female,1,
3,74.0,Live with roommates,0,0,4,Unknown,Downtown Toronto (St. George),UG,FT,Male,1,
4,71.0,Live with partner,0,0,2,"$ 30,000 - 59,999",Downtown Toronto (St. George),Grad,FT,Male,1,


In [2]:
# Short, readable names for the raw survey fields.
# rename() skips keys that are not present in the frame, so the
# 'psuniversityinvolvednumyears' entry only matters if that column is loaded.
column_names = {
    'HmTTS2006': 'HomeZone',
    'pscampusattend': 'Campus',
    'hhlivingsituation': 'Family',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'psuniversityinvolvednumyears': 'Years',
    'hhcarnumber': 'Cars',
    'hhincomelevel': 'Income',
    'pscmpgender': 'Gender',
    'psdrivinglicenseownerflag': 'Licence',
    'psworknumhoursperweek': 'Work',
    'hhnumyoungerthan18': 'Children',
    'hhnumolderorequalto18': 'Adults',
}
df = df.rename(columns=column_names)
df.columns

Index(['HomeZone', 'Family', 'Cars', 'Children', 'Adults', 'Income', 'Campus',
       'Level', 'Status', 'Gender', 'Licence', 'Work'],
      dtype='object')

In [3]:
# Clean and reformat
# Keep only respondents with both a home zone and a campus. The explicit
# copy() makes df an independent frame, so the column assignments below do not
# write into a filtered slice (which raises SettingWithCopyWarning).
has_zone_and_campus = df['HomeZone'].notnull() & df['Campus'].notnull()
df = df[has_zone_and_campus].copy()

# Zone numbers are read as floats (e.g. 261.0); with the missing values gone
# they can be downcast to the smallest signed integer type that fits.
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')

# Binary indicator: 1 when the respondent lives with family/parents, else 0.
# NOTE(review): re-running this cell without reloading df would compare the
# already-converted 0/1 values against the string and zero the column.
df['Family'] = (df['Family'] == 'Live with family/parents') * 1
df.columns

Index(['HomeZone', 'Family', 'Cars', 'Children', 'Adults', 'Income', 'Campus',
       'Level', 'Status', 'Gender', 'Licence', 'Work'],
      dtype='object')

In [4]:
# Dataframe with walk distances
df_path = pd.read_csv('../../LoS/Walk_Distances.csv')

# find_distance() turns positions in `origins` into offsets in the flat `dists`
# list, so `origins` has to follow the order in which zones appear in the file.
# Series.unique() keeps first-appearance order, whereas set() gives no ordering
# guarantee at all.
# NOTE(review): assumes the file stores a full origin-by-destination matrix with
# destinations listed in the same zone order inside every origin block - confirm.
origins = df_path['Origin'].unique().tolist()
dists = list(df_path['Data'])

# Function for distance lookup
def find_distance(origin, destination, zones_per_origin=2392):
    """Look up the walk distance between two zones, divided by 1000.

    Reads the module-level ``origins`` list and the flat ``dists`` list. The
    entry for an (origin, destination) pair is expected at offset
    ``i * zones_per_origin + j``, where ``i`` and ``j`` are the positions of
    the two zones in ``origins``.

    Parameters
    ----------
    origin : zone identifier of the trip start.
    destination : zone identifier of the trip end.
    zones_per_origin : int, default 2392
        Number of consecutive ``dists`` entries belonging to one origin.

    Returns
    -------
    ``dists[i * zones_per_origin + j] / 1000``, or ``-1`` when ``origin`` is
    not in ``origins``.
    (Presumably this converts metres to kilometres - confirm the unit of the
    'Data' column.)

    Raises
    ------
    ValueError
        If ``origin`` is known but ``destination`` is not in ``origins``.
    """
    try:
        i = origins.index(origin)
    except ValueError:
        # Unknown origin: callers drop these rows by testing for the -1 sentinel.
        return -1
    try:
        j = origins.index(destination)
    except ValueError:
        # Same exception type as before, but naming the zone that is missing.
        raise ValueError(
            "destination zone %r is not in the walk-distance table" % (destination,))
    return dists[i * zones_per_origin + j] / 1000

# Campus TTS zones and short campus codes (campus table from Joven's MOE data)
campus_zones = campus_info['Zone'].tolist()
school_codes = campus_info['Code'].tolist()

# One "Dist.<code>" column per campus: walk distance from each home zone
for code, zone in zip(school_codes, campus_zones):
    df["Dist." + code] = df['HomeZone'].apply(
        lambda home, campus_zone=zone: find_distance(home, campus_zone))

# find_distance() returns -1 whenever the home zone is missing from the
# walk-distance table, whatever the destination, so checking a single
# distance column is enough to drop those rows.
df = df.loc[df['Dist.SG'] != -1]
df.columns

Index(['HomeZone', 'Family', 'Cars', 'Children', 'Adults', 'Income', 'Campus',
       'Level', 'Status', 'Gender', 'Licence', 'Work', 'Dist.SG', 'Dist.SC',
       'Dist.MI', 'Dist.YK', 'Dist.YG', 'Dist.RY', 'Dist.OC'],
      dtype='object')

In [5]:
# Rank the campuses for every respondent by walk distance. nlargest() orders
# them from the largest distance down to the smallest.
dist_block = df.iloc[:, 12:19]  # the seven Dist.* columns
n_ranks = dist_block.shape[1]

# Unique string column names. The previous version appended two sets of
# columns both named 0..6, which left duplicate labels in df and mixed integer
# and string feature names after get_dummies(); recent scikit-learn releases
# reject frames whose column names are not all strings.
label_cols = ['RankLabel.' + str(k + 1) for k in range(n_ranks)]
value_cols = ['RankDist.' + str(k + 1) for k in range(n_ranks)]

# Rank each row once and reuse the result for both the labels and the values
ranked = [row.nlargest(n_ranks) for _, row in dist_block.iterrows()]
df_labels = pd.DataFrame([r.index.tolist() for r in ranked],
                         index=df.index, columns=label_cols)
df_values = pd.DataFrame([r.tolist() for r in ranked],
                         index=df.index, columns=value_cols)

# Drop any copies left by an earlier run so re-executing the cell does not
# keep stacking columns; on a fresh run this is a no-op. The ranked labels end
# up at positions 19-25 and the ranked distances at 26-32, as before.
df = df.drop(columns=label_cols + value_cols, errors='ignore')
df = pd.concat((df, df_labels, df_values), axis=1)
print(df.columns)
df.head()

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Shared estimator: get_results() refits this same forest on every split
rf = RandomForestClassifier(n_estimators=100)

# Prediction target: the campus column
y = df.loc[:, 'Campus']

# Feature sets tried earlier (kept for reference):
#x = df.drop(columns = ['HomeZone', 'Campus', 'Work'])
#x = df[['Dist.SG', 'Dist.SC', 'Dist.MI', 'Dist.YK', 'Dist.YG', 'Dist.RY', 'Dist.OC']]
#x = df['Dist.SG'].notnull()

# One row per experiment: [labels used, distances included, accuracy, average probability]
res = list()

def average(l):
    """Return the arithmetic mean of the numbers in ``l``.

    An empty ``l`` raises ZeroDivisionError.
    """
    total = sum(l)
    return total / len(l)
    
def _fit_and_score(x, random_state=None):
    """Fit ``rf`` on one random 70/30 split of ``x``/``y`` and score the held-out part.

    Returns a tuple ``(accuracy, ave_prob)``, where ``ave_prob`` is the mean
    probability the forest assigned to each test row's actual campus.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=random_state)
    rf.fit(X_train, y_train)
    probs = rf.predict_proba(X_test)
    # Column position of every class in the predict_proba output
    class_pos = {label: k for k, label in enumerate(rf.classes_)}
    # A campus that never appeared in the training split has no probability
    # column, i.e. the model effectively assigned it probability 0.
    true_probs = [row[class_pos[label]] if label in class_pos else 0.0
                  for row, label in zip(probs, y_test)]
    return rf.score(X_test, y_test), pd.Series(true_probs).mean()


def get_results(x, print_vars = False, n_runs=5, random_state=None):
    """Evaluate the shared random forest ``rf`` on feature frame ``x``.

    Uses the module-level target ``y`` and refits the module-level ``rf``.

    Parameters
    ----------
    x : DataFrame of features, row-aligned with ``y``.
    print_vars : bool, default False
        When True, run a single split, print the feature importances, the
        accuracy and the average true-class probability, and return None.
    n_runs : int, default 5
        Number of independent train/test splits to average over.
    random_state : int or None, default None
        None keeps the splits unseeded (the previous behaviour). With an
        integer, run ``k`` uses ``random_state + k`` as its split seed so the
        splits are reproducible but still differ from run to run. The forest
        itself is seeded only through ``rf``'s own ``random_state``.

    Returns
    -------
    ``[mean accuracy, mean true-class probability]`` over ``n_runs`` splits,
    or None when ``print_vars`` is True.
    """
    if print_vars:
        accuracy, ave_prob = _fit_and_score(x, random_state)
        print(pd.Series(rf.feature_importances_, index = x.columns).sort_values(ascending=False))
        print("Accuracy", accuracy)
        print("AveProb", ave_prob)
        return
    accuracies = []
    ave_probs = []
    for run in range(n_runs):
        seed = None if random_state is None else random_state + run
        accuracy, ave_prob = _fit_and_score(x, seed)
        accuracies.append(accuracy)
        ave_probs.append(ave_prob)
    return [average(accuracies), average(ave_probs)]

# Ranked campus labels start at column position 19 and the matching ranked
# distances at position 26; grow the feature set one rank at a time.
for n_labels in range(1, 8):
    # Labels only
    label_features = pd.get_dummies(df.iloc[:, 19:19 + n_labels])
    res.append([n_labels, False] + get_results(label_features))

    # Labels and distances
    label_positions = list(range(19, 19 + n_labels))
    dist_positions = list(range(26, 26 + n_labels))
    combined_features = pd.get_dummies(df.iloc[:, label_positions + dist_positions])
    res.append([n_labels, True] + get_results(combined_features))

# Collect every experiment into one summary table
summary_columns = ['Labels', 'Distances', 'Accuracy', 'AveProb']
output = pd.DataFrame(res, columns=summary_columns)
output

Unnamed: 0,Labels,Distances,Accuracy,AveProb
0,1,False,0.401658,0.269362
1,1,True,0.457392,0.381903
2,2,False,0.40914,0.299063
3,2,True,0.454928,0.382284
4,3,False,0.453763,0.34039
5,3,True,0.452375,0.380327
6,4,False,0.463934,0.345244
7,4,True,0.463306,0.385233
8,5,False,0.458423,0.348057
9,5,True,0.453315,0.380075


In [48]:
# Write the summary table to a CSV file without the row index
results_path = 'Location_Choice_Reformat_Output.csv'
output.to_csv(results_path, index=False)

In [None]:
"""
import matplotlib.pyplot as plt
plt.figure(figsize = (12, 12))
plt.tight_layout()
plt.subplot(1, 2, 1)
pd.Series(y_pred).value_counts().plot.pie()
plt.title("Predicted Campuses")
plt.subplot(1, 2, 2)
pd.Series(y_test).value_counts().plot.pie()
plt.title("Actual Campuses")
plt.show()
"""

In [None]:
# import pickle
# pickle.dump(rf, open('Location_Choice_Model', 'wb'))

# To load the saved model again (only unpickle files you trust):
# model = pickle.load(open(filename, 'rb'))