In [30]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import model_selection

Person 
- FIRST: First class traveler (0 = no, 1 = yes)
- AGE: 1: age≤24, 2: 24<age≤39, 3: 39<age≤54, 4: 54<age≤65, 5: 65<age, 6: not known
- MALE: Traveler’s Gender 0: female, 1: male
- INCOME: Traveler’s income per year [thousand CHF] 
  - 0 or 1: under 50, 2: between 50 and 100, 3: over 100, 4: unknown
  
Trip context 
- PURPOSE: 1: Commuter, 2: Shopping, 3: Business, 4: Leisure, 5: Return from work, 6: Return from shopping, 7: Return from business,  8: Return from leisure, 9: other                          
- WHO: Who pays (0: unknown, 1: self, 2: employer, 3: half-half)
- LUGGAGE: 0: none, 1: one piece, 3: several pieces

Alternative attributes: 
- GA: Indicator for ownership of the Swiss annual season ticket (GA), which is valid on the rail system and most local public transport. It is 1 if the individual owns a GA, 0 otherwise. (In this notebook, SM_CO and TRAIN_CO are set to 0 when GA=1.)

- TRAIN_TT: Train travel time [minutes]. Travel times are door-to-door making assumptions about car-based distances
- TRAIN_CO: Train cost [CHF]. If the traveler has a GA, this cost equals the cost of the annual ticket.
- TRAIN_HE: Train headway [minutes]

- SM_TT: SM travel time [minutes] considering the future Swissmetro speed of 500 km/h
- SM_CO: SM cost [CHF] calculated at the current relevant rail fare, without considering GA, multiplied by a fixed factor (1.2) to reflect the higher speed.
- SM_HE: SM headway [minutes]
- SM_SEATS: Seats configuration in the Swissmetro (dummy). Airline seats (1) or not (0).

- CAR_TT: Car travel time [minutes]
- CAR_CO: Car cost [CHF]

Choice:
- CHOICE: Choice indicator. 0: unknown, 1: Train, 2: SM, 3: Car

Availability: 
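- TRAIN_AV: Train availability indicator (1 = available, 0 = not available)
- CAR_AV: Car availability indicator (1 = available, 0 = not available)
- SM_AV: Swissmetro availability indicator (1 = available, 0 = not available)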

In [31]:
def recode(data, var, from_value, to_value, num_levels):
    """Optionally merge one level of ``var`` into another, then one-hot encode it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing column ``var``. When ``from_value`` is given, the
        column is recoded in place on this object.
    var : str
        Name of the categorical column to encode.
    from_value, to_value
        Rows where ``data[var] == from_value`` are set to ``to_value`` before
        encoding. Pass ``from_value=None`` to skip the recoding step.
    num_levels : int
        Number of distinct levels expected in ``var`` after recoding.

    Returns
    -------
    pandas.DataFrame
        A new frame with indicator columns ``<var>_0`` ...
        ``<var>_{num_levels-1}`` appended. If ``<var>_0`` already exists, the
        (possibly recoded) input frame is returned unchanged, so calling the
        function twice does not duplicate columns.

    Raises
    ------
    ValueError
        If the number of levels found in ``var`` differs from ``num_levels``.
    """
    # Identity check: only a literal None disables the recoding step.
    if from_value is not None:
        data.loc[data[var] == from_value, var] = to_value
    dmy = pd.get_dummies(data[var])
    # Fail with an actionable message instead of pandas' "Length mismatch".
    if dmy.shape[1] != num_levels:
        raise ValueError(
            "recode: column {!r} has {} distinct levels {}, expected {}".format(
                var, dmy.shape[1], list(dmy.columns), num_levels
            )
        )
    # Indicator i corresponds to the i-th column produced by get_dummies
    # (pandas orders them by level value), not to the raw level code itself.
    dmy.columns = ["{}_{}".format(var, i) for i in range(num_levels)]
    if dmy.columns[0] not in data.columns:  # dummies not added yet
        data = pd.concat([data, dmy], axis=1)
    return data

In [32]:
def recodeVarMultiLevels(data, var, dic):
    """Recode several levels of ``data[var]`` at once, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing column ``var``; the column is modified in place.
    var : str
        Name of the column to recode.
    dic : dict
        Mapping ``{old_value: new_value}``. Values not listed are left as is.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient reassignment.
    """
    # Match against a snapshot of the column so every mapping is applied to the
    # ORIGINAL values. Without it, {1: 2, 2: 3} would turn 1 into 3 because the
    # second replacement would also hit rows rewritten by the first.
    original = data[var].copy()
    for from_value, to_value in dic.items():
        data.loc[original == from_value, var] = to_value
    return data

def removeLevel(data, var, level):
    """Return a copy of ``data`` without the rows where ``data[var] == level``.

    The original row index labels are kept (no re-indexing). An explicit copy
    is returned so that later in-place assignments on the result (for example
    the ``.loc`` writes done by ``recode``) operate on an independent frame
    and do not trigger pandas' SettingWithCopyWarning.
    """
    return data.loc[data[var] != level].copy()

In [33]:
def printValueCounts(data, var_list):
    """Print the frequency table of every column named in ``var_list``.

    Each table is the result of ``Series.value_counts()`` for that column,
    printed one after another. Nothing is returned.
    """
    for column_name in var_list:
        frequencies = data[column_name].value_counts()
        print(frequencies)

def scaleVar(data, var, factor):
    """Multiply column ``var`` of ``data`` by ``factor``, writing the result back in place.

    Nothing is returned; calling it again rescales the already-scaled column.
    """
    data[var] = data[var].mul(factor)

In [34]:
# Load the raw Swissmetro table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="../data/swissmetro.csv")

In [35]:
# Row count of the raw table, then the size of each sub-sample according to SURVEY.
# Row counts are divided by 9.0 -- presumably the number of rows recorded per
# respondent (TODO confirm against the data description).
print(data.shape[0], "observations")
print("car users:", data["SURVEY"].sum() / 9.0)
print("rail users:", data["SURVEY"].eq(0).sum() / 9.0)

10728 observations


In [39]:
## Socio-demo Variables
socio_vars = ['AGE', 'MALE', 'INCOME', 'FIRST']
printValueCounts(data, socio_vars)  # distributions before cleaning
data = (
    data
    .pipe(recode, 'MALE', None, None, 2)
    # AGE: drop the "not known" level (6), then encode the 5 remaining levels
    .pipe(removeLevel, 'AGE', 6)
    .pipe(recode, 'AGE', None, None, 5)
    # INCOME: merge level 0 into level 1, leaving 4 levels
    .pipe(recode, 'INCOME', 0, 1, 4)
)
printValueCounts(data, socio_vars)  # distributions after cleaning
# FIRST
data = data.pipe(recode, 'FIRST', None, None, 2)

3    3834
2    3339
4    2025
5     810
1     711
6       9
Name: AGE, dtype: int64
1    8046
0    2682
Name: MALE, dtype: int64
3    4041
2    3744
1    1719
4     918
0     306
Name: INCOME, dtype: int64
0    5679
1    5049
Name: FIRST, dtype: int64
3    3834
2    3339
4    2025
5     810
1     711
Name: AGE, dtype: int64
1    8037
0    2682
Name: MALE, dtype: int64
3    4041
2    3735
1    2025
4     918
Name: INCOME, dtype: int64
0    5670
1    5049
Name: FIRST, dtype: int64


In [40]:
# GA: one-hot encode the season-ticket indicator, then show its distribution
data = data.pipe(recode, 'GA', None, None, 2)
printValueCounts(data, ['GA'])

0    9198
1    1521
Name: GA, dtype: int64


In [41]:
## Trip context
printValueCounts(data, ['WHO', 'PURPOSE', 'LUGGAGE'])

# WHO: merge "unknown" (0) into "self" (1), leaving 3 levels
data = data.pipe(recode, 'WHO', 0, 1, 3)

# PURPOSE
# 1: Commuter, 2: Shopping, 3: Business, 4: Leisure,
# 5: Return from work, 6: Return from shopping,
# 7: Return from business,  8: Return from leisure, 9: other
# Each "return from X" trip (5-8) is folded into its outbound purpose (1-4).
dic_purp_recode = {
    5: 1,
    6: 2,
    7: 3,
    8: 4,
}
data = data.pipe(recodeVarMultiLevels, 'PURPOSE', dic_purp_recode)
printValueCounts(data, ['PURPOSE'])
# Drop the residual "other" level (9) before encoding the 4 remaining levels.
data = data.pipe(removeLevel, 'PURPOSE', 9)
printValueCounts(data, ['PURPOSE'])
data = data.pipe(recode, 'PURPOSE', None, None, 4)

# LUGGAGE: the raw codes are 0, 1 and 3, so LUGGAGE_2 is the dummy for code 3
data = data.pipe(recode, 'LUGGAGE', None, None, 3)
printValueCounts(data, ['LUGGAGE'])

1    5805
2    3564
3    1026
0     324
Name: WHO, dtype: int64
3    5184
4    2304
1    1575
2    1278
5     144
7     144
6      63
9      18
8       9
Name: PURPOSE, dtype: int64
1    6489
0    3969
3     261
Name: LUGGAGE, dtype: int64
3    5328
4    2313
1    1719
2    1341
9      18
Name: PURPOSE, dtype: int64
3    5328
4    2313
1    1719
2    1341
Name: PURPOSE, dtype: int64
1    6471
0    3969
3     261
Name: LUGGAGE, dtype: int64


### Alternative attributes

In [42]:
# Zero both public-transport costs for GA owners by multiplying with the
# boolean mask (GA == 0).
data['SM_CO'] = data['SM_CO'].mul(data['GA'].eq(0))        # if GA==1, SM_CO=0, o.w. SM_CO=SM_CO
data['TRAIN_CO'] = data['TRAIN_CO'].mul(data['GA'].eq(0))  # if GA==1, TRAIN_CO=0, o.w. TRAIN_CO=TRAIN_CO
# Continuous attributes to rescale by 0.01.
# NOTE: scaleVar works in place, so re-running this cell rescales them again.
x_names = [
    'TRAIN_TT', 'TRAIN_HE', 'TRAIN_CO',
    'SM_TT', 'SM_HE', 'SM_CO',
    'CAR_TT', 'CAR_CO',
]
for var in x_names:
    scaleVar(data, var, factor=0.01)
# Drop rows with unknown choice (0), then one-hot encode the 3 remaining alternatives.
data = data.pipe(removeLevel, 'CHOICE', 0).pipe(recode, 'CHOICE', None, None, 3)

In [43]:
# Distribution of the availability flags of the three alternatives
printValueCounts(data, var_list=['CAR_AV', 'TRAIN_AV', 'SM_AV'])

1    9027
0    1665
Name: CAR_AV, dtype: int64
1    10692
Name: TRAIN_AV, dtype: int64
1    10692
Name: SM_AV, dtype: int64


In [46]:
## Export data
from collections import OrderedDict

# (variable name, number of dummy columns created for it by recode)
levels = [('MALE',2), ('AGE',5), ('INCOME',4), ('FIRST',2), ('WHO',3), ("PURPOSE",4), ("LUGGAGE",3), ('GA',2)]
z_levels = OrderedDict(levels)
# One list of dummy-column names per variable, then flattened in the same order.
z_list = [
    ["{}_{}".format(var, i) for i in range(n_levels)]
    for var, n_levels in z_levels.items()
]
z_names = [name for group in z_list for name in group]
# get z
z = data[z_names].to_numpy()
print("z_levels", z_levels)
print("z_names", z_names)

z_levels OrderedDict([('MALE', 2), ('AGE', 5), ('INCOME', 4), ('FIRST', 2), ('WHO', 3), ('PURPOSE', 4), ('LUGGAGE', 3), ('GA', 2)])
z_names ['MALE_0', 'MALE_1', 'AGE_0', 'AGE_1', 'AGE_2', 'AGE_3', 'AGE_4', 'INCOME_0', 'INCOME_1', 'INCOME_2', 'INCOME_3', 'FIRST_0', 'FIRST_1', 'WHO_0', 'WHO_1', 'WHO_2', 'PURPOSE_0', 'PURPOSE_1', 'PURPOSE_2', 'PURPOSE_3', 'LUGGAGE_0', 'LUGGAGE_1', 'LUGGAGE_2', 'GA_0', 'GA_1']


In [47]:
# Alternative attributes exported as x; compared with the scaled list used
# earlier, this one additionally contains SM_SEATS.
x_names = [
    'TRAIN_TT', 'TRAIN_HE', 'TRAIN_CO',
    'SM_TT', 'SM_HE', 'SM_SEATS', 'SM_CO',
    'CAR_TT', 'CAR_CO',
]
x = data[x_names].to_numpy()
# Raw choice code of each row
y = data['CHOICE'].to_numpy()

In [48]:
# Indicator columns CHOICE_0 .. CHOICE_2 of the chosen alternative, and the ID of each row
choice_dummy_cols = ["CHOICE_{}".format(i) for i in range(3)]
y01 = data[choice_dummy_cols].to_numpy()
ID = data['ID'].to_numpy()

In [49]:
# Per-row car availability flag
car_av = data['CAR_AV'].to_numpy()

In [50]:
# Bundle every exported array together with its naming metadata.
data_dict = {
    "x_names": x_names,
    "x": x,
    "z_names": z_names,
    "z": z,
    "z_levels": z_levels,
    "y01": y01,
    "y": y,
    "ID": ID,
    'car_av': car_av,
}

In [51]:
# Sanity check: display the attribute names stored in the export dictionary.
data_dict["x_names"]

['TRAIN_TT',
 'TRAIN_HE',
 'TRAIN_CO',
 'SM_TT',
 'SM_HE',
 'SM_SEATS',
 'SM_CO',
 'CAR_TT',
 'CAR_CO']

In [53]:
import pickle

# Persist the full (unsplit) dataset dictionary. The context manager ensures
# the file handle is flushed and closed even if pickling fails part-way;
# the bare open(...) used before left the handle open.
with open("../data/swissmetro_all.pkl", "wb") as pkl_file:
    pickle.dump(data_dict, pkl_file)

In [54]:
def split(N):
    '''
    Randomly partition the row indices 0..N-1 into train / dev / test.

    70% of the indices go to train; the remaining 30% are halved into dev
    and test. Both shuffles use fixed seeds (8 and 9), so the partition is
    the same on every run. The three sizes are printed.

    Returns (train_ind, dev_ind, test_ind) as produced by
    model_selection.train_test_split.
    '''
    all_rows = range(N)
    train_ind, holdout_ind = model_selection.train_test_split(
        all_rows, train_size=0.7, test_size=0.3, random_state=8
    )
    dev_ind, test_ind = model_selection.train_test_split(
        holdout_ind, train_size=0.5, test_size=0.5, random_state=9
    )
    for label, part in (
        ("train size:", train_ind),
        ("dev size:", dev_ind),
        ("test size:", test_ind),
    ):
        print(label, len(part))
    return train_ind, dev_ind, test_ind

def selectData(data, ind):
    '''
    Build a new dict holding only the rows listed in ind.

    data: dict as assembled for export; the entries under "x", "y", "z",
          "y01", "car_av" and "ID" are per-row arrays.
    ind:  list of row positions to keep.

    Per-row arrays are indexed with ind; every other entry (names, level
    counts) is copied over by reference. "ID" is subset as well so that it
    stays aligned with the selected rows instead of keeping its full length.
    '''
    row_keys = ("x", "y", "z", "y01", "car_av", "ID")
    data_sel = {}
    for key, value in data.items():
        if key in row_keys:
            data_sel[key] = value[ind]
        else: # copy parameters over
            data_sel[key] = value
    return data_sel

In [55]:
# Draw the index partition once, then slice the export dict for each subset.
train_ind, dev_ind, test_ind = split(len(x))
data_train, data_dev, data_test = (
    selectData(data_dict, subset_ind)
    for subset_ind in (train_ind, dev_ind, test_ind)
)

train size: 7484
dev size: 1604
test size: 1604


In [56]:
import pickle

# Write each subset to its own pickle file. Opening inside a `with` block
# closes every handle deterministically; the bare open(...) calls used
# before left three handles open.
for pkl_path, subset_dict in (
    ("../data/train.pkl", data_train),
    ("../data/dev.pkl", data_dev),
    ("../data/test.pkl", data_test),
):
    with open(pkl_path, "wb") as pkl_file:
        pickle.dump(subset_dict, pkl_file)

In [57]:
# Sanity check: the attribute names travel with the training subset.
data_train['x_names']

['TRAIN_TT',
 'TRAIN_HE',
 'TRAIN_CO',
 'SM_TT',
 'SM_HE',
 'SM_SEATS',
 'SM_CO',
 'CAR_TT',
 'CAR_CO']

In [58]:
# Sanity check: the dummy-column names travel with the training subset.
data_train['z_names']

['MALE_0',
 'MALE_1',
 'AGE_0',
 'AGE_1',
 'AGE_2',
 'AGE_3',
 'AGE_4',
 'INCOME_0',
 'INCOME_1',
 'INCOME_2',
 'INCOME_3',
 'FIRST_0',
 'FIRST_1',
 'WHO_0',
 'WHO_1',
 'WHO_2',
 'PURPOSE_0',
 'PURPOSE_1',
 'PURPOSE_2',
 'PURPOSE_3',
 'LUGGAGE_0',
 'LUGGAGE_1',
 'LUGGAGE_2',
 'GA_0',
 'GA_1']