In [80]:
# Source code from: https://github.com/timothyb0912/pylogit

In [1]:
from collections import OrderedDict
import numpy as np
import pandas as pd
import pylogit as pl

### Load and clean Swiss metro data

In [2]:
# Read the tab-separated Swissmetro file.
sm_wide = pd.read_csv("./data/swissmetro.dat", sep="\t")
# Keep rows whose PURPOSE is 1 or 3 and whose CHOICE is non-zero.
condition = sm_wide["PURPOSE"].isin([1, 3]) & sm_wide["CHOICE"].ne(0)
# Copy so later column assignments do not act on a view of the raw frame.
sm_wide = sm_wide[condition].copy()
sm_wide.shape

(6768, 28)

In [3]:
# Show the first ten rows transposed, so each column reads as a row.
sm_wide.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GROUP,2,2,2,2,2,2,2,2,2,2
SURVEY,0,0,0,0,0,0,0,0,0,0
SP,1,1,1,1,1,1,1,1,1,1
ID,1,1,1,1,1,1,1,1,1,2
PURPOSE,1,1,1,1,1,1,1,1,1,1
FIRST,0,0,0,0,0,0,0,0,0,0
TICKET,1,1,1,1,1,1,1,1,1,1
WHO,1,1,1,1,1,1,1,1,1,1
LUGGAGE,0,0,0,0,0,0,0,0,0,1
AGE,3,3,3,3,3,3,3,3,3,2


### Convert wide to long format

In [4]:
# Name of the column that records the chosen alternative
choice = "CHOICE"

# Alternative variables: variable name -> {alternative id: wide-format column}
alt_vars = {
    u'travel_time': {1: 'TRAIN_TT', 2: 'SM_TT', 3: 'CAR_TT'},
    u'travel_cost': {1: 'TRAIN_CO', 2: 'SM_CO', 3: 'CAR_CO'},
    u'headway': {1: 'TRAIN_HE', 2: 'SM_HE'},
    u'seat_configuration': {2: "SM_SEATS"},
}

# Availability variables: alternative id -> wide-format column
avai_vars = {1: 'TRAIN_AV', 2: 'SM_AV', 3: 'CAR_AV'}

In [5]:
def wide2long(df_wide, choice_col, alternative_vars, available_vars):
    """
    Convert a wide format data frame to a long format data frame.

    The input frame is left untouched: the observation id column
    ("custom_id", numbered 1..n in row order) is added to a copy, so calling
    this function repeatedly on the same frame gives the same result.

    :param df_wide: Pandas Data frame in wide format
    :param choice_col: str, name of choice column
    :param alternative_vars: a dictionary of alternative variables
    :param available_vars: a dictionary of available variables
    :return df_long: Pandas Data frame in long format
    """
    id_col = "custom_id"
    # Every column that is not the choice, an alternative variable or an
    # availability variable is treated as an individual variable. The id
    # column is excluded too, in case the frame already carries one.
    excluded_cols = set(not_ind_vars(choice_col, alternative_vars, available_vars))
    excluded_cols.add(id_col)
    # Read the columns from the argument (not a notebook global) and keep
    # their original order so the output layout is deterministic.
    ind_vars = [col for col in df_wide.columns if col not in excluded_cols]
    # Generate the custom_id column on a copy rather than on the caller's frame
    df_with_id = df_wide.assign(**{id_col: np.arange(1, df_wide.shape[0] + 1, dtype=int)})
    # Convert wide to long using pylogit function
    df_long = pl.convert_wide_to_long(df_with_id, ind_vars, alternative_vars, available_vars,
                                      id_col, choice_col, "mode_id")
    return df_long

def not_ind_vars(choice_col, alternative_vars, available_vars):
    """
    List the wide-format columns that are not individual variables.

    :param choice_col: str, name of choice column
    :param alternative_vars: a dictionary whose values are dictionaries of
        wide-format column names (only the inner values are read)
    :param available_vars: a dictionary whose values are availability
        column names
    :return: list with the choice column first, then the alternative
        variable columns, then the availability columns
    """
    excluded_cols = [choice_col]
    for cols_by_alt in alternative_vars.values():
        excluded_cols.extend(cols_by_alt.values())
    # Use the argument rather than the notebook-level `avai_vars`, so the
    # helper works for whatever availability mapping the caller passes.
    excluded_cols.extend(available_vars.values())
    return excluded_cols

In [6]:
# Reshape the wide frame into long format
sm_long = wide2long(
    df_wide=sm_wide,
    choice_col=choice,
    alternative_vars=alt_vars,
    available_vars=avai_vars,
)

In [7]:
# Dimensions (rows, columns) of the long-format frame
sm_long.shape

(19143, 22)

In [8]:
# Show the first ten long-format rows transposed, so each column reads as a row.
sm_long.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
custom_id,1.0,1.0,1.0,2.0,2.0,2.0,3.0,3.0,3.0,4.0
mode_id,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
CHOICE,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
GROUP,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
GA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WHO,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
TICKET,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PURPOSE,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FIRST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INCOME,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


### Save long format

In [9]:
# Write the long-format frame to CSV without the row index.
sm_long.to_csv(path_or_buf="./data/swissmetro_long.csv", index=False)