# Swissmetro data

In [1]:
import pandas as pd
import biogeme.database as db
from biogeme.expressions import Variable
import biogeme.biogeme_logging as blog
import biogeme.biogeme as bio
from biogeme import models
from biogeme.expressions import Beta, Variable, log, exp

import numpy as np

In [2]:
# Read the raw tab-separated Swissmetro file and wrap the resulting frame
# in a Biogeme database named 'swissmetro'.
df = pd.read_csv('swissmetro_original.dat', delimiter='\t')
database = db.Database('swissmetro', df)


In [3]:
# --- Handles on the raw database columns (one Variable per column name) ---
GROUP, SURVEY, SP, ID = map(Variable, ('GROUP', 'SURVEY', 'SP', 'ID'))

# Per-alternative columns (*_AV, *_TT, *_CO, *_HE, SM_SEATS) and the recorded CHOICE.
TRAIN_AV, CAR_AV, SM_AV = map(Variable, ('TRAIN_AV', 'CAR_AV', 'SM_AV'))
TRAIN_TT, TRAIN_CO, TRAIN_HE = map(Variable, ('TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE'))
SM_TT, SM_CO, SM_HE, SM_SEATS = map(Variable, ('SM_TT', 'SM_CO', 'SM_HE', 'SM_SEATS'))
CAR_TT, CAR_CO = map(Variable, ('CAR_TT', 'CAR_CO'))
CHOICE = Variable('CHOICE')

# Remaining columns describing the trip / respondent.
(
    PURPOSE, FIRST, TICKET, WHO, LUGGAGE, AGE,
    MALE, INCOME, GA, ORIGIN, DEST,
) = map(Variable, (
    'PURPOSE', 'FIRST', 'TICKET', 'WHO', 'LUGGAGE', 'AGE',
    'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST',
))

# --- Row filter ---
# Remove every observation whose PURPOSE is neither 1 nor 3, or whose CHOICE is 0.
# The product acts as a logical AND and the sum as a logical OR on 0/1 indicators.
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

# --- Derived columns (each call adds a column to the database) ---
# Train and car availability, forced to 0 whenever SP == 0.
TRAIN_AV_SP = database.define_variable(
    'TRAIN_AV_SP', TRAIN_AV * (SP != 0)
)
CAR_AV_SP = database.define_variable(
    'CAR_AV_SP', CAR_AV * (SP != 0)
)

# Train: the cost is set to 0 when GA != 0; time, cost and HE are divided by 100.
TRAIN_COST = database.define_variable(
    'TRAIN_COST', TRAIN_CO * (GA == 0)
)
TRAIN_TT_SCALED = database.define_variable(
    'TRAIN_TT_SCALED', TRAIN_TT / 100
)
TRAIN_COST_SCALED = database.define_variable(
    'TRAIN_COST_SCALED', TRAIN_COST / 100
)
TRAIN_HE_SCALED = database.define_variable(
    'TRAIN_HE_SCALED', TRAIN_HE / 100
)

# Swissmetro: same treatment as train; SM_SEATS is divided by 1 (values unchanged).
SM_COST = database.define_variable(
    'SM_COST', SM_CO * (GA == 0)
)
SM_TT_SCALED = database.define_variable(
    'SM_TT_SCALED', SM_TT / 100
)
SM_COST_SCALED = database.define_variable(
    'SM_COST_SCALED', SM_COST / 100
)
SM_HE_SCALED = database.define_variable(
    'SM_HE_SCALED', SM_HE / 100
)
SM_SEATS_SCALED = database.define_variable(
    'SM_SEATS_SCALED', SM_SEATS / 1
)

# Car: time and cost divided by 100 (no GA adjustment on the cost).
CAR_TT_SCALED = database.define_variable(
    'CAR_TT_SCALED', CAR_TT / 100
)
CAR_CO_SCALED = database.define_variable(
    'CAR_CO_SCALED', CAR_CO / 100
)

# --- Recoded socio-demographics ---
# NOTE: from here on the Python names INCOME and AGE point to the recoded
# columns INCOME2 / AGE2, not to the raw columns.
# INCOME == 0 is replaced by 1; every other value is kept.
INCOME = database.define_variable(
    'INCOME2', (INCOME == 0) * 1 + (INCOME != 0) * INCOME
)
# AGE == 6 is replaced by 5; every other value is kept.
AGE = database.define_variable(
    'AGE2', (AGE == 6) * 5 + (AGE != 6) * AGE
)

In [None]:
# Rename columns
# Build the export table from the (already filtered) Biogeme database.
# Attribute columns follow the pattern x_<alternative>_<attribute>:
#   alternative 1 = train, 2 = Swissmetro, 3 = car
#   attribute   1 = TT, 2 = cost, 3 = HE, 4 = seats
# An attribute that has no source column for an alternative is filled with 0.
df2 = pd.DataFrame(
    {
        'ID': database.data['ID'],
        'avail_1': database.data['TRAIN_AV_SP'],
        'avail_2': database.data['SM_AV'],
        'avail_3': database.data['CAR_AV_SP'],
        'x_1_1': database.data['TRAIN_TT_SCALED'],
        'x_1_2': database.data['TRAIN_COST_SCALED'],
        'x_1_3': database.data['TRAIN_HE_SCALED'],
        'x_1_4': 0,

        'x_2_1': database.data['SM_TT_SCALED'],
        'x_2_2': database.data['SM_COST_SCALED'],
        'x_2_3': database.data['SM_HE_SCALED'],
        'x_2_4': database.data['SM_SEATS_SCALED'],

        'x_3_1': database.data['CAR_TT_SCALED'],
        'x_3_2': database.data['CAR_CO_SCALED'],
        'x_3_3': 0,
        'x_3_4': 0,
        'choice': database.data['CHOICE'],
        'class': database.data['FIRST'],
        'ga': database.data['GA'],
        'luggage': database.data['LUGGAGE'],
        'gender': database.data['MALE'],
        'who': database.data['WHO'],
        'income': database.data['INCOME2'],
        'age': database.data['AGE2']
    }
)

# Ensure that the values are numeric
# Identifier, availability, choice and categorical columns are stored as
# integers; every other column is coerced to a numeric dtype (unparseable
# entries become NaN).
# The names below must match the keys of df2 exactly: the previous list used
# 'CHOICE', 'Class', 'GA', ... which never matched the lower-case keys, so
# those columns silently skipped the integer cast.
integer_columns = (
    'ID', 'avail_1', 'avail_2', 'avail_3',
    'choice', 'class', 'ga', 'luggage', 'gender', 'who', 'income', 'age',
)
for column in df2.columns:
    if column in integer_columns:
        df2[column] = df2[column].astype(int)
    else:
        df2[column] = pd.to_numeric(df2[column], errors='coerce')

# Write the table without the pandas index.
df2.to_csv('swissmetro.csv', index=False)