# Data enrichment

### <u>Content:<u>

[1) Load data sets

[2) Add week day and month to reservation data
    
[3) Estimate part of the Swiss population that is on holiday for any given date
    
[4) Add coordinates for start and destination 

[5) Get weather data
    
[6) Get capacity data for trains in 2020 according to the expected train formation


In [32]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from scipy.spatial.distance import cdist
import datetime
import meteomatics.api as api
import datetime as dt
%matplotlib inline
rcParams['figure.figsize']=10,8

#### 1) Load data

In [33]:
#read the reservation data 
df = pd.read_csv('data_raw/reservation_data_2019-2021_incl_capacity.csv', 
                 parse_dates=["date"], date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d %H:%M:%S"))         

# read holiday data
df_schulferien = pd.read_csv('data_preprocessed/Schulferien.csv', dtype={"canton": "string", "population": "int32"})
df_schulferien['start'] = pd.to_datetime(df_schulferien['holidays_start'])
df_schulferien['end'] = pd.to_datetime(df_schulferien['holidays_end'])
df_schulferien = df_schulferien.drop(columns=["holidays_start", "holidays_end"])

# train station coordinates data
df_coordinates = pd.read_csv('data_preprocessed/dienststellen.csv')
df_coordinates = df_coordinates[["abk_bahnhof", "lat", "lon"]]
df_coordinates = df_coordinates[df_coordinates['abk_bahnhof'].notna()]

# jahresformation
df_jahresformation = pd.read_csv("data_preprocessed/jahresformation.csv", dtype={"Block Bezeichnung": "string"})

# kapazität
df_kapazität = pd.read_csv("data_preprocessed/rollmaterial-matching.csv")


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [34]:
df_weather = pd.read_csv("data_preprocessed/weather.csv", parse_dates=["validdate"])
df_weather = df_weather.rename(columns={"validdate": "date"}).set_index("date")

# read model data 
parse_dates = ["start", "end", "travel_date", "latest_res_dt"]
df_model = pd.read_csv("model_data.csv", parse_dates=parse_dates)        
        
display(df_model.tail(2))

Unnamed: 0,start,start_loc,count,end,end_loc,capacity,line,travel_date,latest_res_dt
115772,2021-10-24 15:47:30,LQ,11.0,2021-10-24 15:56:24,SA,35.0,IC 3,2021-10-24,2021-10-24 15:47:30
115773,2021-10-24 15:56:24,SA,18.0,2021-10-24 16:53:00,ZUE,35.0,IC 3,2021-10-24,2021-10-24 15:56:24


In [35]:
df_weather = df_weather.resample('d')[['leisure_biking:idx', "t_2m:C", "precip_24h:mm"]].mean()
df_weather = df_weather.reset_index()
df_weather["date"] = df_weather["date"].apply(lambda x: x.replace(tzinfo=None))
df_model = pd.merge(df_model, df_weather, left_on='travel_date', right_on='date')

In [37]:
df_model.to_csv('model/model_data_weather.csv', index=False)

#### 2) Week day <a name="stat"></a>

- Add a feature for weekday: 'weekday' and 'month'

In [None]:
df['weekday'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month

#### 3) Holidays <a name="stat"></a>
- Add a feature for number of people in holiday canton: n_holiday

In [None]:
# for each date, get the number of people in Switzerland who 
# are either on school holiday or national holiday..
def get_holiday_people(date):
    filtered_holidays = df_schulferien[(df_schulferien['start']<=date)&(df_schulferien['end']>=date)]
    is_national_holiday = (filtered_holidays["canton"]=="national").sum()
    if is_national_holiday:
        people = 7917100
    elif not(filtered_holidays.empty):
        filtered_holidays = filtered_holidays[filtered_holidays["canton"]!="national"]
        people = sum(filtered_holidays["population"])
    else: people=0
    return people

In [None]:
# filter df, only 2021 data
# df=df[df['date']>='2021-01-01']
df['holiday_people'] = df.apply(lambda row : get_holiday_people(row['date']), axis = 1)

# df_model['holiday_people'] = df.apply(lambda row : get_holiday_people(row['date']), axis = 1)

#### 4) Coordinates <a name="hr"></a>

In [16]:
# full join for start train station 
df = pd.merge(df, df_coordinates, left_on='bp_from', right_on='abk_bahnhof')
df = df.drop(columns=['abk_bahnhof']).rename(columns={"lat": "lat_from", "lon": "lon_from"})

# full join for destination
df = pd.merge(df, df_coordinates, left_on='bp_to', right_on='abk_bahnhof')
df = df.drop(columns=['abk_bahnhof']).rename(columns={"lat": "lat_to", "lon": "lon_to"})
display(df.head(2))

Unnamed: 0.1,Unnamed: 0,res_id,res_dt,date,train_nr,line,reserved,capacity,bp_from,bp_to,...,dep_soll,arr_ist,arr_soll,res_delta_ist,res_delta_soll,res_delta_valid,lat_from,lon_from,lat_to,lon_to
0,0,0,2019-03-29 00:00:00,2019-04-01,510,IC 5,1,,ZUE,NE,...,2019-04-01 07:03:00,2019-04-01 08:33:00,2019-04-01 08:32:00,284667.0,284634,False,47.378177,8.540212,46.996727,6.935702
1,283,290,2019-04-06 00:00:00,2019-04-06,1528,IC 5,1,,ZUE,NE,...,2019-04-06 16:30:00,2019-04-06 18:02:00,2019-04-06 18:01:00,59422.0,59418,False,47.378177,8.540212,46.996727,6.935702


#### 5) Weather <a name="corr"></a>

In [108]:
# summarize daily weather data
df_weather['date'] = pd.to_datetime(df_weather.date)

df_weather = df_weather.resample('d', on='date')[['leisure_biking:idx', "t_2m:C", "precip_24h:mm"]].agg(['mean'])
df_weather=df_weather.reset_index()
df=pd.merge(df, df_weather, left_on='date', right_on='date')

display(df.tail(3))
display(df_weather.tail(3))


AttributeError: 'DataFrame' object has no attribute 'date'

In [102]:
##import meteo data 
def get_meteodata(lat, lon, year, month, day):
    username = 'can-guru_otth'
    password = 'eyk47W6ATq'
    coordinates = [(lat, lon)]
    model =     'mix'
    startdate = dt.datetime(year=year, month=month, day=day, hour=0, minute=0, second=0)
    enddate =   startdate 
    interval = dt.timedelta(hours=0)
    format = 'csv' 

    today=dt.datetime.now()
    delta_days =(startdate-today).days
    
    # for longer forecast, use standard values for leisure biking idx + precipiation
    if delta_days>16:
        parameters = ['t_2m:C']#,'precip_24h:mm', 'leisure_biking:idx']
        df = api.query_time_series(coordinates, startdate, enddate, interval, parameters, username, password, model=model)
        # insert mean values
        df['leisure_biking:idx']= 0.39
        df['precip_24h:mm']=3.9

        
    else: 
        parameters = ['t_2m:C','precip_24h:mm', 'leisure_biking:idx']
        df = api.query_time_series(coordinates, startdate, enddate, interval, parameters, username, password, model=model)

    return df
    
# example function call
get_meteodata(47.378177, 8.540212, 2022, 3, 29)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t_2m:C,precip_24h:mm,leisure_biking:idx
lat,lon,validdate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
47.378177,8.540212,2022-03-29 00:00:00+00:00,5.5,0.0,0.5


#### 6) Capacity <a name="corr"></a>

 - Für jede Reservation: Zugnummer im Jahresformation-Datensatz abrufen
 - Beachte: richtiges Jahr wählen, häufigste Formation
 - entsprechende Kapazität auslesen

In [None]:
#combine data from capacity table and annual formation
list_kapazitäten=df_kapazität["Block Bezeichnung in Jahresformation Fpl-2022"].tolist()
df_jahresformation = df_jahresformation[df_jahresformation["Block Bezeichnung"].isin(list_kapazitäten)]
df_jahresformation=df_jahresformation[["Block Bezeichnung", "Zug", "Beginn Fahrplanperiode"]]
df_jahresformation = pd.merge(df_jahresformation, df_kapazität, left_on='Block Bezeichnung', right_on='Block Bezeichnung in Jahresformation Fpl-2022').drop(columns=["Block Bezeichnung in Jahresformation Fpl-2022"])
df_jahresformation.tail(2)

In [12]:
median_df=df[df["capacity"].notnull()]
median=np.median(median_df['capacity'])
print('Median Kapazität', median)

Median Kapazität 12.0


In [19]:
def fill_capacity(row):
    train=row.train_nr   
    formation = df_jahresformation[df_jahresformation["Zug"]==train]
    capacity=formation["No. of hooks"]
    return capacity

In [20]:
# for the rows where there is no capacity included
# first try to check with the jahresformation lookup table
mask = df.capacity.isnull()
df['capacity'] = df[mask].apply(fill_capacity, axis=1)

# secondly, fill the median capacity 
df=df.fillna(value={"capacity": median})

AttributeError: module 'pandas' has no attribute 'average'

In [None]:
df.head(10)