# Read data

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the table holding the number of train rides and the minutes of delay.
# header = 0 discards the file's own header row, so the english column names
# given in `names` are used in its place.
data_rides = pd.read_csv(
    "../dat/Zugfahrten_2016_12.csv",
    sep = ";",
    encoding = "latin-1",
    header = 0,
    names = ["Station or stop", "Country", "Date", "Number of train rides", "Minutes of delay"],
)

In [4]:
# Load the table holding the coordinates of the stations.
# header = 0 discards the file's own header row, so the english column names
# given in `names` are used in its place.
data_stations = pd.read_csv(
    "../dat/GEO_Bahnstellen_EXPORT.csv",
    sep = ";",
    encoding = "latin-1",
    header = 0,
    names = ["Station or stop", "Name", "Country", "Coordinate Latitude", "Coordinate Longitude"],
)

In [6]:
# Join rides and stations on the station key (inner join, the pandas default).
# Both inputs carry a "Country" column, so the merge suffixes them with _x / _y;
# only the stations' version (_y) is kept, under its plain name.
data = (
    data_rides
    .merge(data_stations, on = "Station or stop")
    .drop(columns = ["Country_x"])
    .rename(columns = {"Country_y": "Country"})
)

# number of rows in the merged frame
len(data)

766028

# Take a look...

In [None]:
# preview the first rows of the merged frame
data.head()

In [None]:
# histogram of the number of train rides
# (Series.hist returns the matplotlib Axes, which is used to label the figure
# so that it can be read on its own)
ax = data["Number of train rides"].hist(bins = 30, rwidth = 0.8)
# the trailing semicolon suppresses the repr of ax.set's return value in the cell output
ax.set(xlabel = "Number of train rides", ylabel = "Frequency", title = "Distribution of the number of train rides");

In [None]:
# histogram of the minutes of delay
# (Series.hist returns the matplotlib Axes, which is used to label the figure
# so that it can be read on its own)
ax = data["Minutes of delay"].hist(bins = 30, rwidth = 0.8)
# the trailing semicolon suppresses the repr of ax.set's return value in the cell output
ax.set(xlabel = "Minutes of delay", ylabel = "Frequency", title = "Distribution of the minutes of delay");

In [None]:
# bar chart of the mean minutes of delay per country
# (this is a per-country average, not a histogram)
ax = data.groupby("Country")["Minutes of delay"].mean().plot(kind = "bar")
# the trailing semicolon suppresses the repr of ax.set's return value in the cell output
ax.set(ylabel = "Mean minutes of delay", title = "Mean minutes of delay per country");

# Data cleaning

In [None]:
# 1) Restrict the data to rows whose country is Germany
is_german = data["Country"] == "DEUTSCHLAND"
data = data.loc[is_german]

# sanity check: remaining row count and the distinct country values left
print(len(data))
print(data["Country"].unique())

In [None]:
# 2) Missing values

# Count the missing values per column before anything is dropped.
# This is printed explicitly: a notebook only displays the last expression of
# a cell, so a bare expression here would be evaluated and silently discarded.
print(data.isnull().sum())

# drop every row that has a missing value in any column
data = data.dropna(axis = 0, how = "any")

# check if it worked: remaining row count, then the per-column missing counts
# (the final expression is displayed as the cell's output)
print(len(data))
data.isnull().sum()

In [None]:
# 3) Calculate mean for every station

# Average the remaining numeric columns per station; the descriptive columns
# are removed first. The grouping key becomes the index of the result.
station_means = (
    data
    .drop(columns = ["Date", "Country", "Coordinate Latitude", "Coordinate Longitude", "Name"])
    .groupby(["Station or stop"])
    .mean()
)

# Station descriptions: what is left once the date, country and measurement
# columns are removed, reduced to distinct rows.
data_names = (
    data
    .drop(columns = ["Date", "Country", "Number of train rides", "Minutes of delay"])
    .drop_duplicates()
)

# attach the per-station averages to the station descriptions
data_mean = pd.merge(data_names, station_means, on = "Station or stop", how = "left")

In [None]:
# check if it worked: display the per-station frame
# (station description columns plus the averaged rides and delay)
data_mean

In [None]:
# check if it worked: the row count should match the number of distinct
# values in each identifying column if every station appears exactly once
print(len(data_mean))

for column in ("Name", "Station or stop"):
    print(len(data_mean[column].unique()))

In [None]:
# 4) keep only the stations that average at least 10 train rides
has_enough_rides = data_mean["Number of train rides"] >= 10
data_mean = data_mean.loc[has_enough_rides]