In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

### a) Predict the journey duration. You may use any Start information in the data set. Also consider creating new features that may help prediction quality (Hint: In the coding week, we discussed creating polynomial features. How would you engineer features for periodic quantities with known period lengths, e.g, a day, a year, …?).

In [24]:
data = pd.read_csv("../data/processed/dortmund_trips.csv", index_col = 0)

In [25]:
data.head()

Unnamed: 0,datetime_start,b_number_start,latitude_start,p_name_start,longitude_start,datetime_end,latitude_end,p_name_end,longitude_end,trip_duration,distance,weekday,weekend,day,month,hour
0,2019-01-20 16:22:00,50641,51.506312,Hainallee / Südbad,7.470531,2019-01-20 17:00:00,51.493966,TU Dortmund Emil-Figge-Straße 50,7.418008,38,3.89729,6,True,20,1,16
1,2019-01-20 02:31:00,50425,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,2019-01-20 02:43:00,51.513069,Unionstr.,7.448886,12,0.891383,6,True,20,1,2
2,2019-01-20 11:32:00,53006,51.509557,Ritterhausstr.,7.446949,2019-01-20 13:33:00,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,121,1.235649,6,True,20,1,11
3,2019-01-20 14:38:00,53006,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,2019-01-20 14:53:00,51.500725,Polizeipräsidium,7.459819,15,1.827997,6,True,20,1,14
4,2019-01-20 17:02:00,53006,51.500725,Polizeipräsidium,7.459819,2019-01-20 17:16:00,51.514029,Schwanenwall,7.47257,14,1.724677,6,True,20,1,17


In [26]:
# this is our y
duration = data["trip_duration"]

# this is our x
data = data.drop(columns=["trip_duration", "datetime_start", "p_name_start", "b_number_start", "datetime_end", "p_name_end"], axis = 1)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data, duration, test_size = 0.3)

In [30]:
st_scaler = StandardScaler()
X_train_scaled = st_scaler.fit_transform(X_train)

In [31]:
lin = LinearRegression()
lin.fit(X_train_scaled, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
X_test_scaled = st_scaler.transform(X_test)
y_predict = lin.predict(X_test_scaled)

In [34]:
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_predict)))
print("MAE: ", metrics.mean_absolute_error(y_test, y_predict))

RMSE:  80.41408025896487
MAE:  38.72054621697306


In [35]:
from sklearn.decomposition import PCA

In [38]:
pca = PCA(n_components=10)

In [42]:
X_train_transformed =pca.fit_transform(X_train_scaled)

In [43]:
lin = LinearRegression()

In [44]:
lin.fit(X_train_transformed, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [45]:
X_test_scaled = st_scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_scaled)
y_predict = lin.predict(X_test_transformed)

In [46]:
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_predict)))
print("MAE: ", metrics.mean_absolute_error(y_test, y_predict))

RMSE:  80.41408025896487
MAE:  38.72054621697306


### b) All cities in your datasets have universities. Based on Start information, predict whether a trip will be towards, or away from the university. Analyze your predictive performance on different subsets of your data (months, …). Do you see differences?

There are multiple university-stations (can be seen in the first map of task 2)
- TU Dortmund Seminarraumgebäude 1
- TU Dortmund Hörsaalgebäude 2
- Universität/S-Bahnhof
- TU Dortmund Emil-Figge-Straße 50
- FH-Dortmund Emil-Figge-Straße 42

There are 2 ways to determine whether a trip is towards/away from the university

1)
check if start/end-station is one of the university-stations
create 2 columns:
- TowardsUniveristy: Yes/No
- AwayFromUniveristy: Yes/No

We need 2 columns (and not solely one) because there is also the case that a trip was done independently from the university


2)
check if the route of a trip is done in direction to the university (or away from the university)
Therefore:

- determine the route/vector of a trips
- chech if this vector is directed in direction to/away from the university

In [52]:
trips = pd.read_csv("../data/processed/dortmund_trips.csv", index_col = 0)

In [47]:
university_stations = ["TU Dortmund Seminarraumgebäude 1", "TU Dortmund Hörsaalgebäude 2", "Universität/S-Bahnhof", "TU Dortmund Emil-Figge-Straße 50", "FH-Dortmund Emil-Figge-Straße 42"]

In [48]:
len(university_stations)

5

In [51]:
"TU Dortmund Seminarraumgebäude 1" in university_stations

True

In [54]:
trips['TowardsUniversity'] = trips['p_name_end'].apply(lambda x: 1 if x in university_stations else 0)

In [55]:
trips['AwayFromUniveristy'] = trips['p_name_start'].apply(lambda x: 1 if x in university_stations else 0)

In [58]:
trips

Unnamed: 0,datetime_start,b_number_start,latitude_start,p_name_start,longitude_start,datetime_end,latitude_end,p_name_end,longitude_end,trip_duration,distance,weekday,weekend,day,month,hour,TowardsUniversity,AwayFromUniveristy
0,2019-01-20 16:22:00,50641,51.506312,Hainallee / Südbad,7.470531,2019-01-20 17:00:00,51.493966,TU Dortmund Emil-Figge-Straße 50,7.418008,38,3.897290,6,True,20,1,16,1,0
1,2019-01-20 02:31:00,50425,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,2019-01-20 02:43:00,51.513069,Unionstr.,7.448886,12,0.891383,6,True,20,1,2,0,0
2,2019-01-20 11:32:00,53006,51.509557,Ritterhausstr.,7.446949,2019-01-20 13:33:00,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,121,1.235649,6,True,20,1,11,0,0
3,2019-01-20 14:38:00,53006,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,2019-01-20 14:53:00,51.500725,Polizeipräsidium,7.459819,15,1.827997,6,True,20,1,14,0,0
4,2019-01-20 17:02:00,53006,51.500725,Polizeipräsidium,7.459819,2019-01-20 17:16:00,51.514029,Schwanenwall,7.472570,14,1.724677,6,True,20,1,17,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207471,2019-12-31 12:39:00,500019,51.500675,Kuithanstr.,7.440834,2019-12-31 12:54:00,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,15,2.262708,1,False,31,12,12,0,0
207472,2019-12-31 19:28:00,500019,51.517155,Hauptbahnhof/Bahnhofsvorplatz,7.459931,2019-12-31 19:35:00,51.513069,Unionstr.,7.448886,7,0.891383,1,False,31,12,19,0,0
207473,2019-12-31 12:36:00,51287,51.482359,Barop Parkhaus,7.432326,2019-12-31 15:14:00,51.490505,An der Palmweide,7.438352,158,0.998294,1,False,31,12,12,0,0
207474,2019-12-31 22:37:00,500113,51.510976,Stadtgarten,7.464534,2019-12-31 23:05:00,51.486747,Am Beilstück,7.435750,28,3.355884,1,False,31,12,22,0,0
