# Semester Project - Nextbike
## Task 1 - Exploration and Description

In [None]:
# import relevant libraries for data exploration
from vincenty import vincenty
import numpy as np 
import pandas as pd 
import datetime
from datetime import timedelta


import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw Dortmund data; the first CSV column is used as the dataframe index.
raw_csv_path = "../data/internal/dortmund.csv"
df = pd.read_csv(raw_csv_path, index_col=0)
df.head()

### What do the columns represent?

The prefix "p" stands for the <i>position</i> and the prefix "b" describes the features of the used <i>bike</i>.

###### Meanings of the columns

| Column      | Description          |
|-------------|----------------------|
|<i> p_spot </i>      |True, if it is an official station                   |
|<i>p_place_type </i>|                      |
|<i>datetime </i>    |Datetime of the start or end of a trip |
|<i>b_number </i>    |Bike ID                   |
|<i>trip   </i>      |Values = [first, last, start, end] <br> defines whether a row marks the start or the end of a trip|
|<i>p_uid </i>       |ID of the bike station / position                      |
|<i>p_bikes </i>     |Number of available bikes at the position                      |
|<i>p_lat   </i>     |Latitude coordinate of the position                      |
|<i>b_bike_type</i>  |Type of the used bike                      |
|<i>p_name  </i>     |Street or station name of the current position                      |
|<i>p_number  </i>   |ID of the position / bike station                      |
|<i>p_lng </i>       |Longitude coordinate of the position                      |
|<i>p_bike   </i>    |                      |



### Analyse the trip column

In [None]:
# Distinct labels that occur in the "trip" column.
trip_labels = df["trip"]
trip_labels.unique()

There are four different values in the trip column: first, last, start and end.
At least two values are required to define whether a row belongs to the starting point or to the end of a trip. This means that one trip is represented by two rows in the dataframe. One of the rows contains the values at the starting point (i.e. datetime, start position) and the other row contains the values at the ending point of the trip.

###### Why are there four values in the trip column? 

Let's have a deeper look in the dataframe and the trip column.

In [None]:
# Number of rows per "trip" label; rows labelled "start" and "end" are far more
# frequent than the remaining labels.
trip_column = df["trip"]
trip_column.value_counts()

In [None]:
# Show the first 50 rows whose "trip" label is either "first" or "last".
is_first_or_last = df["trip"].isin(["first", "last"])
df[is_first_or_last].head(50)

The filtered dataframe above shows that the rows with the values **first** and **last** in the trip column don't make much sense as trips. Most of these "trips" have an implausibly long duration: the start time is almost always 00:00 and the end time is 23:59.
Furthermore, the start and the end position of such a trip are the same.

These could be measurement errors or other data recording errors. <br>
These rows are disregarded in the next steps because they aren't suitable for further analyses.

In [None]:
# Focus on the rows with the values "start" and "end" in the trip column and
# store the start and end events of a trip in two separate dataframes.
# p_number != 0 --> keep only the events at an official bike station.
# .copy() makes both frames independent of `df`: the following cells modify them
# in place and assign new column values, which on a plain boolean-mask slice
# triggers pandas' SettingWithCopyWarning.
at_station = df["p_number"] != 0
df_start = df[(df["trip"] == "start") & at_station].copy()
df_end = df[(df["trip"] == "end") & at_station].copy()

In [None]:
# Move the current row index of both frames into a regular column so that it
# can serve as a merge key later on.
df_start = df_start.reset_index()
df_end = df_end.reset_index()

In [None]:
# Give the columns shared by both frames a "_start" / "_end" suffix so that they
# stay distinguishable after the dataframes are merged.
start_column_names = {
    "index": "index_start",
    "datetime": "datetime_start",
    "p_lat": "latitude_start",
    "p_lng": "longitude_start",
    "p_name": "p_name_start",
    "b_number": "b_number_start",
}
end_column_names = {
    "index": "index_end",
    "datetime": "datetime_end",
    "p_lat": "latitude_end",
    "p_lng": "longitude_end",
    "p_name": "p_name_end",
    "b_number": "b_number_end",
}
df_start = df_start.rename(columns=start_column_names)
df_end = df_end.rename(columns=end_column_names)

In [None]:
# Remove the columns that are not needed in the final dataframe;
# the same set is removed from the start and the end frame.
unused_columns = [
    'p_spot', 'p_place_type', 'trip',
    'p_uid', 'p_bikes', 'b_bike_type',
    'p_number', 'p_bike',
]
df_start = df_start.drop(columns=unused_columns)
df_end = df_end.drop(columns=unused_columns)

In [None]:
# Shift index_end down by one so that it can be matched against index_start in
# the merge below.
# NOTE(review): this presumes the end row of a trip directly follows its start
# row in the raw data - confirm. Re-running this cell shifts the index again.
df_end["index_end"] -= 1

In [None]:
# Combine the two separate dataframes into the final dataframe: every row
# describes one trip with the features of its start and of its end.
# Only start/end rows with matching keys are kept (inner join).
df_final = df_start.merge(
    df_end,
    how="inner",
    left_on="index_start",
    right_on="index_end",
)

In [None]:
# Sanity check of the start/end pairing: a trip must start and end with the same
# bike. List every merged row whose bike number differs between start and end;
# the result is expected to be empty.
df_final[df_final["b_number_start"] != df_final["b_number_end"]]

In [None]:
# Drop the merge keys and the duplicate bike number, then expose the remaining
# bike number under its plain name.
# `columns=` is required here: a mapping passed positionally to rename() is
# applied to the index labels, which leaves the column name untouched.
df_final = df_final.drop(columns=["index_start", "index_end", "b_number_end"])
df_final = df_final.rename(columns={"b_number_start": "b_number"})

In [None]:
# Per column: True if the column contains at least one missing value.
df_final.isnull().any()

In [None]:
# Convert the timestamp columns from objects to datetimes.
df_final["datetime_start"] = pd.to_datetime(df_final["datetime_start"])
df_final["datetime_end"] = pd.to_datetime(df_final["datetime_end"])

# Trip duration in minutes, computed from the difference of end and start time.
# total_seconds() is independent of the internal timedelta resolution and keeps
# fractions of a minute; the former "divide by 60000000000 and convert to a
# number" approach only worked for nanosecond resolution and dropped them.
df_final["trip_duration"] = (
    df_final["datetime_end"] - df_final["datetime_start"]
).dt.total_seconds() / 60

# Distance between start and end position of each trip, computed row by row.
# vincenty() returns kilometres unless miles=True is passed.
df_final["distance"] = df_final.apply(
    lambda trip: vincenty(
        [trip["latitude_start"], trip["longitude_start"]],
        [trip["latitude_end"], trip["longitude_end"]],
    ),
    axis=1,
)

# Weekday of the start time of a trip, stored as integer (0: Monday, 6: Sunday).
df_final["weekday"] = df_final["datetime_start"].dt.dayofweek

In [None]:
def isWeekend(index_of_day):
    """Tell whether a weekday index falls on the weekend.

    index_of_day: weekday as a number, 0 = Monday ... 6 = Sunday.
    Returns True for every value greater than 4 (Saturday, Sunday),
    otherwise False.
    """
    return bool(index_of_day > 4)

# New boolean column "weekend": True for trips that start on Saturday or Sunday.
df_final["weekend"] = df_final["weekday"].map(isWeekend)

In [None]:
# Split the column "datetime_start" into separate day / month / hour columns.
# The vectorised .dt accessor (already used for the weekday) replaces the
# row-by-row apply().
df_final["day"] = df_final["datetime_start"].dt.day
df_final["month"] = df_final["datetime_start"].dt.month
df_final["hour"] = df_final["datetime_start"].dt.hour

In [None]:
# Persist the processed trips as CSV and display the resulting dataframe.
processed_csv_path = '../data/processed/dortmund_trips.csv'
df_final.to_csv(processed_csv_path)
df_final

### Calculating aggregate statistics per month, per day of week and per hour of day

##### Statistics per month

In [None]:
# Mean trip duration per month. The column is selected before aggregating so
# that only "trip_duration" is averaged; averaging the whole frame first fails
# on pandas >= 2.0 because the frame also contains text columns.
df_final.groupby("month")[["trip_duration"]].mean()
# --> no data for July

In [None]:
# Month names in calendar order: position i holds the name of month i + 1.
all_month_names = np.array(["January", "February", "March", "April", "May", "June", "July",
                            "August", "September", "October", "November", "December"])

# Means (column selected before aggregating so only "trip_duration" is averaged)
monthly_mean = df_final.groupby("month")[["trip_duration"]].mean()

# Names of exactly those months that occur in the data (July is missing in this
# data set). Deriving them from the group index keeps names and rows aligned
# instead of relying on a hand-maintained list with one month left out.
month_by_name = all_month_names[monthly_mean.index.to_numpy() - 1]

monthly_mean.set_index(keys=month_by_name)

In [None]:
# Standard deviation of the trip duration per month. The column is selected
# before aggregating so that only "trip_duration" is processed (the frame also
# contains text columns, which cannot be aggregated this way).
df_final.groupby("month")[["trip_duration"]].std().set_index(keys=month_by_name)

#### Statistics per day of week

In [None]:
# Means
# Weekday names ordered like the integer weekday index (0: Monday, 6: Sunday).
weekday_by_name = np.array(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])
# Select the column before aggregating so that only "trip_duration" is averaged;
# averaging the whole frame fails on pandas >= 2.0 because of its text columns.
df_final.groupby("weekday")[["trip_duration"]].mean().set_index(weekday_by_name)

In [None]:
# Standard deviation of the trip duration per weekday, labelled with weekday names.
weekday_std = df_final.groupby("weekday")[["trip_duration"]].std()
weekday_std.set_index(weekday_by_name)

#### Statistics per hour of day

In [None]:
# Mean trip duration per hour of day. The column is selected before aggregating
# so that only "trip_duration" is averaged; averaging the whole frame fails on
# pandas >= 2.0 because of its text columns.
df_final.groupby("hour")[["trip_duration"]].mean()

In [None]:
# Standard deviation of the trip duration per hour of day.
df_final.groupby("hour")[["trip_duration"]].std()

In [None]:
# Preview the first five rows of the final trip dataframe.
df_final.head(n=5)

### Visualization of the trip distribution

#### Trip Duration

In [None]:
# `trips` is the name used for the trip dataframe in the plotting cells.
trips = df_final

# Mean trip duration per month; title and axis labels (with unit) make the
# figure readable on its own.
ax = sns.barplot(x="month", y="trip_duration", data=trips, estimator=np.mean)
ax.set(title="Mean trip duration per month", xlabel="Month", ylabel="Mean trip duration (minutes)")
plt.show()

In [None]:
# Distribution of the trip duration per month, with title and labelled axes.
ax = sns.violinplot(x="month", y="trip_duration", data=trips)
ax.set(title="Distribution of trip duration per month", xlabel="Month", ylabel="Trip duration (minutes)")
plt.show()

In [None]:
# Mean trip duration per hour of day (hour of the trip start), with title and
# labelled axes.
ax = sns.barplot(x="hour", y="trip_duration", data=trips, estimator=np.mean)
ax.set(title="Mean trip duration per hour of day", xlabel="Hour of day", ylabel="Mean trip duration (minutes)")
plt.show()
# Open question: 0 to 2 o'clock?

In [None]:
# Mean trip duration on weekdays (False) versus weekends (True), with title and
# labelled axes.
ax = sns.barplot(x="weekend", y="trip_duration", data=trips, estimator=np.mean)
ax.set(title="Mean trip duration: weekday vs. weekend", xlabel="Weekend", ylabel="Mean trip duration (minutes)")
plt.show()

#### Trip Distance

In [None]:
# Mean trip distance per month, with title and labelled axes.
# Unit: vincenty() returns kilometres unless miles=True is passed.
ax = sns.barplot(x="month", y="distance", data=trips, estimator=np.mean)
ax.set(title="Mean trip distance per month", xlabel="Month", ylabel="Mean distance (km)")
plt.show()

In [None]:
# Mean trip distance per weekday, with title and labelled axes.
# Unit: vincenty() returns kilometres unless miles=True is passed.
ax = sns.barplot(x="weekday", y="distance", data=trips, estimator=np.mean)
ax.set(title="Mean trip distance per weekday", xlabel="Weekday (0 = Monday, 6 = Sunday)", ylabel="Mean distance (km)")
plt.show()

In [None]:
# Mean trip distance on weekdays (False) versus weekends (True), with title and
# labelled axes. Unit: vincenty() returns kilometres unless miles=True is passed.
ax = sns.barplot(x="weekend", y="distance", data=trips, estimator=np.mean)
ax.set(title="Mean trip distance: weekday vs. weekend", xlabel="Weekend", ylabel="Mean distance (km)")
plt.show()

#### Amount of trips

In [None]:
# Number of trips per month, with title and labelled axes.
ax = sns.countplot(x="month", data=trips)
ax.set(title="Number of trips per month", xlabel="Month", ylabel="Number of trips")
plt.show()
# take out January?

In [None]:
# Number of trips on weekdays (False) versus weekends (True), with title and
# labelled axes.
ax = sns.countplot(x="weekend", data=trips)
ax.set(title="Number of trips: weekday vs. weekend", xlabel="Weekend", ylabel="Number of trips")
plt.show()

In [None]:
# Mean trip duration on weekdays (False) versus weekends (True).
# NOTE(review): this plots trip duration, not an amount of trips, and repeats the
# weekend figure of the "Trip Duration" section - consider removing it here.
ax = sns.barplot(x="weekend", y="trip_duration", data=trips, estimator=np.mean)
ax.set(title="Mean trip duration: weekday vs. weekend", xlabel="Weekend", ylabel="Mean trip duration (minutes)")
plt.show()

In [None]:
# Overall mean trip distance. The column is selected before aggregating:
# averaging the whole frame fails on pandas >= 2.0 because of its text columns.
df_final[["distance"]].mean()

In [None]:
# Mean trip distance per month. The column is selected before aggregating so
# that only "distance" is averaged; averaging the whole frame first fails on
# pandas >= 2.0 because of its text columns.
df_final.groupby("month")[["distance"]].mean()
# --> no data for July