# Semester Project - Nextbike
## Task 2 - Visualization

### a) For the summer month (i.e., June, August, or September) with most trips, visualize the number of started trips per PLZ region (you’ll have to find geo data for that yourselves!) in a map.

In [None]:
from nextbike.visualization import Visualization
from nextbike.preprocessing import Preprocessing
from nextbike.io import input
from nextbike.constants import CONSTANTS

import geopandas as gpd
import pandas as pd

#### Get the geodata of Dortmund

Source of geodata: https://www.suche-postleitzahl.org/plz-karte-erstellen

In [None]:
# Load the geodata of Germany (postal codes and their areas/polygons) from the GeoJSON file.
# NOTE(review): presumably returned as a GeoDataFrame — confirm in nextbike.io.input.
districts_germany = input.__read_geojson(geojson="germany_postalcodes.geojson")
# Preview the first rows.
districts_germany.head()

In [None]:
# Prepare the nationwide postal-code geodata for the spatial joins below.
# NOTE(review): presumably this narrows the data down to the Dortmund districts — confirm in Preprocessing.
df = Preprocessing.__prep_geo_data(districts_germany)

#### Get the trip data (from Task 1)

In [None]:
# Load the processed Dortmund trip data (the result of Task 1).
df_trip = input.read_csv(loc= "processed", name= "dortmund_trips.csv")

##### Convert start positions of trip data to geographical points

In [None]:
# Go through every row and build a point from its lat and lon (via the project helper).
df_trip["geometry"] = df_trip.apply(Preprocessing.__make_point, axis=1)
# No CRS is assigned at this point: on a plain pandas DataFrame,
# `df_trip.crs = ...` only sets an ad-hoc attribute (pandas warns about it)
# and has no effect on the geometries. The CRS is passed explicitly when the
# GeoDataFrame is constructed from this frame.
# Additional column of ones, used for the aggregations.
df_trip["count"] = 1

In [None]:
df_trip.head()

In [None]:
# Convert the trip dataset to a GeoDataFrame so it can be spatially joined
# with the GeoDataFrame of Dortmund later on.
# The CRS is given as an authority string: the {'init': 'epsg:4326'} dict
# form is deprecated and makes pyproj/geopandas emit a FutureWarning.
geo_trips = gpd.GeoDataFrame(df_trip, crs="EPSG:4326", geometry=df_trip.geometry)

#### Calculate amount of trips per station

##### Group the data per station

In [None]:
# Number of trips for every combination of start coordinates and station name.
station_keys = ["latitude_start", "longitude_start", "p_name_start"]
trips_per_station = df_trip.groupby(station_keys)[["count"]].count()

In [None]:
trips_per_station

Some stations are listed multiple times (the same station name appears with different coordinates), so we have to merge these entries.

In [None]:
# Turn the group keys (coordinates and station name) back into regular columns.
trips_per_station = trips_per_station.reset_index()

In [None]:
# Collapse to one row per station name: add up the trip counts and average
# the coordinates of all entries that share the same name.
station_aggregations = {
    'count': 'sum',
    'latitude_start': 'mean',
    'longitude_start': 'mean',
}
trips_per_station = trips_per_station.groupby("p_name_start").agg(station_aggregations)

In [None]:
# Make the station name a regular column again.
trips_per_station = trips_per_station.reset_index()

In [None]:
trips_per_station

#### Visualize the stations and their amount of rentals

In [None]:
# Draw the station map from the per-station trip counts and coordinates.
Visualization.show_station_map(trips_per_station)

This map shows each station in Dortmund as a red circle. The bigger the circle, the higher the demand.
Click on a circle for more information.

#### Get the summer month with the most trips

In [None]:
# Summer months considered by the task (there is no data for July).
SUMMER_MONTHS = [6, 8, 9]

# Number of rentals per month (non-null values per column, indexed by month number).
trips_per_month = df_trip.groupby("month").count()

# Restrict the comparison to the summer months: taking the maximum over all
# months could report a non-summer month, which is not what the task asks for.
summer_trips = trips_per_month.loc[trips_per_month.index.isin(SUMMER_MONTHS), 'datetime_start']

# Month number of the summer month with the highest number of trips.
maxValue = summer_trips.idxmax()

print("Summer month number", maxValue, "has the highest value")

June has the most trips

In [None]:
# Select the trips that started in June (month number 6) ...
started_in_june = geo_trips["month"] == 6
# ... and keep only the columns needed for the spatial join and the counting.
geo_trips_june = geo_trips[started_in_june][['geometry', 'count']]

In [None]:
# Spatial left join: every June trip keeps its row and receives the attributes
# of the district polygon that contains its start point (POINT within POLYGON).
# NOTE(review): the `op` keyword was renamed to `predicate` in newer geopandas
# releases — confirm which geopandas version this notebook targets.
trips_with_postalcode_june = gpd.sjoin(geo_trips_june, df, how='left', op='within')

In [None]:
trips_with_postalcode_june

This dataframe shows the trips with the geographical location and the postal code of their start station.

##### Calculate the amount of trips per postal code

In [None]:
# Remove the join bookkeeping column and the district attributes that are not needed here.
unused_columns = ["index_right", "note", "qkm", "einwohner"]
trips_with_postalcode_june = trips_with_postalcode_june.drop(columns=unused_columns)

In [None]:
trips_with_postalcode_june.head()

In [None]:
# Number of June trips per postal code.
amount_per_postalcode = trips_with_postalcode_june.groupby("plz")[["count"]].count()

In [None]:
amount_per_postalcode

In [None]:
# Attach the trip count to every district. The left merge keeps districts
# without any trips (their count is NaN), and indicator=True adds a `_merge`
# column that marks those rows as 'left_only'.
district_with_count = df.merge(amount_per_postalcode, on='plz', how='left', indicator=True)

In [None]:
district_with_count

This dataframe shows the geo data/postalcodes of dortmund with the respective amount of rentals

Some rows show left_only/NaN: no rentals were started in these districts, or there are no stations there.

#### Visualize the amount of rentals per postalcode for the month June in Dortmund

In [None]:
# Draw the June map from the districts and their trip counts.
Visualization.show_rental_for_june(district_with_count)

This map shows each district/postal code in Dortmund with its demand. The demand can be differentiated by color.
Hover over the districts for more information.

### Visualize daily heatmap 

### Get the postalcode for each trip

In [None]:
# Spatial left join over all trips: each trip receives the attributes of the
# Dortmund district polygon that contains its start point (POINT within POLYGON).
# NOTE(review): the `op` keyword was renamed to `predicate` in newer geopandas
# releases — confirm which geopandas version this notebook targets.
trips_with_postalcode = gpd.sjoin(geo_trips, df, how='left', op='within')

In [None]:
# Trip and district attributes that are not used in the following steps.
columns_to_drop = [
    "datetime_start", "b_number", "p_name_start", "datetime_end",
    "latitude_end", "p_name_end", "longitude_end", "trip_duration",
    "distance", "weekday", "weekend", "day",
    "index_right", "note", "qkm", "einwohner",
]
trips_with_postalcode = trips_with_postalcode.drop(columns=columns_to_drop)
trips_with_postalcode.head()

In [None]:
# Build the input for the daily heatmap from the trips with postal codes.
# NOTE(review): the exact output structure is defined in Preprocessing — not visible here.
time_data = Preprocessing.__get_time_delta(trips_with_postalcode)

In [None]:
# Render the daily heatmap from the time data and the district geodata.
Visualization.show_time_heatmap(df=time_data, df2=df)

This heatmap shows the distribution of the demand (per station) at different times of day.
Use the slider to select a time period.

### Visualize monthly heatmap

Unlike the previous heatmap, this one is not built per station but per postal code.
However, we had to map each district to its center, because otherwise it could not be visualized. The heat therefore spreads from the center of one district to the next.

In [None]:
# Build the input for the monthly heatmap from the trips with postal codes.
# NOTE(review): the exact output structure is defined in Preprocessing — not visible here.
data = Preprocessing.__get_month_data(trips_with_postalcode)

In [None]:
# Render the monthly per-district heatmap from the monthly data and the district geodata.
Visualization.show_heatmap_monthly_per_district(df= data, df2= df)

### b) For one moment in time, visualize the number of bikes at fixed stations meaningfully.

#### Create dataframe with bike numbers of every station

##### Create a column for every station in the dataset

In [None]:
# Result is indexed below with the keys 'df_raw' (the event log that is iterated)
# and 'station_bikenumbers' (the matrix that gets filled).
# NOTE(review): presumably a dict of DataFrames — confirm in Preprocessing.
df_station = Preprocessing.__prep_for_visualization()

#### Fill the dataframe by going through the eventlog

In [None]:
# Walk through the event log and record the number of available bikes per
# station and hour. Later events in the same hour overwrite earlier ones.
event_log = df_station['df_raw']
bike_matrix = df_station['station_bikenumbers']
for _, event in event_log.iterrows():
    # Round the event time up to the full hour and use it as the row label.
    event_time = pd.to_datetime(event.get(key = 'datetime'))
    hour_label = event_time.ceil('H').strftime('%Y-%m-%d-%H')
    station_name = event.get(key = 'p_name')
    bike_matrix.at[hour_label, station_name] = pd.to_numeric(event.get(key = 'p_bikes'))

####  Fill NaN values with last available value

In [None]:
bike_matrix = df_station['station_bikenumbers']
# Carry the last known bike count of each station forward in time.
# (`ffill` replaces the deprecated `fillna(method='ffill')`.)
bike_matrix.ffill(inplace=True)
# Entries before a station's first record have nothing to carry forward.
# Fill them with the number 0.0: the previous string '0.0' mixed text into
# the numeric columns.
bike_matrix.fillna(value=0.0, inplace=True)
bike_matrix

This matrix shows the amount of available bikes at each station at different times

#### Generate a station list with geo coordinates

In [None]:
# Station list with coordinates: drop the trip counts and use neutral
# column names so the list can be merged on the station name.
stations = (
    trips_per_station
    .drop(columns=["count"])
    .rename(columns={
        'p_name_start': 'p_name',
        'latitude_start': 'latitude',
        'longitude_start': 'longitude',
    })
)
stations

#### Visualization of one moment

In [None]:
# Row label of the moment to visualize.
moment = CONSTANTS.FILTER_FOR_ONE_MOMENT.value

# Take the bike counts of all stations at that moment and reshape them into
# a two-column frame: station name and number of bikes.
data = (
    pd.DataFrame(df_station['station_bikenumbers'].loc[moment])
    .reset_index()
    .rename(columns={'index': 'p_name', moment: 'NumberOfBikes'})
)
data.head()

In [None]:
# Add the station coordinates to the bike counts (inner join on the station name).
bikenumber_one_moment = data.merge(stations, on="p_name")
bikenumber_one_moment.head()

In [None]:
# Visualize the bike counts of the selected moment.
# NOTE(review): this receives the nationwide `districts_germany` rather than
# the prepared `df` used by the other maps — confirm this is intended.
Visualization.show_one_moment(df= districts_germany,df2=bikenumber_one_moment)

In [None]:
# Visualize the bike counts of the selected moment on a map.
# NOTE(review): this receives the nationwide `districts_germany` rather than
# the prepared `df` used by the other maps — confirm this is intended.
Visualization.show_one_moment_at_map(df= districts_germany,df2=bikenumber_one_moment)

### c) Create a heatmap based on an interesting aspect of the data, e.g., end locations of trips shortly before the start of a major public event.

In [None]:
# Start/end station details and start time of every trip.
# Note: this rebinds `df_station`, which held the bike-number data in part b).
station_columns = [
    "p_name_start", "latitude_start", "longitude_start",
    "p_name_end", "latitude_end", "longitude_end",
    "coordinates_start", "coordinates_end", "datetime_start",
]
df_station = df_trip[station_columns]

In [None]:
df_station.head()

In [None]:
# Trip amount passed to the visualizations below.
# NOTE(review): presumably an upper limit on the number of trips drawn — confirm in Visualization.
AMOUNT_OF_TRIPS = 20000

In [None]:
# NOTE(review): judging by the name, this shows the most used stations — confirm in Visualization.
Visualization.most_used_station(df_station, AMOUNT_OF_TRIPS)

In [None]:
# NOTE(review): judging by the name, this shows the station capacities — confirm in Visualization.
Visualization.station_capacity(df_station)

In [None]:
# NOTE(review): judging by the name, this draws the trips themselves — confirm in Visualization.
Visualization.show_trips(df_station, AMOUNT_OF_TRIPS)

In [None]:
# Map for one specific day (2019-01-20) and location ("Signal Iduna Park").
# NOTE(review): how `street` is matched against the trips is defined in Visualization — not visible here.
Visualization.show_map_at_specific_day(df_station, date="2019-01-20", street="Signal Iduna Park")

### d) Visualize the distribution of trip lengths per month. Compare the distributions to normal distributions with mean and standard deviation as calculated before (1.d))

#### Visualization of the trip distribution

##### Trip duration

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Tick labels for the weekday plots (7 entries).
weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Tick labels for the month plots: 11 entries, without July (no data for July).
month_names = ["January","February","March","April","May","June","August","September","October","November","December"]

In [None]:
fig_dims = (12, 8)
fig, ax = plt.subplots(figsize=fig_dims)

# Mean trip duration per month.
sns.barplot(x="month", y="trip_duration", data=df_trip, estimator=np.mean, ax=ax)
# One tick per label. month_names has only 11 entries (no July), so a fixed
# list of 12 positions does not match the labels, and matplotlib >= 3.3
# raises a ValueError for such a mismatch.
plt.xticks(range(len(month_names)), month_names)
plt.title("Trip duration per month")
plt.show()

In [None]:
# Mean trip duration per weekday.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x="weekday",
    y="trip_duration",
    data=df_trip,
    estimator=np.mean,
    ax=ax,
)
ax.set_xticks([0, 1, 2, 3, 4, 5, 6])
ax.set_xticklabels(weekday_names)
ax.set_title("Trip duration per weekday")
plt.show()

##### Trip duration per hour

In [None]:
# Mean trip duration per hour of the day.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x="hour",
    y="trip_duration",
    data=df_trip,
    estimator=np.mean,
    ax=ax,
)
ax.set_title("Trip duration per hour")
plt.show()

There is a surprising drop between 0:00 and 2:00.

##### Trip duration on weekdays and weekends

In [None]:
# Mean trip duration on weekdays versus weekends, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x="weekend",
    y="trip_duration",
    data=df_trip,
    estimator=np.mean,
    ax=ax,
)
ax.set_title("Trip duration on weekdays and weekends")
plt.show()

##### Trip distance per month

In [None]:
fig_dims = (12, 8)
fig, ax = plt.subplots(figsize=fig_dims)

# Mean trip distance per month.
sns.barplot(x="month", y="distance", data=df_trip, estimator=np.mean, ax=ax)
# One tick per label. month_names has only 11 entries (no July), so a fixed
# list of 12 positions does not match the labels, and matplotlib >= 3.3
# raises a ValueError for such a mismatch.
plt.xticks(range(len(month_names)), month_names)
plt.title("Trip distance per month")
plt.show()

##### Trip distance per weekday

In [None]:
# Mean trip distance per weekday.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="weekday",
    y="distance",
    data=df_trip,
    estimator=np.mean,
    ax=ax,
)
ax.set_title("Trip distance per day")
ax.set_xticks([0, 1, 2, 3, 4, 5, 6])
ax.set_xticklabels(weekday_names)
plt.show()

##### Trip distance per hour

In [None]:
# Mean trip distance per hour of the day.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="hour",
    y="distance",
    data=df_trip,
    estimator=np.mean,
    ax=ax,
)
ax.set_title("Trip distance per hour")
plt.show()

##### Trip distance on weekdays and weekends

In [None]:
# Mean trip distance on weekdays versus weekends.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="weekend",
    y="distance",
    data=df_trip,
    estimator=np.mean,
    ax=ax,
)
ax.set_title("Trip distance on weekdays and weekends")
plt.show()

##### Amount of trips per month

In [None]:
fig_dims = (12, 8)
fig, ax = plt.subplots(figsize=fig_dims)

# Number of trips per month.
sns.countplot(x="month", data=df_trip, ax=ax)
# One tick per label. month_names has only 11 entries (no July), so a fixed
# list of 12 positions does not match the labels, and matplotlib >= 3.3
# raises a ValueError for such a mismatch.
plt.xticks(range(len(month_names)), month_names)
plt.title("Amount of trips per month")
plt.show()

Data recording began on the 20th of January, which explains the low number of trips in January.

##### Amount of trips per weekday

In [None]:
# Number of trips per weekday, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x="weekday", data=df_trip, ax=ax)
ax.set_title("Amount of trips per weekday")
ax.set_xticks([0, 1, 2, 3, 4, 5, 6])
ax.set_xticklabels(weekday_names)
plt.show()

##### Amount of trips per hour

In [None]:
# Number of trips per hour of the day, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x="hour", data=df_trip, ax=ax)
ax.set_title("Amount of trips per hour")
plt.show()