<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/bysykkel_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bysykkel Oslo

## Uploading packages and data

In [1]:
#Importing necessary packages
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
#import seaborn as sns

In [None]:
#Remove previous versions of the uploaded excel file
!rm 2021_04_bysykkel.csv

In [None]:
#Uploading file from local drive
#`uploaded` is indexed by file name further down to get the raw content of the csv
from google.colab import files
uploaded = files.upload()

Saving 2021_04_bysykkel.csv to 2021_04_bysykkel.csv


In [None]:
#Storing dataset in a Pandas Dataframe
#The timestamp columns are read as plain strings here and converted explicitly in the data cleaning section.
#parse_dates=True would only try to parse the index and infer_datetime_format is deprecated, so neither is passed.
import io
by_df = pd.read_csv(io.BytesIO(uploaded['2021_04_bysykkel.csv']))

In [None]:
#Checking the dataframe information (column dtypes and non-null counts)
by_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116864 entries, 0 to 116863
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   started_at                 116864 non-null  object 
 1   ended_at                   116864 non-null  object 
 2   duration                   116864 non-null  int64  
 3   start_station_id           116864 non-null  int64  
 4   start_station_name         116864 non-null  object 
 5   start_station_description  116858 non-null  object 
 6   start_station_latitude     116864 non-null  float64
 7   start_station_longitude    116864 non-null  float64
 8   end_station_id             116864 non-null  int64  
 9   end_station_name           116864 non-null  object 
 10  end_station_description    116858 non-null  object 
 11  end_station_latitude       116864 non-null  float64
 12  end_station_longitude      116864 non-null  float64
dtypes: float64(4), int64(3), obje

## Data cleaning and manipulation

In [None]:
#Convert the timestamp columns into datetime objects in the Oslo timezone.
#No `format` is passed: the raw strings carry fractional seconds and a UTC offset, which a plain
#'%Y-%m-%d %H:%M:%S' pattern does not describe (strict parsers reject the mismatch).
#utc=True yields timezone-aware UTC values, which .dt.tz_convert needs before switching to Europe/Oslo.
by_df['started_at'] = pd.to_datetime(by_df['started_at'], utc=True).dt.tz_convert('Europe/Oslo')
by_df['ended_at'] = pd.to_datetime(by_df['ended_at'], utc=True).dt.tz_convert('Europe/Oslo')
by_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116864 entries, 0 to 116863
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype                      
---  ------                     --------------   -----                      
 0   started_at                 116864 non-null  datetime64[ns, Europe/Oslo]
 1   ended_at                   116864 non-null  datetime64[ns, Europe/Oslo]
 2   duration                   116864 non-null  int64                      
 3   start_station_id           116864 non-null  int64                      
 4   start_station_name         116864 non-null  object                     
 5   start_station_description  116858 non-null  object                     
 6   start_station_latitude     116864 non-null  float64                    
 7   start_station_longitude    116864 non-null  float64                    
 8   end_station_id             116864 non-null  int64                      
 9   end_station_name           116864 non

In [None]:
#Checking the dataframe head (first two rows) after the datetime conversion
by_df.head(2)

Unnamed: 0,started_at,ended_at,duration,start_station_id,start_station_name,start_station_description,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_description,end_station_latitude,end_station_longitude
0,2021-04-01 05:14:29.999000+02:00,2021-04-01 05:24:46.822000+02:00,616,407,Sagene bussholdeplass,langs Kierschovs gate,59.937743,10.751648,527,Biskop Gunnerus' gate,ved Oslo City,59.912334,10.752292
1,2021-04-01 05:38:10.860000+02:00,2021-04-01 05:49:05.972000+02:00,655,744,Hallénparken,ved Vogts gate,59.93153,10.762169,522,Mandalls gate,ved Grønlandsleiret,59.912347,10.763815


In [None]:
#Splitting the trips into two dataframes: one describing where/when each trip started,
#one describing where/when it ended. Both keep the trip duration.
st_df = by_df[[
    'started_at',
    'start_station_id',
    'start_station_name',
    'start_station_latitude',
    'start_station_longitude',
    'duration',
]]
en_df = by_df[[
    'ended_at',
    'end_station_id',
    'end_station_name',
    'end_station_latitude',
    'end_station_longitude',
    'duration',
]]

In [None]:
#Helper that derives calendar categories from the datetime held in a dataframe's first column
def convert_dates(df):
  """Return a copy of df with year, month, weeknum, day, hour and minute columns added.

  All values are read from the first column of df, which must be datetime-like:
  month and day are names (e.g. 'April', 'Thursday'), weeknum is the ISO week number.
  """
  stamps = df.iloc[:, 0].dt
  derived = {
      'year': stamps.year,
      'month': stamps.month_name(),
      'weeknum': stamps.isocalendar().week,
      'day': stamps.day_name(),
      'hour': stamps.hour,
      'minute': stamps.minute,
  }
  return df.assign(**derived)

In [None]:
#Converting dates for starting journeys dataframe
#convert_dates reads the first column, which is 'started_at' for st_df
st_df = convert_dates(st_df)
st_df.head(2)

Unnamed: 0,started_at,start_station_id,start_station_name,start_station_latitude,start_station_longitude,duration,year,month,weeknum,day,hour,minute
0,2021-04-01 05:14:29.999000+02:00,407,Sagene bussholdeplass,59.937743,10.751648,616,2021,April,13,Thursday,5,14
1,2021-04-01 05:38:10.860000+02:00,744,Hallénparken,59.93153,10.762169,655,2021,April,13,Thursday,5,38


In [None]:
#Converting dates for end journeys dataframe
#convert_dates reads the first column, which is 'ended_at' for en_df
en_df = convert_dates(en_df)
en_df.head(2)

Unnamed: 0,ended_at,end_station_id,end_station_name,end_station_latitude,end_station_longitude,duration,year,month,weeknum,day,hour,minute
0,2021-04-01 05:24:46.822000+02:00,527,Biskop Gunnerus' gate,59.912334,10.752292,616,2021,April,13,Thursday,5,24
1,2021-04-01 05:49:05.972000+02:00,522,Mandalls gate,59.912347,10.763815,655,2021,April,13,Thursday,5,49


In [None]:
#Number of trips starting at each station (station name + coordinates), busiest first
st_loc = (
    st_df
    .groupby(['start_station_name', 'start_station_latitude', 'start_station_longitude'])
    .agg({'duration': 'count'})
    .reset_index()
    .rename(columns={
        'duration': 'trips',
        'start_station_name': 'name',
        'start_station_latitude': 'lat',
        'start_station_longitude': 'lon',
    })
    .sort_values(by='trips', ascending=False)
    .reset_index(drop=True)
)
st_loc.head(10)

Unnamed: 0,start_station_name,start_station_latitude,start_station_longitude,trips
0,Ringnes Park,59.928434,10.75943,1570
1,Alexander Kiellands Plass,59.928067,10.751203,1522
2,Majorstuen,59.929045,10.716926,1289
3,Bislettgata,59.923774,10.734713,1242
4,Rådhusbrygge 4,59.910847,10.730377,1189
5,Bislett Stadion,59.925471,10.731219,1185
6,Tjuvholmen,59.909467,10.722509,1149
7,Olaf Ryes plass,59.922425,10.758182,1100
8,Storo Storsenter,59.94671,10.773805,1094
9,Marcus Thranes gate,59.932772,10.758595,1082


In [None]:
#Number of trips ending at each station (station name + coordinates), busiest first
en_loc = (
    en_df
    .groupby(['end_station_name', 'end_station_latitude', 'end_station_longitude'])
    .agg({'duration': 'count'})
    .reset_index()
    .rename(columns={
        'duration': 'trips',
        'end_station_name': 'name',
        'end_station_latitude': 'lat',
        'end_station_longitude': 'lon',
    })
    .sort_values(by='trips', ascending=False)
    .reset_index(drop=True)
)
en_loc.head(10)

Unnamed: 0,end_station_name,end_station_latitude,end_station_longitude,trips
0,Rådhusbrygge 4,59.910847,10.730377,2007
1,Tjuvholmen,59.909467,10.722509,1645
2,Olaf Ryes plass,59.922425,10.758182,1577
3,Ringnes Park,59.928434,10.75943,1571
4,Alexander Kiellands Plass,59.928067,10.751203,1543
5,Bjørvika,59.909006,10.75618,1392
6,Sjøsiden ved trappen,59.910154,10.751981,1322
7,Helga Helgesens plass,59.912111,10.766194,1267
8,Majorstuen,59.929045,10.716926,1185
9,Jernbanetorget,59.911901,10.749929,1148


In [None]:
#Exporting the start location counts to csv and downloading the file to local disk
from google.colab import files
st_loc.to_csv('2021_04_st_loc.csv', index=False) #==> Excluding index from file
files.download('2021_04_st_loc.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Exporting the end location counts to csv and downloading the file to local disk
from google.colab import files
en_loc.to_csv('2021_04_en_loc.csv', index=False) #==> Excluding index from file
files.download('2021_04_en_loc.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Visualization

In [None]:
#Remove previous versions of the uploaded csv file
!rm 2021_04_st_loc.csv

In [3]:
#Uploading the aggregated start location file from local drive
#`uploaded1` is indexed by file name when the csv is read into a dataframe
from google.colab import files
uploaded1 = files.upload()

Saving 2021_04_st_loc.csv to 2021_04_st_loc.csv


In [6]:
#Reading aggregated data and storing into a Pandas Dataframe
#This rebinds st_loc to the content of the uploaded 2021_04_st_loc.csv
import io
st_loc = pd.read_csv(io.BytesIO(uploaded1['2021_04_st_loc.csv']))

In [7]:
#Drop the station name column, keeping lat, lon and trips for the heat map
#errors='ignore' lets this cell be re-run after 'name' has already been dropped
st_loc = st_loc.drop(columns='name', errors='ignore')
st_loc.head()

Unnamed: 0,lat,lon,trips
0,59.928434,10.75943,1570
1,59.928067,10.751203,1522
2,59.929045,10.716926,1289
3,59.923774,10.734713,1242
4,59.910847,10.730377,1189


In [10]:
#Helper that turns the lat, lon and trips columns into a nested list. Folium does not take DataFrames
def convert_to_list(df):
  """Return one [lat, lon, trips] list per row of df, with every value cast to float"""
  columns = [df[name].to_numpy(dtype=float) for name in ('lat', 'lon', 'trips')]
  return [list(triple) for triple in zip(*columns)]

In [11]:
#Convert lat, lon and weight (number of trips) to a list and preview the first five entries
heat_data = convert_to_list(st_loc)
heat_data[:5]

[[59.92843404417578, 10.75943014633117, 1570.0],
 [59.928066706156855, 10.751202636819613, 1522.0],
 [59.929045, 10.716926, 1289.0],
 [59.92377440938114, 10.734712874573999, 1242.0],
 [59.9108469, 10.7303766, 1189.0]]

In [12]:
#Uploading necessary packages
import folium
from folium import plugins
from folium.plugins import HeatMap

In [13]:
#Helper that builds the empty map the heat map layer is drawn on
def generate_basemap(default_loc=[59.91,10.75], default_zoom=12):
    """Return a new folium map.

    default_loc: [latitude, longitude] passed to folium as the map location.
    default_zoom: value passed to folium as zoom_start.
    """
    return folium.Map(location=default_loc, zoom_start=default_zoom)

In [14]:
#Generating a basemap with the default location and zoom of generate_basemap
basemap = generate_basemap()

In [15]:
#Creating a HeatMap with lat, lon and number of trips and attaching it to the basemap
#NOTE(review): re-running this cell presumably adds a second heat layer to the same basemap - regenerate the basemap first
HeatMap(heat_data).add_to(basemap)

<folium.plugins.heat_map.HeatMap at 0x7fa9ef5a0e10>

In [16]:
#Plotting basemap for starting point of trips (the bare expression makes the notebook render the map)
basemap