<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/bysykkel_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bysykkel Oslo

## Importing packages and uploading data

In [1]:
#Importing necessary packages
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Remove previous versions of the uploaded excel file
!rm 2021_04_bysykkel.csv

In [3]:
#Uploading file from local drive
#NOTE: google.colab is only importable inside a Colab runtime, so this cell is Colab-specific
#The object returned by files.upload() is indexed by file name when the CSV is read further down
from google.colab import files
uploaded = files.upload()

Saving 2021_04_bysykkel.csv to 2021_04_bysykkel.csv


In [4]:
#Storing dataset in a Pandas Dataframe
#The timestamp columns are deliberately read as plain strings here and converted
#explicitly in the data cleaning section. The former parse_dates=True only targets the
#index (so it never touched these columns) and infer_datetime_format is deprecated in pandas 2.x
import io
by_df = pd.read_csv(io.BytesIO(uploaded['2021_04_bysykkel.csv']))

In [5]:
#Checking the dataframe information (column dtypes and non-null counts) right after loading
by_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116864 entries, 0 to 116863
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   started_at                 116864 non-null  object 
 1   ended_at                   116864 non-null  object 
 2   duration                   116864 non-null  int64  
 3   start_station_id           116864 non-null  int64  
 4   start_station_name         116864 non-null  object 
 5   start_station_description  116858 non-null  object 
 6   start_station_latitude     116864 non-null  float64
 7   start_station_longitude    116864 non-null  float64
 8   end_station_id             116864 non-null  int64  
 9   end_station_name           116864 non-null  object 
 10  end_station_description    116858 non-null  object 
 11  end_station_latitude       116864 non-null  float64
 12  end_station_longitude      116864 non-null  float64
dtypes: float64(4), int64(3), obje

## Data cleaning and manipulation

In [7]:
#Convert the timestamp columns into datetime objects expressed in the Oslo timezone
#No explicit format is passed: the values carry fractional seconds and a UTC offset
#(see the displayed rows), which '%Y-%m-%d %H:%M:%S' does not describe; pandas >= 2.0
#rejects such a mismatch instead of silently parsing the remainder
#utc=True normalises every value to UTC first, so tz_convert always receives tz-aware data
for ts_col in ['started_at', 'ended_at']:
  by_df[ts_col] = pd.to_datetime(by_df[ts_col], utc=True).dt.tz_convert('Europe/Oslo')
by_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116864 entries, 0 to 116863
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype                      
---  ------                     --------------   -----                      
 0   started_at                 116864 non-null  datetime64[ns, Europe/Oslo]
 1   ended_at                   116864 non-null  datetime64[ns, Europe/Oslo]
 2   duration                   116864 non-null  int64                      
 3   start_station_id           116864 non-null  int64                      
 4   start_station_name         116864 non-null  object                     
 5   start_station_description  116858 non-null  object                     
 6   start_station_latitude     116864 non-null  float64                    
 7   start_station_longitude    116864 non-null  float64                    
 8   end_station_id             116864 non-null  int64                      
 9   end_station_name           116864 non

In [8]:
#Checking the first 2 rows of the dataframe after the datetime conversion
by_df.head(2)

Unnamed: 0,started_at,ended_at,duration,start_station_id,start_station_name,start_station_description,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_description,end_station_latitude,end_station_longitude
0,2021-04-01 05:14:29.999000+02:00,2021-04-01 05:24:46.822000+02:00,616,407,Sagene bussholdeplass,langs Kierschovs gate,59.937743,10.751648,527,Biskop Gunnerus' gate,ved Oslo City,59.912334,10.752292
1,2021-04-01 05:38:10.860000+02:00,2021-04-01 05:49:05.972000+02:00,655,744,Hallénparken,ved Vogts gate,59.93153,10.762169,522,Mandalls gate,ved Grønlandsleiret,59.912347,10.763815


In [12]:
#Splitting the trips into 2 dataframes: one describing the start of each trip, one describing its end
#Both keep the trip duration; the timestamp is the first column of each frame
start_cols = ['started_at', 'start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude', 'duration']
end_cols = ['ended_at', 'end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'duration']
st_df = by_df[start_cols]
en_df = by_df[end_cols]

In [32]:
#Creating a function to extract categories from datetime object
def convert_dates(df, date_col=None):
  """Return a copy of ``df`` with calendar columns derived from a datetime column.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the datetime column to break down.
  date_col : label, optional
      Name of the datetime column to read. When omitted (None), the first
      column of ``df`` (by position) is used.

  Returns
  -------
  pandas.DataFrame
      A new frame (``df`` itself is not modified) with these columns added,
      or overwritten if they already exist:
      ``year``, ``month`` (month name), ``weeknum`` (ISO week number),
      ``day`` (weekday name), ``hour`` and ``minute``.

  Raises
  ------
  KeyError
      If ``date_col`` is given but is not a column of ``df``.
  AttributeError
      Raised by pandas when the selected column is not datetime-like.

  Notes
  -----
  ``weeknum`` follows the ISO calendar while ``year`` is the calendar year,
  so dates in early January can show week 52/53 next to the new year.
  """
  # Pick the source column once instead of re-selecting it for every new column
  stamps = df.iloc[:, 0] if date_col is None else df[date_col]
  parts = stamps.dt
  return df.assign(
      year=parts.year,
      month=parts.month_name(),
      weeknum=parts.isocalendar().week,
      day=parts.day_name(),
      hour=parts.hour,
      minute=parts.minute,
  )

In [33]:
#Adding the calendar breakdown columns to the starting journeys dataframe
st_df = st_df.pipe(convert_dates)
st_df.head(2)

Unnamed: 0,started_at,start_station_id,start_station_name,start_station_latitude,start_station_longitude,duration,year,month,weeknum,day,hour,minute
0,2021-04-01 05:14:29.999000+02:00,407,Sagene bussholdeplass,59.937743,10.751648,616,2021,April,13,Thursday,5,14
1,2021-04-01 05:38:10.860000+02:00,744,Hallénparken,59.93153,10.762169,655,2021,April,13,Thursday,5,38


In [34]:
#Adding the calendar breakdown columns to the end journeys dataframe
en_df = en_df.pipe(convert_dates)
en_df.head(2)

Unnamed: 0,ended_at,end_station_id,end_station_name,end_station_latitude,end_station_longitude,duration,year,month,weeknum,day,hour,minute
0,2021-04-01 05:24:46.822000+02:00,527,Biskop Gunnerus' gate,59.912334,10.752292,616,2021,April,13,Thursday,5,24
1,2021-04-01 05:49:05.972000+02:00,522,Mandalls gate,59.912347,10.763815,655,2021,April,13,Thursday,5,49
