In [1]:
#Import package pandas for data analysis
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

In [2]:
# Column labels for each table; they are handed to read_csv through `names`.
dynamic_columns = ['StationNumber', 'StationName', 'AvailableBikes', 'AvailableBikeStands', 'Date', 'Time']
static_columns = ['StationNumber', 'StationName', 'Address', 'Latitude', 'Longitude', 'Banking']
weather_columns = ['Date', 'Time', 'Rainfall', 'Temperature', 'Icon', 'WindSpeed']

# Bike and stand availability per station, with the date and time of each reading
dynamic = pd.read_csv('db_backup/07.04/BikeData_DynamicData.csv', names=dynamic_columns)
# Station details (name, address, coordinates, banking flag)
static = pd.read_csv('db_backup/07.04/BikeData_StaticData.csv', names=static_columns)
# Weather readings (rainfall, temperature, icon, wind speed)
weather = pd.read_csv('db_backup/07.04/BikeData_WeatherData.csv', names=weather_columns)

# Static Data

In [3]:
# Displaying the first five rows of the static table
static.head()

Unnamed: 0,StationNumber,StationName,Address,Latitude,Longitude,Banking
0,42,SMITHFIELD NORTH,Smithfield North,53.349562,-6.278198,1
1,30,PARNELL SQUARE NORTH,Parnell Square North,53.353462,-6.265305,1
2,54,CLONMEL STREET,Clonmel Street,53.336021,-6.26298,0
3,108,AVONDALE ROAD,Avondale Road,53.359405,-6.276142,0
4,56,MOUNT STREET LOWER,Mount Street Lower,53.33796,-6.24153,0


In [4]:
# Dimensions of the static table as (rows, columns)
static.shape

(110, 6)

- There are 110 stations.

In [5]:
# Number of missing values in each column of the static table
static.isnull().sum()

StationNumber    0
StationName      0
Address          0
Latitude         0
Longitude        0
Banking          0
dtype: int64

In [6]:
# Creating a sorted array with the station numbers.
# np.sort returns a sorted copy. Sorting the result of to_numpy() in place is
# unsafe because that array can share memory with `static`: the sort would then
# reorder the StationNumber column on its own and detach it from the other
# columns of the table.
station_numbers = np.sort(static["StationNumber"].to_numpy())

In [7]:
# Creating a DataFrame that holds only the station numbers, stored as a categorical column
avg_station = pd.DataFrame(
    data=station_numbers,
    columns=['StationNumber'],
    dtype='category',
)
avg_station

Unnamed: 0,StationNumber
0,2
1,3
2,4
3,5
4,6
...,...
105,113
106,114
107,115
108,116


In [8]:
# Confirming that `static` is a pandas DataFrame
type(static)

pandas.core.frame.DataFrame

In [9]:
# Confirming that `avg_station` is a pandas DataFrame as well
type(avg_station)

pandas.core.frame.DataFrame

# Weather

In [10]:
# Displaying the weather table (the output is truncated to the first and last rows)
weather

Unnamed: 0,Date,Time,Rainfall,Temperature,Icon,WindSpeed
0,2020-02-21,13:00:05,0.0,10.1,,
1,2020-02-21,13:30:02,0.0,10.2,,
2,2020-02-21,14:00:02,0.0,10.2,,
3,2020-02-22,14:30:01,0.0,7.4,,
4,2020-02-22,15:00:02,0.0,7.4,,
...,...,...,...,...,...,...
2063,2020-04-07,07:30:02,0.0,8.0,partly-cloudy-day,10.86
2064,2020-04-07,08:00:02,0.0,8.2,partly-cloudy-day,12.09
2065,2020-04-07,08:30:02,0.0,8.4,partly-cloudy-day,13.23
2066,2020-04-07,09:00:02,0.0,8.7,partly-cloudy-day,14.26


In [11]:
# Number of missing values in each column of the weather table
weather.isnull().sum()

Date             0
Time             0
Rainfall         0
Temperature      0
Icon           387
WindSpeed      388
dtype: int64

# Dynamic Data

In [12]:
# StationNumber is an identifier, not a quantity: casting it to object keeps it
# out of the descriptive statistics, which are computed on int64 columns only.
dynamic['StationNumber'] = dynamic['StationNumber'].astype('object')

In [13]:
# Converting Date to datetime.
# pd.to_datetime is used rather than astype('datetime64'): the unit-less
# 'datetime64' dtype is rejected by astype in pandas 2.x, while to_datetime
# yields the same datetime64[ns] column on every pandas version.
dynamic['Date'] = pd.to_datetime(dynamic['Date'])

In [14]:
# Displaying the dynamic table (the output is truncated to the first and last rows)
dynamic

Unnamed: 0,StationNumber,StationName,AvailableBikes,AvailableBikeStands,Date,Time
0,42,SMITHFIELD NORTH,2,28,2020-02-21,12:52:58
1,30,PARNELL SQUARE NORTH,1,19,2020-02-21,12:54:29
2,54,CLONMEL STREET,25,8,2020-02-21,12:54:39
3,108,AVONDALE ROAD,4,36,2020-02-21,12:50:23
4,56,MOUNT STREET LOWER,27,13,2020-02-21,12:47:41
...,...,...,...,...,...,...
1343418,39,WILTON TERRACE,5,15,2020-04-07,09:34:05
1343419,83,EMMET ROAD,19,20,2020-04-07,09:30:33
1343420,92,HEUSTON BRIDGE (NORTH),19,20,2020-04-07,09:30:36
1343421,21,LEINSTER STREET SOUTH,15,15,2020-04-07,09:32:02


In [15]:
# Rows that repeat an earlier row in every column (the first occurrence is not counted)
n_duplicates = dynamic.duplicated().sum()
print('Number of duplicate (excluding first) rows in the table is: ', n_duplicates)

Number of duplicate (excluding first) rows in the table is:  575827


### Checking for duplicate rows

In [16]:
# Boolean mask: True for every row that repeats an earlier row
dupl = dynamic.duplicated()
# Keep only the flagged rows so they can be inspected
new = dynamic[dupl]

new.head(200)

Unnamed: 0,StationNumber,StationName,AvailableBikes,AvailableBikeStands,Date,Time
110,30,PARNELL SQUARE NORTH,1,19,2020-02-21,12:54:29
111,54,CLONMEL STREET,25,8,2020-02-21,12:54:39
112,108,AVONDALE ROAD,4,36,2020-02-21,12:50:23
119,13,FITZWILLIAM SQUARE WEST,21,9,2020-02-21,12:53:25
120,43,PORTOBELLO ROAD,0,30,2020-02-21,12:50:26
...,...,...,...,...,...,...
791,63,FENIAN STREET,13,21,2020-02-21,13:21:23
792,113,MERRION SQUARE SOUTH,19,21,2020-02-21,13:22:39
803,73,FRANCIS STREET,0,30,2020-02-21,13:23:05
804,4,GREEK STREET,4,16,2020-02-21,13:20:37


In [17]:
# First 200 rows of the dynamic table, to inspect alongside the rows flagged as duplicates
dynamic.head(200)

Unnamed: 0,StationNumber,StationName,AvailableBikes,AvailableBikeStands,Date,Time
0,42,SMITHFIELD NORTH,2,28,2020-02-21,12:52:58
1,30,PARNELL SQUARE NORTH,1,19,2020-02-21,12:54:29
2,54,CLONMEL STREET,25,8,2020-02-21,12:54:39
3,108,AVONDALE ROAD,4,36,2020-02-21,12:50:23
4,56,MOUNT STREET LOWER,27,13,2020-02-21,12:47:41
...,...,...,...,...,...,...
195,10,DAME STREET,6,10,2020-02-21,12:59:42
196,100,HEUSTON BRIDGE (SOUTH),13,12,2020-02-21,12:58:07
197,24,CATHAL BRUGHA STREET,3,17,2020-02-21,12:59:36
198,64,SANDWITH STREET,20,19,2020-02-21,12:58:51


### Dropping duplicates

In [18]:
# Removing the repeated rows; rows are compared across all columns
dynamic = dynamic.drop_duplicates()

In [19]:
# Displaying the dynamic table after the duplicates were removed
dynamic

Unnamed: 0,StationNumber,StationName,AvailableBikes,AvailableBikeStands,Date,Time
0,42,SMITHFIELD NORTH,2,28,2020-02-21,12:52:58
1,30,PARNELL SQUARE NORTH,1,19,2020-02-21,12:54:29
2,54,CLONMEL STREET,25,8,2020-02-21,12:54:39
3,108,AVONDALE ROAD,4,36,2020-02-21,12:50:23
4,56,MOUNT STREET LOWER,27,13,2020-02-21,12:47:41
...,...,...,...,...,...,...
1343414,40,JERVIS STREET,3,18,2020-04-07,09:34:23
1343415,29,ORMOND QUAY UPPER,14,15,2020-04-07,09:37:12
1343416,103,GRANGEGORMAN LOWER (SOUTH),2,38,2020-04-07,09:38:12
1343417,28,MOUNTJOY SQUARE WEST,7,23,2020-04-07,09:35:51


In [20]:
# Re-counting the duplicated rows to confirm that none are left
n_duplicates = dynamic.duplicated().sum()
print('Number of duplicate (excluding first) rows in the table is: ', n_duplicates)

Number of duplicate (excluding first) rows in the table is:  0


In [21]:
# Resetting the index so the rows are numbered consecutively again after
# dropping the duplicates (drop=True discards the old index instead of keeping it as a column)
dynamic = dynamic.reset_index(drop=True)

In [22]:
# Number of missing values in each column of the dynamic table
dynamic.isnull().sum()

StationNumber          0
StationName            0
AvailableBikes         0
AvailableBikeStands    0
Date                   0
Time                   0
dtype: int64

In [23]:
# Data type of each column in the dynamic table
dynamic.dtypes

StationNumber                  object
StationName                    object
AvailableBikes                  int64
AvailableBikeStands             int64
Date                   datetime64[ns]
Time                           object
dtype: object

In [24]:
# Descriptive statistics for the int64 columns, one row per column
numeric_cols = dynamic.select_dtypes(include=['int64']).columns
dynamic.loc[:, numeric_cols].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AvailableBikes,767596.0,11.820447,8.876169,0.0,5.0,11.0,17.0,40.0
AvailableBikeStands,767596.0,20.187855,10.415964,0.0,13.0,20.0,28.0,41.0


### Adding Total Bike Stands to the dataframe

### Adding DayOfWeek column to the dataframe


In [25]:
# Day of the week for every reading, derived from the Date column
# (0 = Monday ... 6 = Sunday).
# The .dt accessor computes this for the whole column at once and keeps the
# index of `dynamic`, so the new column lines up with its rows whatever the
# index looks like.
date = dynamic['Date']
day_of_week = date.dt.dayofweek.astype('category')
dynamic['DayOfWeek'] = day_of_week

In [26]:
# Displaying the dynamic table with the new DayOfWeek column
dynamic

Unnamed: 0,StationNumber,StationName,AvailableBikes,AvailableBikeStands,Date,Time,DayOfWeek
0,42,SMITHFIELD NORTH,2,28,2020-02-21,12:52:58,4
1,30,PARNELL SQUARE NORTH,1,19,2020-02-21,12:54:29,4
2,54,CLONMEL STREET,25,8,2020-02-21,12:54:39,4
3,108,AVONDALE ROAD,4,36,2020-02-21,12:50:23,4
4,56,MOUNT STREET LOWER,27,13,2020-02-21,12:47:41,4
...,...,...,...,...,...,...,...
767591,40,JERVIS STREET,3,18,2020-04-07,09:34:23,1
767592,29,ORMOND QUAY UPPER,14,15,2020-04-07,09:37:12,1
767593,103,GRANGEGORMAN LOWER (SOUTH),2,38,2020-04-07,09:38:12,1
767594,28,MOUNTJOY SQUARE WEST,7,23,2020-04-07,09:35:51,1


In [27]:
# One sub-frame per weekday; the DayOfWeek codes run from 0 (monday) to 6 (sunday).
monday, tuesday, wednesday, thursday, friday, saturday, sunday = [
    dynamic[dynamic['DayOfWeek'] == day_code] for day_code in range(7)
]

# The same frames collected in weekday order
days_of_week = [monday, tuesday, wednesday, thursday, friday, saturday, sunday]


In [28]:
week_days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

# Station numbers in the row order of avg_station. The per-day statistics are
# lined up on these numbers rather than on row position, so a station with no
# readings on some weekday gets NaN instead of shifting the values of the
# stations that follow it.
station_ids = avg_station['StationNumber'].astype('int64').to_numpy()

for day_name, day_frame in zip(week_days, days_of_week):
    # Aggregate AvailableBikes only: summing the remaining columns (names,
    # dates, times, weekday category) is meaningless and fails on recent pandas.
    station_key = day_frame['StationNumber'].astype('int64')
    day_stats = (
        day_frame.groupby(station_key)['AvailableBikes']
        .agg(['sum', 'count'])
        .reindex(station_ids)
    )

    sum_day_of_week = day_stats['sum'].to_numpy()
    count_day_of_week = day_stats['count'].to_numpy()
    # Average number of available bikes, rounded to a whole number
    avg_day_of_week = (sum_day_of_week / count_day_of_week).round(0)

    avg_station['sum_' + day_name] = sum_day_of_week
    avg_station['count_' + day_name] = count_day_of_week
    avg_station['avg_' + day_name] = avg_day_of_week

avg_station.shape

(110, 22)

In [29]:
# Displaying the per-station sum, count and average of available bikes for each weekday
avg_station

Unnamed: 0,StationNumber,sum_monday,count_monday,avg_monday,sum_tuesday,count_tuesday,avg_tuesday,sum_wednesday,count_wednesday,avg_wednesday,...,avg_thursday,sum_friday,count_friday,avg_friday,sum_saturday,count_saturday,avg_saturday,sum_sunday,count_sunday,avg_sunday
0,2,6415,898,7.0,7858,934,8.0,9350,972,10.0,...,10.0,7491,996,8.0,5735,1011,6.0,6002,998,6.0
1,3,6421,939,7.0,6254,991,6.0,6907,1062,7.0,...,6.0,6710,1047,6.0,9483,1041,9.0,7345,1011,7.0
2,4,9753,916,11.0,7912,952,8.0,7349,1018,7.0,...,7.0,7760,1012,8.0,9027,973,9.0,11583,967,12.0
3,5,15215,1037,15.0,16929,1098,15.0,19261,1188,16.0,...,15.0,17403,1199,15.0,9832,1076,9.0,12107,1073,11.0
4,6,4254,908,5.0,2773,955,3.0,2813,1001,3.0,...,3.0,5602,1006,6.0,9873,1025,10.0,10735,1006,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,113,8354,877,10.0,10381,909,11.0,9396,953,10.0,...,10.0,9211,960,10.0,3254,939,3.0,3283,941,3.0
106,114,10333,931,11.0,10848,987,11.0,11983,1039,12.0,...,13.0,9365,1034,9.0,6204,980,6.0,7247,998,7.0
107,115,17999,961,19.0,16986,1004,17.0,17602,1071,16.0,...,16.0,14035,1083,13.0,19413,1082,18.0,22485,1063,21.0
108,116,3446,824,4.0,3509,756,5.0,3325,731,5.0,...,5.0,4930,886,6.0,4701,871,5.0,3875,880,4.0


In [None]:
# Rounded average of available bikes per station and weekday as a plain dictionary:
# {station_number: [{'Monday': avg}, {'Tuesday': avg}, ...], ...}
# The result is named avg_by_station so that the built-in `dict` type is not overwritten.
avg_by_station = {
    int(station): [
        {day.capitalize(): float(avg_station.at[row, 'avg_' + day])}
        for day in week_days
    ]
    for row, station in avg_station['StationNumber'].items()
}

In [30]:
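# Sketch of the intended dictionary: one entry per station number, holding a list with one {weekday: value} pair per day, e.g.
# {2:[{Monday:2}, {Tuesday:3}, {Wednesday:3}, {Thursday:3}, {Friday:3}, {Saturday:3}, {Sunday:3}], 3: ...}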