# Goal
### Clean and preprocess the collected data to remove errors, handle missing values, and transform it into a suitable format for analysis. 
**Tasks**: data cleaning, data integration, data transformation etc

### *2018 Data* 

In [4]:
import pandas as pd
from datetime import datetime
# read the 2018 baywheels data
baywheels2018_df = pd.read_csv('baywheels_2018', low_memory=False, index_col=0)

In [5]:
# convert the start_time and end_time object into datetime64 objects
baywheels2018_df['start_time'] = pd.to_datetime(baywheels2018_df['start_time'], 
                                                format="Date: %d/%m/%y; Time: %H:%M:%S", errors='coerce')
baywheels2018_df['end_time'] = pd.to_datetime(baywheels2018_df['end_time'], 
                                                format="Date: %d/%m/%y; Time: %H:%M:%S", errors='coerce')

In [6]:
baywheels2018_df.dtypes

start_time           datetime64[ns]
end_time             datetime64[ns]
start_station_id             object
start_station_lat            object
start_station_lon            object
end_station_id               object
end_station_lat              object
end_station_lon              object
bike_id                      object
dtype: object

In [7]:
baywheels2018_df.head(2)

Unnamed: 0,start_time,end_time,start_station_id,start_station_lat,start_station_lon,end_station_id,end_station_lat,end_station_lon,bike_id
0,2018-04-26 18:04:41,2018-04-26 18:10:07,11.0,37.79728,-122.398436,6.0,37.80477,-122.403234,2530.0
1,2018-05-18 19:34:12,2018-05-18 19:37:57,64.0,37.7767539,-122.3990176,321.0,37.7801457035,-122.4030708524,3643.0


In [8]:
#identify the NaT values
na_start_values = baywheels2018_df['start_time'].isna()
na_start_rows = baywheels2018_df[na_start_values]
na_end_values = baywheels2018_df['end_time'].isna()
na_end_rows = baywheels2018_df[na_end_values]
na_rows = pd.concat([na_start_rows, na_end_rows])
na_rows # the rows with missing values are the same for both columns (10 rows)

Unnamed: 0,start_time,end_time,start_station_id,start_station_lat,start_station_lon,end_station_id,end_station_lat,end_station_lon,bike_id
439433,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
481488,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
689379,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
896954,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
959289,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
1291280,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
1500621,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
1525798,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
1604523,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure
1861419,NaT,NaT,Failure,Failure,Failure,Failure,Failure,Failure,Failure


In [9]:
# check the start_time column
baywheels2018_df['start_time']

0         2018-04-26 18:04:41
1         2018-05-18 19:34:12
2         2018-07-14 14:57:05
3         2018-11-11 16:35:22
4         2018-12-15 10:15:24
                  ...        
1863726   2018-05-09 08:23:45
1863727   2018-12-22 19:28:59
1863728   2018-05-05 10:49:33
1863729   2018-05-09 08:55:21
1863730   2018-02-08 18:01:18
Name: start_time, Length: 1863731, dtype: datetime64[ns]

In [10]:
# drop the NaT values
baywheels2018_df.dropna(subset=['start_time'], inplace=True)

In [11]:
# check the start_time column after dropna
baywheels2018_df['start_time']

0         2018-04-26 18:04:41
1         2018-05-18 19:34:12
2         2018-07-14 14:57:05
3         2018-11-11 16:35:22
4         2018-12-15 10:15:24
                  ...        
1863726   2018-05-09 08:23:45
1863727   2018-12-22 19:28:59
1863728   2018-05-05 10:49:33
1863729   2018-05-09 08:55:21
1863730   2018-02-08 18:01:18
Name: start_time, Length: 1863721, dtype: datetime64[ns]

### *2019 Data*

In [54]:
# read the 2019 baywheels data
baywheels2019_df = pd.read_csv('baywheels_2019', low_memory=False, index_col=0)

In [55]:
baywheels2019_df.head(2)

Unnamed: 0,start_time,end_time,start_station_id,start_station_lat,start_station_lon,end_station_id,end_station_lat,end_station_lon,bike_id
0,2019-10-28 09:46:47,2019-10-28 09:51:05,50.0,37.780526,-122.390288,453.0,37.7779336701,-122.3969730735,12424.0
1,2019-08-10 19:31:02,2019-08-10 19:35:05,285.0,37.7835208353,-122.4311578274,74.0,37.7764348192,-122.4262440205,1718.0


In [56]:
# number of rows before dropna
start_time_na_values = baywheels2019_df['start_time'].isna()
start_time_na_values

0          False
1          False
2          False
3          False
4          False
           ...  
2506998    False
2506999    False
2507000    False
2507001    False
2507002    False
Name: start_time, Length: 2507003, dtype: bool

In [57]:
# convert times into datetime64 objects
baywheels2019_df['start_time'] = pd.to_datetime(baywheels2019_df['start_time'], errors='coerce') 
baywheels2019_df['end_time'] = pd.to_datetime(baywheels2019_df['end_time'], errors='coerce')
                                               

In [58]:
# handle missing values
na_rows_start_time = baywheels2019_df['start_time'].isna()
na_rows_end_time = baywheels2019_df['end_time'].isna()

na_start_time = baywheels2019_df[na_rows_start_time]
na_end_time = baywheels2019_df[na_rows_end_time]
na_rows_time = pd.concat([na_start_time,na_end_time])

baywheels2019_df.dropna(subset='start_time', inplace=True)
baywheels2019_df['start_time'].info

<bound method Series.info of 0         2019-10-28 09:46:47
1         2019-08-10 19:31:02
2         2019-05-27 13:18:58
3         2019-07-24 22:47:58
4         2019-04-28 17:26:22
                  ...        
2506998   2019-04-12 19:11:48
2506999   2019-07-16 19:02:32
2507000   2019-05-29 16:03:26
2507001   2019-08-19 08:32:17
2507002   2019-04-02 17:00:36
Name: start_time, Length: 2506993, dtype: datetime64[ns]>