Importing modules

In [1]:
import pandas as pd
import glob

Concatenating all bike rental data into one dataframe

In [2]:
# Glob pattern for every trip CSV in the data folder whose name starts with "JC".
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
path = '/Users/ethanenkhtur/Documents/Codecademy/bike-rental-starter-kit/data/JC*'

# NOTE(review): glob.glob returns matches in arbitrary (OS-dependent) order, so
# the row order of the combined frame is not guaranteed to be the same on every
# machine. The order is left untouched here because later cells rely on the
# positional index produced by this cell.
csv_files = glob.glob(path)

# Fail early with a readable message: pd.concat raises an opaque
# "No objects to concatenate" ValueError when handed an empty list.
if not csv_files:
    raise FileNotFoundError(f'No CSV files match {path!r}')

# One DataFrame per file, in the order glob returned them.
dataframes = [pd.read_csv(csv_file) for csv_file in csv_files]

# Stack all files into one frame; ignore_index=True renumbers rows 0..n-1.
df = pd.concat(dataframes, ignore_index=True)

In [3]:
# Copy the 0..n-1 index created by the concat (ignore_index=True) into an
# explicit `id` column.
df['id'] = df.index

In [4]:
# Preview the first three rows of the combined frame.
df.head(3)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,id
0,361,2016-02-01 00:31:18,2016-02-01 00:37:19,3202,Newport PATH,40.727224,-74.033759,3203,Hamilton Park,40.727596,-74.044247,24393,Subscriber,1975.0,1,0
1,297,2016-02-01 01:55:05,2016-02-01 02:00:02,3195,Sip Ave,40.730743,-74.063784,3194,McGinley Square,40.72534,-74.067622,24394,Subscriber,1985.0,2,1
2,1155,2016-02-01 02:40:05,2016-02-01 02:59:20,3183,Exchange Place,40.716247,-74.033459,3210,Pershing Field,40.742677,-74.051789,24676,Subscriber,1976.0,1,2


How much missing data there is

In [5]:
# Number of missing (NaN) values in each column.
df.isna().sum()

Trip Duration                  0
Start Time                     0
Stop Time                      0
Start Station ID               0
Start Station Name             0
Start Station Latitude         0
Start Station Longitude        0
End Station ID                 0
End Station Name               0
End Station Latitude           0
End Station Longitude          0
Bike ID                        0
User Type                    380
Birth Year                 18999
Gender                         0
id                             0
dtype: int64

In [6]:
# Column dtypes as inferred when the CSVs were read.
df.dtypes

Trip Duration                int64
Start Time                  object
Stop Time                   object
Start Station ID             int64
Start Station Name          object
Start Station Latitude     float64
Start Station Longitude    float64
End Station ID               int64
End Station Name            object
End Station Latitude       float64
End Station Longitude      float64
Bike ID                      int64
User Type                   object
Birth Year                 float64
Gender                       int64
id                           int64
dtype: object

The DataFrame needs some fixing so that every column has the right type. For example, the start and stop times are stored as strings (`object`) rather than datetimes.

Let's do that!

In [7]:
# Parse both timestamp columns from strings into datetime64 values.
for time_col in ('Start Time', 'Stop Time'):
    df[time_col] = pd.to_datetime(df[time_col])

# Show the dtypes again to confirm the conversion took effect.
df.dtypes

Trip Duration                       int64
Start Time                 datetime64[ns]
Stop Time                  datetime64[ns]
Start Station ID                    int64
Start Station Name                 object
Start Station Latitude            float64
Start Station Longitude           float64
End Station ID                      int64
End Station Name                   object
End Station Latitude              float64
End Station Longitude             float64
Bike ID                             int64
User Type                          object
Birth Year                        float64
Gender                              int64
id                                  int64
dtype: object

In [8]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric
# and datetime columns.
df.describe()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Latitude,Start Station Longitude,End Station ID,End Station Latitude,End Station Longitude,Bike ID,Birth Year,Gender,id
count,247584.0,247584,247584,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,228585.0,247584.0,247584.0
mean,885.6305,2016-07-29 05:55:07.541335296,2016-07-29 06:09:53.671073792,3207.065206,40.723121,-74.046438,3203.572553,40.722594,-74.045855,24935.260481,1979.335276,1.123534,123791.5
min,61.0,2016-01-01 00:02:52,2016-01-01 00:08:54,3183.0,40.69264,-74.096937,147.0,40.692216,-74.096937,14552.0,1900.0,0.0,0.0
25%,248.0,2016-05-27 07:46:06,2016-05-27 07:54:40.249999872,3186.0,40.717732,-74.050656,3186.0,40.71654,-74.050444,24491.0,1974.0,1.0,61895.75
50%,390.0,2016-08-10 09:23:50,2016-08-10 09:34:32.500000,3201.0,40.721525,-74.044247,3199.0,40.721124,-74.043117,24609.0,1981.0,1.0,123791.5
75%,666.0,2016-10-05 17:25:05.500000,2016-10-05 17:33:00.750000128,3211.0,40.727596,-74.038051,3211.0,40.727224,-74.036486,24719.0,1986.0,1.0,185687.25
max,16329810.0,2016-12-31 23:44:50,2017-01-18 14:26:46,3426.0,40.752559,-74.032108,3426.0,40.801343,-73.95739,27274.0,2000.0,2.0,247583.0
std,35937.98,,,26.955103,0.008199,0.011211,61.579494,0.007958,0.011283,748.469712,9.596809,0.518687,71471.488861


#### Columns to investigate further:

- `Birth Year` (suspiciously small minimum, missing data)
- `User Type` (missing data)
- `Trip Duration` (suspiciously large maximum)
- `Gender` (what are the unknowns)

First some housekeeping to remove spaces and uppercase letters from the column names.

In [9]:
# Normalise column names to snake_case: spaces become underscores, then
# everything is lower-cased (e.g. "Start Time" -> "start_time").
snake_case_names = df.columns.str.replace(' ', '_', regex=False).str.lower()
df.columns = snake_case_names

#### Investigation of `birth_year`

In [10]:
# Rider age relative to 2016, the year every trip in this dataset starts in.
# Rows with a missing birth year get a NaN age.
df['age'] = 2016 - df['birth_year']

# The ten largest ages, oldest first, to spot implausible values.
ages_descending = df['age'].sort_values(ascending=False)
ages_descending.head(10)

186245    116.0
31008      82.0
72509      79.0
184757     79.0
173695     79.0
157596     79.0
226979     76.0
143125     76.0
170642     76.0
148238     75.0
Name: age, dtype: float64

There is a single outlier of 116. It is safe to remove that record, since riders of that age are not expected.

In [11]:
# Remove records whose implied age is implausibly high.
# The rows are selected by condition rather than by the hard-coded row label
# 186245: that label depends on the order in which the CSV files were
# concatenated, and dropping a label that is already gone raises a KeyError
# when the cell is re-run. Selecting by condition makes the cell
# order-independent and idempotent.
# Rows with a missing age compare False against the threshold and are kept.
MAX_PLAUSIBLE_AGE = 100
implausible_rows = df.index[df['age'] > MAX_PLAUSIBLE_AGE]
df.drop(index=implausible_rows, inplace=True)

Now let's look at the missing `birth_year` data

In [10]:
# Show every trip whose `user_type` is missing.
# NOTE(review): the markdown just before this cell talks about missing
# `birth_year`, but the filter is on `user_type` - confirm which was intended.
user_type_missing = df['user_type'].isna()
df.loc[user_type_missing]

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender,id
8546,943,2016-08-01 09:48:21,2016-08-01 10:04:05,3214,Essex Light Rail,40.712774,-74.036486,3199,Newport Pkwy,40.728745,-74.032108,24609,,1987.0,1,8546
8634,2319,2016-08-01 12:29:23,2016-08-01 13:08:03,3199,Newport Pkwy,40.728745,-74.032108,3184,Paulus Hook,40.714145,-74.033552,24609,,1987.0,1,8634
10848,365,2016-08-03 10:46:37,2016-08-03 10:52:43,3214,Essex Light Rail,40.712774,-74.036486,3202,Newport PATH,40.727224,-74.033759,24724,,1987.0,1,10848
13241,396,2016-08-05 10:20:57,2016-08-05 10:27:34,3214,Essex Light Rail,40.712774,-74.036486,3202,Newport PATH,40.727224,-74.033759,24530,,1987.0,1,13241
13385,581,2016-08-05 14:06:10,2016-08-05 14:15:51,3202,Newport PATH,40.727224,-74.033759,3183,Exchange Place,40.716247,-74.033459,24668,,1987.0,1,13385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234992,141,2016-05-13 09:25:49,2016-05-13 09:28:11,3214,Essex Light Rail,40.712774,-74.036486,3183,Exchange Place,40.716247,-74.033459,24436,,1987.0,1,234992
235566,126,2016-05-14 12:13:39,2016-05-14 12:15:45,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,24437,,1987.0,1,235566
235577,142,2016-05-14 12:22:41,2016-05-14 12:25:04,3214,Essex Light Rail,40.712774,-74.036486,3183,Exchange Place,40.716247,-74.033459,24437,,1987.0,1,235577
237296,143,2016-05-17 08:19:54,2016-05-17 08:22:17,3214,Essex Light Rail,40.712774,-74.036486,3183,Exchange Place,40.716247,-74.033459,24606,,1987.0,1,237296


In [None]:
# Number of non-null values in each column.
# NOTE(review): the saved output of this cell shows the original Title Case
# column names and no `id`/`age` columns, so it predates the rename and the
# added columns - re-run after Restart & Run All to refresh it.
df.count()

Trip Duration              247584
Start Time                 247584
Stop Time                  247584
Start Station ID           247584
Start Station Name         247584
Start Station Latitude     247584
Start Station Longitude    247584
End Station ID             247584
End Station Name           247584
End Station Latitude       247584
End Station Longitude      247584
Bike ID                    247584
User Type                  247204
Birth Year                 228585
Gender                     247584
dtype: int64

In [None]:
# Location of the 2016 Newark airport weather CSV.
# NOTE(review): absolute, machine-specific path - the cell only runs where
# this exact directory exists.
weather_file = '/Users/ethanenkhtur/Documents/Codecademy/bike-rental-starter-kit/data/newark_airport_2016.csv'

# Load the weather observations into their own frame, separate from the trips.
weather_df = pd.read_csv(weather_file)

# Preview the first five rows.
weather_df.head()

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TSUN,WDF2,WDF5,WSF2,WSF5
0,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-01,12.75,,0.0,0.0,0.0,41,43,34,,270,280.0,25.9,35.1
1,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-02,9.4,,0.0,0.0,0.0,36,42,30,,260,260.0,21.0,25.1
2,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-03,10.29,,0.0,0.0,0.0,37,47,28,,270,250.0,23.9,30.0
3,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-04,17.22,,0.0,0.0,0.0,32,35,14,,330,330.0,25.9,33.1
4,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-05,9.84,,0.0,0.0,0.0,19,31,10,,360,350.0,25.1,31.1
