In [2]:
# Import 3rd party libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)

In [3]:
def drop_features(data,keep_columns=None,remove_columns=None):
  if(remove_columns==None):
    return data[keep_columns]
  if(keep_columns==None):
    return data.drop(columns=remove_columns)

In [4]:
trips_filenames = [filename for filename in os.listdir()]
bad_filenames = []
trips_data = pd.DataFrame()
for file in trips_filenames:
    if("bike_share" not in file):
      continue
    
    print(file)  
    temp_data = pd.read_csv(file)
    # rename columns for consistency
    if("trip_start_time" in temp_data.columns):
       temp_data = temp_data.rename(columns = {'trip_start_time':'Start Time'})
    if('from_station_id' in temp_data.columns):
       temp_data = temp_data.rename(columns = {'from_station_id':'Start Station Id'})

    # remove unnecessary columns
    temp_data = drop_features(temp_data,keep_columns = ["Start Time","Start Station Id"], remove_columns=None)

    # change "Start Time" to a datetime object
    time_zone = temp_data["Start Time"].iloc[0]
    time_zone = time_zone[len(time_zone)-4:len(time_zone)-1]
    print("Time Zone",time_zone)
    temp_data["Start Time"] = temp_data["Start Time"].str[:-6]

    try:
        temp_data["Start Time"] = pd.DatetimeIndex(temp_data["Start Time"],dayfirst=True).tz_localize(time_zone).tz_convert("EST")
    except:
        print("File:",file,"had an error in converting to DateTime")
        bad_filenames.append(file)
        continue

        
    # reduce temporal frequency to once per hour
    temp_data["num_trips"] = np.ones(len(temp_data))
    temp_data = temp_data.groupby(temp_data['Start Time'].dt.floor('H')).agg({'num_trips':'sum'})

    trips_data = pd.concat([trips_data,temp_data])
    print(trips_data.tail())

bike_share_2018-1.csv
Time Zone UTC
                           num_trips
Start Time                          
2018-01-31 19:00:00-05:00       92.0
2018-01-31 20:00:00-05:00       75.0
2018-01-31 21:00:00-05:00       73.0
2018-01-31 22:00:00-05:00       44.0
2018-01-31 23:00:00-05:00       25.0
bike_share_2018-10.csv
Time Zone UTC
                           num_trips
Start Time                          
2018-10-31 19:00:00-05:00      267.0
2018-10-31 20:00:00-05:00      231.0
2018-10-31 21:00:00-05:00      184.0
2018-10-31 22:00:00-05:00      132.0
2018-10-31 23:00:00-05:00       89.0
bike_share_2018-3.csv
Time Zone UTC
                           num_trips
Start Time                          
2018-03-31 19:00:00-05:00       37.0
2018-03-31 20:00:00-05:00       25.0
2018-03-31 21:00:00-05:00       39.0
2018-03-31 22:00:00-05:00       40.0
2018-03-31 23:00:00-05:00       22.0
bike_share_2018-4.csv
Time Zone UTC
                           num_trips
Start Time                          
2018

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data["Start Time"] = temp_data["Start Time"].str[:-6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data["Start Time"] = pd.DatetimeIndex(temp_data["Start Time"],dayfirst=True).tz_localize(time_zone).tz_convert("EST")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data["num_trips"] 

                           num_trips
Start Time                          
2018-12-31 19:00:00-05:00       17.0
2018-12-31 20:00:00-05:00        9.0
2018-12-31 21:00:00-05:00       10.0
2018-12-31 22:00:00-05:00       14.0
2018-12-31 23:00:00-05:00       16.0
bike_share_2018-8.csv
Time Zone UTC
                           num_trips
Start Time                          
2018-08-31 19:00:00-05:00      638.0
2018-08-31 20:00:00-05:00      448.0
2018-08-31 21:00:00-05:00      336.0
2018-08-31 22:00:00-05:00      246.0
2018-08-31 23:00:00-05:00      218.0
bike_share_2018-9.csv
Time Zone UTC
                           num_trips
Start Time                          
2018-09-30 19:00:00-05:00      240.0
2018-09-30 20:00:00-05:00      163.0
2018-09-30 21:00:00-05:00      118.0
2018-09-30 22:00:00-05:00       48.0
2018-09-30 23:00:00-05:00       20.0
bike_share_2018-2.csv
Time Zone UTC
                           num_trips
Start Time                          
2018-02-28 19:00:00-05:00      192.0
2018

In [5]:
safe_data = trips_data.copy().sort_values(by='Start Time').copy()
#print(safe_data)

In [6]:
print(len(safe_data))

8722


In [7]:
temp_df = pd.DataFrame()
for file in bad_filenames:
    if("bike_share" not in file):
      continue
    
    print(file)  
    temp_data = pd.read_csv(file)
    # rename columns for consistency
    if("trip_start_time" in temp_data.columns):
       temp_data = temp_data.rename(columns = {'trip_start_time':'Start Time'})
    if('from_station_id' in temp_data.columns):
       temp_data = temp_data.rename(columns = {'from_station_id':'Start Station Id'})

    # remove unnecessary columns
    temp_data = drop_features(temp_data,keep_columns = ["Start Time","Start Station Id"], remove_columns=None)
    # change "Start Time" to a datetime object
    time_zone = temp_data["Start Time"].iloc[0]
    time_zone = time_zone[len(time_zone)-4:len(time_zone)-1]
    print("Time Zone",time_zone)
    temp_data["Start Time"] = temp_data["Start Time"].str[:-6]
    temp_data['Start Time'] = pd.to_datetime(temp_data['Start Time'],dayfirst=True, errors='coerce')
    print("There are",len(temp_data["Start Time"].isna()),"unformatable rows")
    temp_data = temp_data.dropna(subset=["Start Time"])
    temp_data["Start Time"] = temp_data["Start Time"].dt.tz_localize('EST')
        
    # reduce temporal frequency to once per hour
    temp_data["num_trips"] = np.ones(len(temp_data))
    temp_data = temp_data.groupby(temp_data['Start Time'].dt.floor('H')).agg({'num_trips':'sum'})

    temp_df = pd.concat([temp_df,temp_data])
    print(temp_df.tail())

In [8]:
trips_data.to_csv('trips_data_(2_cols)_2018.csv',index=True)