# In [2]:
# Import the modules
import pandas as pd
from datetime import datetime

# Build the list of monthly data files to read in and process.
# File names follow the pattern 'YYYYMM-citibike-tripdata.csv', one per
# month, ordered chronologically from January of start_year through
# December of end_year.
start_year = 2014
end_year = 2023  # inclusive
csv_list = [
    # ':02d' zero-pads the month so January becomes '01', not '1'
    f'{year}{month:02d}-citibike-tripdata.csv'
    for year in range(start_year, end_year + 1)
    for month in range(1, 13)
]
        
#################################################################

# Running accumulator: the stats computed for each month are concatenated
# onto this (initially empty) DataFrame.
Citidata_df = pd.DataFrame()

#################################################################

# The datasets do not agree on column headers. Each list below holds every
# spelling handled for one logical column.
start_column_names_list = [
    'starttime',
    'started_at',
    'Start Time',
]
stop_column_names_list = [
    'stoptime',
    'ended_at',
    'Stop Time',
]
usertype_column_names_list = [
    'usertype',
    'member_casual',
    'User Type',
]
duration_column_names_list = [
    'tripduration',
    'Trip Duration',
]

# The single name used for each variable of interest once a dataset has
# been standardized.
start_time_chosen_name = 'starttime'
stop_time_chosen_name = 'stoptime'
user_type_chosen_name = 'usertype'
duration_chosen_name = 'tripduration'

#################################################################
#    Function that standardizes the following:
#        'starttime' column
#        'stoptime' column
#        'usertype' column - it will standardize the user type to 'member' or 'casual'
#        'tripduration' column - it will create this column if not in the dataset

def standardize_data(df):
    """Return a copy of one month's trip data with standardized columns.

    Three things are normalized:

    * Any known spelling of the start-time, stop-time, user-type and
      trip-duration columns is renamed to its chosen name.
    * The user-type values 'Subscriber' and 'Customer' are mapped to
      'member' and 'casual'; all other values are left untouched.
    * If no trip-duration column is present, the start and stop columns are
      parsed as datetimes and the duration is computed from them in seconds.

    Args:
        df: DataFrame holding one dataset's raw trip records.

    Returns:
        A new DataFrame with the standardized column names and values.

    Raises:
        KeyError: If the user-type column is missing, or if the duration has
            to be derived and a start/stop column is missing.
    """
    # Collect every alternate spelling that is actually present and rename
    # them all in a single pass.
    alias_groups = (
        (start_column_names_list, start_time_chosen_name),
        (stop_column_names_list, stop_time_chosen_name),
        (usertype_column_names_list, user_type_chosen_name),
        (duration_column_names_list, duration_chosen_name),
    )
    renames = {
        alias: chosen_name
        for aliases, chosen_name in alias_groups
        for alias in aliases
        if alias in df.columns
    }
    # rename() returns a new frame, so the assignments below never touch
    # the caller's DataFrame.
    df = df.rename(columns=renames)

    # Map the legacy labels on every row.  This is applied unconditionally
    # rather than by peeking at the first row: the first value may be
    # missing or already normalized, and an empty frame has no first row.
    df[user_type_chosen_name] = df[user_type_chosen_name].replace(
        {'Subscriber': 'member', 'Customer': 'casual'}
    )

    # If the dataset carries no duration column, derive it (in seconds)
    # from the start and stop timestamps.
    if duration_chosen_name not in df.columns:
        df[start_time_chosen_name] = pd.to_datetime(df[start_time_chosen_name])
        df[stop_time_chosen_name] = pd.to_datetime(df[stop_time_chosen_name])
        elapsed = df[stop_time_chosen_name] - df[start_time_chosen_name]
        df[duration_chosen_name] = elapsed / pd.Timedelta(seconds=1)

    return df

#################################################################
#  Function that runs through a month of daily data and acquires the following stats columns:
#      'Start Day'
#      'Member Count'
#      'Casual Count'
#      'Member Duration Avg'
#      'Casual Duration Avg'

def create_monthly_stats(df):
    """Summarize standardized trip data into one row of stats per start day.

    Args:
        df: Standardized trip DataFrame containing the chosen start-time,
            user-type and trip-duration columns (duration in seconds).
            A 'Start Day' column (the date part of the start time) is added
            to this frame in place.

    Returns:
        DataFrame with one row per distinct start day, in order of first
        appearance, and the columns:
            'Start Day'            - the calendar date
            'Member Count'         - number of 'member' trips that day
            'Casual Count'         - number of 'casual' trips that day
            'Member Duration Avg'  - mean member trip duration in minutes
            'Casual Duration Avg'  - mean casual trip duration in minutes
        Averages are rounded to 2 decimals and are NaN for a day on which
        that rider type has no trips.
    """
    # Eliminate the time portion of datetime
    df['Start Day'] = pd.to_datetime(df[start_time_chosen_name]).dt.date

    # Choose only the type of user, the start day, and the trip duration
    narrowed_df = df[[user_type_chosen_name, 'Start Day', duration_chosen_name]]

    # One entry per distinct day found in the data, in order of appearance
    day_list = narrowed_df['Start Day'].unique()

    def _daily_summary(rider_type):
        """Return (trip counts, average minutes) aligned with day_list."""
        if len(day_list) == 0:
            # Nothing to aggregate; keep empty input cheap and safe.
            return [], []
        rides = narrowed_df[narrowed_df[user_type_chosen_name] == rider_type]
        # A single group-by replaces one full-frame scan per day.
        per_day = rides.groupby('Start Day')[duration_chosen_name]
        # size() counts rows (including trips with a missing duration);
        # days without trips of this rider type get a count of 0.
        trip_counts = per_day.size().reindex(day_list, fill_value=0)
        # Durations are in seconds; report the average in minutes.
        avg_minutes = (per_day.mean().reindex(day_list) / 60).round(2)
        return trip_counts.to_numpy(), avg_minutes.to_numpy()

    member_counts, member_avg_minutes = _daily_summary('member')
    casual_counts, casual_avg_minutes = _daily_summary('casual')

    # create the new df of stats, including the start date
    stats = pd.DataFrame({'Start Day': day_list,
                          'Member Count': member_counts,
                          'Casual Count': casual_counts,
                          'Member Duration Avg': member_avg_minutes,
                          'Casual Duration Avg': casual_avg_minutes,
                         })
    return stats
    
##############################################################
# main code to loop through all of the months of data in csv_list

# Process every monthly file in csv_list and collect its daily stats.
# The frames are gathered in a list and concatenated once after the loop,
# so the accumulated data is not re-copied on every iteration.
monthly_stats_frames = []
for file_path in csv_list:

    # Read the csv file into a DataFrame.
    # Warnings occurred concerning mismatching data types and suggested
    # using 'low_memory=False'.
    by_month_df = pd.read_csv(file_path, low_memory=False)

    # call function to standardize column names and values across different formats
    standard_month_df = standardize_data(by_month_df)

    # send the standardized dataframe to create the stats for that month
    by_month_stats_df = create_monthly_stats(standard_month_df)

    monthly_stats_frames.append(by_month_stats_df)

# Accumulate/concatenate all monthly stats onto the accumulator dataframe
Citidata_df = pd.concat([Citidata_df, *monthly_stats_frames], ignore_index=True)

# Sort the full dataframe by Start Day
Citidata_df_sorted = Citidata_df.sort_values(by='Start Day')

# Finally, save the full stats file to csv
Citidata_df_sorted.to_csv('data_2014_2023.csv', index=False)