In [None]:
# This project is for the assignment in the Udacity course "Programming for Data Science with Python"
# Author: Chen Wang
# Date: 2023-08-07
# Library: time, pandas numpy

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
CITY_DATA = { 'chicago': './chicago.csv',
              'nyc': './new_york_city.csv',
              'washington': './washington.csv' }

def get_filters():
    """
    Asks user to specify a city, month, and day to analyze.

    Returns:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    print('Hello! Let\'s explore some US bikeshare data!')

    # get user input for city (chicago, new york city, washington). HINT: Use a while loop to handle invalid inputs
    while True:
        city = input('Enter the city name (chicago, nyc, washington): ').lower()
        if city in CITY_DATA:
            break
        else:
            print('Invalid city name. Please choose from chicago, nyc, or washington.')

    # get user input for month (all, january, february, ... , june)
    while True:
        month = input('Enter the month (january, february, ..., december) or "all" for no month filter: ').lower()
        months_list = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'all']
        if month in months_list:
            break
        else:
            print('Invalid month name. Please choose a month from january to december or enter "all".')

    # get user input for day of week (all, monday, tuesday, ... sunday)
    while True:
        day = input('Enter the day of the week (monday, tuesday, ..., sunday) or "all" for no day filter: ').lower()
        days_list = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'all']
        if day in days_list:
            break
        else:
            print('Invalid day name. Please choose a day from monday to sunday or enter "all".')

    print('-'*40)
    return city, month, day

In [3]:
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """
    # Load the data for the specified city into a DataFrame

    df = pd.read_csv(CITY_DATA[city])

    # Convert the "Start Time" column to datetime format to easily extract month and day
    df['Start Time'] = pd.to_datetime(df['Start Time'])

    # Extract month and day of the week from the "Start Time" column
    df['Month'] = df['Start Time'].dt.month_name()
    df['Day'] = df['Start Time'].dt.day_name()
    df['Hour'] = df['Start Time'].dt.hour

    # Filter by month if applicable
    if month != 'all':
        df = df[df['Month'] == month.capitalize()]

    # Filter by day of the week if applicable
    if day != 'all':
        df = df[df['Day'] == day.capitalize()]

    return df


In [6]:
def display_data(df):
    """
    Ask the user if they want to see 5 lines of raw data,
    Display that data if the answer is 'yes',
    Continue iterating these prompts and displaying the next 5 lines of raw data at each iteration,
    Stop the program when the user says 'no' or there is no more raw data to display.
    """ 
    # Create an iterator to traverse the data in steps of 5 rows
    data_iterator = iter(range(0, len(df), 5))

    while True:
        try:
            # Get the next 5 rows using the iterator
            start_idx = next(data_iterator)
            print("\nShow the first 5 rows of raw data after row number {}.\n".format(start_idx))
            print(df.iloc[start_idx : start_idx + 5])
        except StopIteration:
            # If there is no more raw data to display
            print("No more data to display!")
            break
            
        restart = input('\nWould you like to see the next 5 lines? Enter yes or no.\n').lower()
        if restart != 'yes':
            break

In [9]:
def time_stats(df):
    """Displays statistics on the most frequent times of travel."""

    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()
    

    # Display the most common month
    most_common_month = df['Month'].mode()[0]
    print("The most common month for travel is: {}".format(most_common_month))

    # Display the most common day of the week
    most_common_day = df['Day'].mode()[0]
    print("The most common day of the week for travel is: {}".format(most_common_day))

    # Display the most common start hour
    most_common_hour = df['Hour'].mode()[0]
    print("The most common start hour for travel is: {}:00".format(most_common_hour))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)


In [11]:
def station_stats(df):
    """Displays statistics on the most popular stations and trip."""

    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    # display most commonly used start station
    most_common_start_station = df['Start Station'].mode()[0]
    print("The most common start station for travel is: {}".format(most_common_start_station))

    # display most commonly used end station
    most_common_end_station = df['End Station'].mode()[0]
    print("The most common end station for travel is: {}".format(most_common_end_station))

    # display most frequent combination of start station and end station trip
    most_common_combo = df.groupby(['Start Station', 'End Station']).size().idxmax()
    print("The most frequent combination of start station and end station is:")
    print("Start Station: {}".format(most_common_combo[0]))
    print("End Station: {}".format(most_common_combo[1]))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)


In [13]:
def trip_duration_stats(df):
    """Displays statistics on the total and average trip duration."""

    print('\nCalculating Trip Duration...\n')
    start_time = time.time()

    # display total travel time
    total_travel_time= df['Trip Duration'].sum()
    print("The total travel time is: {}".format(total_travel_time))
    
    # display mean travel time
    mean_travel_time= df['Trip Duration'].mean()
    print("The average travel time is: {}".format(mean_travel_time))
    
    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [15]:
def user_stats(df):
    """Displays statistics on bikeshare users."""

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # Display counts of user types
    user_type_counts = df['User Type'].value_counts()
    print("Counts of User Types:")
    print(user_type_counts)

    # Display counts of gender
    # Need to check gender info because washington do not have gender info
    if 'Gender' in df:
        gender_counts = df['Gender'].value_counts()
        print("\nCounts of Gender:")
        print(gender_counts)
    else:
        print("\nThere is no gender information in this dataset.")
    
    # Display earliest, most recent, and most common year of birth
    # Need to check birth year info because washington do not have birth year info
    if 'Birth Year' in df:
        earliest_birth_year = df['Birth Year'].min()
        most_recent_birth_year = df['Birth Year'].max()
        most_common_birth_year = df['Birth Year'].mode()[0]
        print("\nEarliest Birth Year: {}".format(int(earliest_birth_year)))
        print("Most Recent Birth Year: {}".format(int(most_recent_birth_year)))
        print("Most Common Birth Year: {}".format(int(most_common_birth_year)))
    else:
        print("\nThere is no birth year information in this dataset.")
    
    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)


In [17]:
#This code below is amazing!!
def main():
    while True:
        city, month, day = get_filters()
        df = load_data(city, month, day)
        display_data(df)

        time_stats(df)
        station_stats(df)
        trip_duration_stats(df)
        user_stats(df)

        restart = input('\nWould you like to restart? Enter yes or no.\n')
        if restart.lower() != 'yes':
            break

# what does the command below mean?

if __name__ == "__main__":
    main()


Hello! Let's explore some US bikeshare data!
Enter the city name (chicago, nyc, washington): chicago
Enter the month (january, february, ..., december) or "all" for no month filter: all
Enter the day of the week (monday, tuesday, ..., sunday) or "all" for no day filter: all
----------------------------------------

Show the first 5 rows of raw data after row number 0.

   Unnamed: 0          Start Time             End Time  Trip Duration  \
0     1423854 2017-06-23 15:09:32  2017-06-23 15:14:53            321   
1      955915 2017-05-25 18:19:03  2017-05-25 18:45:53           1610   
2        9031 2017-01-04 08:27:49  2017-01-04 08:34:45            416   
3      304487 2017-03-06 13:49:38  2017-03-06 13:55:28            350   
4       45207 2017-01-17 14:53:07  2017-01-17 15:02:01            534   

                   Start Station                   End Station   User Type  \
0           Wood St & Hubbard St       Damen Ave & Chicago Ave  Subscriber   
1            Theater on the Lake 