In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
from math import radians
import seaborn as sns
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

### Create DataFrame

In [None]:
# Locate the data folder relative to the notebook's working directory
current_directory = Path.cwd()
data_directory = Path(current_directory, 'data')
zip_file_path = Path(data_directory, '202403-citibike-tripdata.csv.zip')

# Extract the archive into the data folder and remember which CSV members it
# contained. Using the archive's own member list (instead of listing the whole
# folder) keeps unrelated CSV files that already sit in data/ out of the result.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(data_directory)
    csv_files = sorted(
        name for name in zip_ref.namelist()
        if name.endswith('.csv')
        # Skip macOS resource-fork entries, which are not real CSV data
        and not name.startswith('__MACOSX/')
        and not Path(name).name.startswith('._')
    )

# Fail with a clear message rather than pd.concat's "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {zip_file_path}")

# Read each extracted CSV file into a DataFrame (sorted order keeps the row
# order of the combined frame reproducible between runs)
dfs = [pd.read_csv(Path(data_directory, name)) for name in csv_files]

# Concatenate all DataFrames in the list
biketrips = pd.concat(dfs, ignore_index=True)

In [None]:
# Print a summary of the combined frame (columns, non-null counts, dtypes)
biketrips.info()

### Flag trips over 30 min

In [None]:
# Convert start & end to datetime objects
biketrips['started_at'] = pd.to_datetime(biketrips['started_at'])
biketrips['ended_at'] = pd.to_datetime(biketrips['ended_at'])

# Exact trip duration as a timedelta
trip_duration = biketrips['ended_at'] - biketrips['started_at']

# Duration in minutes, rounded to 2 decimals for display/reporting
biketrips['time_difference'] = (trip_duration.dt.total_seconds() / 60).round(2)

# Flag trips longer than 30 minutes. The comparison uses the unrounded
# duration: comparing the rounded minutes would turn e.g. 30.004 min into
# 30.0 and wrongly leave such a trip unflagged.
biketrips['trips_over_30_min'] = trip_duration > pd.Timedelta(minutes=30)

### Question 1: Number of trips covered by insurance

In [None]:
# Summing the boolean flag counts the trips that exceeded 30 minutes
trips_over_30_min = biketrips['trips_over_30_min'].sum()
print(f"In March, {trips_over_30_min} bike trips would be covered by insurance.")

### Question 2: Revenue

In [None]:
# Fee charged for every trip exceeding 30 minutes, in USD
FEE_PER_TRIP_USD = 0.2

# Round to cents: the raw float product can carry binary floating-point noise
# (e.g. 3 * 0.2 == 0.6000000000000001), which should not leak into the report.
revenue = round(trips_over_30_min * FEE_PER_TRIP_USD, 2)
print(f"By charging {FEE_PER_TRIP_USD} USD per trip exceeding 30 min, we anticipate revenue of {revenue:,.2f} USD.")

In [None]:
# Re-inspect the frame now that 'time_difference' and 'trips_over_30_min' exist
biketrips.info()
# Last expression of the cell: displays the first rows of the frame
biketrips.head()

### Calculate distances in km

In [None]:
def haversine(row, radius_km=6367):
    """Great-circle distance in km between a trip's start and end coordinates.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'start_lng', 'start_lat', 'end_lng' and
        'end_lat' holding coordinates in degrees. This can be a single row
        (scalars) or a whole DataFrame (columns); in the latter case the
        computation is vectorized and one distance per row is returned.
    radius_km : float, default 6367
        Sphere radius used to convert the central angle into kilometres.

    Returns
    -------
    Distance(s) in km, with the same shape as the inputs. Missing coordinates
    propagate as NaN.
    """
    # numpy ufuncs (unlike math.radians) accept scalars and whole columns alike
    lng1, lat1, lng2, lat2 = (
        np.radians(row[column])
        for column in ('start_lng', 'start_lat', 'end_lng', 'end_lat')
    )
    dlng = lng2 - lng1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng / 2) ** 2
    # Rounding can push `a` marginally above 1, where arcsin would return NaN
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Pass the whole frame: the distance is computed column-wise in numpy instead
# of calling a Python function once per trip via DataFrame.apply(axis=1).
biketrips['distance'] = haversine(biketrips)

### Flag distance buckets

In [None]:
# One boolean column per distance bucket; each bucket is (lower, upper] in km.
# NOTE(review): with these bounds, distances in (1, 2] and (9, 10], as well as
# distances of exactly 0 or NaN, match none of the flags - confirm the gaps
# are intended.
distance = biketrips['distance']
bucket_bounds = {
    'trips_between_0_1': (0, 1),
    'trips_between_2_4': (2, 4),
    'trips_between_4_9': (4, 9),
}
for column, (lower, upper) in bucket_bounds.items():
    biketrips[column] = distance.gt(lower) & distance.le(upper)

# Open-ended top bucket
biketrips['trips_over_10'] = distance.gt(10)

### Create DataFrame Distance Buckets

In [None]:
# Count the flagged trips per bucket (each True contributes 1 to the sum)
sums = biketrips[['trips_between_0_1', 'trips_between_2_4', 'trips_between_4_9', 'trips_over_10']].sum()

# Move the bucket names from the index into a regular 'Distance Buckets'
# column, with the counts next to it as 'Number of Trips'
sums_df = (
    sums
    .rename_axis('Distance Buckets')
    .reset_index(name='Number of Trips')
)

### Question 3: Plot barchart Distance Buckets

In [None]:
# Draw on an explicit figure/axes pair rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 6))
chart = sns.barplot(
    data=sums_df,
    x='Distance Buckets',
    y='Number of Trips',
    hue='Distance Buckets',
    palette='coolwarm',
    ax=ax,
)

# Show plain trip counts on the y-axis instead of scientific notation
chart.yaxis.set_major_formatter(ScalarFormatter(useMathText=True))
chart.ticklabel_format(style='plain', axis='y')

# Titles and labels so the figure can be read on its own
chart.set_title('Number of Bike Trips by Distance Buckets')
chart.set_xlabel('Distance Buckets')
chart.set_ylabel('Number of Trips')

# Tilt the bucket names so they do not overlap
plt.setp(chart.get_xticklabels(), rotation=45)
chart.grid(axis='y', linestyle='--', alpha=0.3)
fig.tight_layout()
plt.show()