In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the ride-sharing trip records, using the CSV's first column as the row index
ride_sharing = pd.read_csv(
    'ride_sharing_new.csv',
    index_col=0,
)

In [33]:
# Show the frame's structure: column names, non-null counts, dtypes, memory use.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() emits a stray "None" line.
ride_sharing.info()

# Print summary statistics of the user_type column
print(ride_sharing['user_type'].describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25760 entries, 0 to 25759
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   duration         25760 non-null  object
 1   station_A_id     25760 non-null  int64 
 2   station_A_name   25760 non-null  object
 3   station_B_id     25760 non-null  int64 
 4   station_B_name   25760 non-null  object
 5   bike_id          25760 non-null  int64 
 6   user_type        25760 non-null  int64 
 7   user_birth_year  25760 non-null  int64 
 8   user_gender      25760 non-null  object
dtypes: int64(5), object(4)
memory usage: 2.0+ MB
None
count    25760.000000
mean         2.008385
std          0.704541
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: user_type, dtype: float64


In [34]:
# Remove the 'minutes' unit from duration.
# str.strip('minutes') treats its argument as a *set of characters* to trim from
# both ends (and leaves the separating space behind), so the literal substring is
# removed explicitly and the surrounding whitespace trimmed afterwards.
ride_sharing['duration_trim'] = (
    ride_sharing['duration']
    .str.replace('minutes', '', regex=False)
    .str.strip()
)

# Convert the trimmed text to integers
ride_sharing['duration_time'] = ride_sharing['duration_trim'].astype('int')

# Assert the conversion produced an integer column
assert ride_sharing['duration_time'].dtype == 'int'

# Print the derived columns next to the original, then the average ride duration
print(ride_sharing[['duration','duration_trim','duration_time']])
print(ride_sharing['duration_time'].mean())

         duration duration_trim  duration_time
0      12 minutes           12              12
1      24 minutes           24              24
2       8 minutes            8               8
3       4 minutes            4               4
4      11 minutes           11              11
...           ...           ...            ...
25755  11 minutes           11              11
25756  10 minutes           10              10
25757  14 minutes           14              14
25758  14 minutes           14              14
25759  29 minutes           29              29

[25760 rows x 3 columns]
11.389052795031056


In [35]:
# Simulate a tire size (stored as a string, '26' or '27') for every ride.
# A locally seeded Generator makes the draw identical on every Restart & Run All;
# the seed value itself is arbitrary.
tire_rng = np.random.default_rng(42)
ride_sharing['tire_sizes'] = tire_rng.choice(['26', '27'], size=len(ride_sharing))

In [36]:
# Treat tire_sizes as integers, cap every value at 27 (anything larger
# becomes 27), and store the result back as a categorical column
ride_sharing['tire_sizes'] = (
    ride_sharing['tire_sizes']
    .astype('int')
    .clip(upper=27)
    .astype('category')
)

# Print tire size description
print(ride_sharing['tire_sizes'].describe())

count     25760
unique        2
top          26
freq      12934
Name: tire_sizes, dtype: int64


In [37]:
# Simulate a ride date for every ride, sampled from the daily range
# 2020-10-01 through 2022-10-31.
# A locally seeded Generator keeps the column reproducible across
# Restart & Run All; the seed value itself is arbitrary.
date_rng = np.random.default_rng(42)
ride_sharing['ride_date'] = date_rng.choice(
    pd.date_range('2020-10-01', '2022-10-31'),
    size=len(ride_sharing),
)

In [38]:
import datetime as dt

# Reduce ride_date to plain calendar dates (datetime.date objects)
ride_sharing['ride_dt'] = pd.to_datetime(ride_sharing['ride_date']).dt.date

# Save today's date
today = dt.date.today()

# Any ride dated after today is pulled back to today's date
in_future = ride_sharing['ride_dt'] > today
ride_sharing.loc[in_future, 'ride_dt'] = today

# Print maximum of ride_dt column, then display the first rows
print(ride_sharing['ride_dt'].max())
ride_sharing.head()

2022-10-31


Unnamed: 0,duration,station_A_id,station_A_name,station_B_id,station_B_name,bike_id,user_type,user_birth_year,user_gender,duration_trim,duration_time,tire_sizes,ride_date,ride_dt
0,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,5480,2,1959,Male,12,12,26,2022-04-19,2022-04-19
1,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,5193,2,1965,Male,24,24,27,2020-10-31,2020-10-31
2,8 minutes,67,San Francisco Caltrain Station 2 (Townsend St...,23,The Embarcadero at Steuart St,3652,3,1993,Male,8,8,27,2021-04-18,2021-04-18
3,4 minutes,16,Steuart St at Market St,28,The Embarcadero at Bryant St,1883,1,1979,Male,4,4,26,2020-12-19,2020-12-19
4,11 minutes,22,Howard St at Beale St,350,8th St at Brannan St,4626,2,1994,Male,11,11,26,2021-02-09,2021-02-09


In [39]:
# Simulate a ride identifier between 1 and 49 for every ride; whenever there are
# more than 49 rides, some ride_id values necessarily repeat.
# A locally seeded Generator keeps the column reproducible across
# Restart & Run All; the seed value itself is arbitrary.
id_rng = np.random.default_rng(42)
ride_sharing['ride_id'] = id_rng.choice(range(1, 50), size=len(ride_sharing))

In [40]:
# Flag every row whose ride_id occurs more than once (keep=False marks all copies)
duplicates = ride_sharing.duplicated(subset='ride_id', keep=False)

# Collect the flagged rides, ordered by ride_id
duplicated_rides = ride_sharing.loc[duplicates].sort_values(by='ride_id')

# Print relevant columns of duplicated_rides
relevant_columns = ['ride_id', 'duration', 'user_birth_year']
print(duplicated_rides[relevant_columns])

       ride_id    duration  user_birth_year
23189        1   9 minutes             1992
21352        1   6 minutes             1982
3148         1  12 minutes             1970
1436         1   9 minutes             1980
21371        1   5 minutes             1960
...        ...         ...              ...
16150       49   8 minutes             1972
3133        49   7 minutes             1990
3146        49   9 minutes             1980
2418        49   6 minutes             1977
6234        49  12 minutes             1987

[25760 rows x 3 columns]


In [43]:
# Display the dtype of each column in ride_sharing
ride_sharing.dtypes


duration                   object
station_A_id                int64
station_A_name             object
station_B_id                int64
station_B_name             object
bike_id                     int64
user_type                   int64
user_birth_year             int64
user_gender                object
duration_trim              object
duration_time               int32
tire_sizes               category
ride_date          datetime64[ns]
ride_dt                    object
ride_id                     int32
dtype: object

In [44]:
# Remove rows that are exact copies of another row (all columns equal)
ride_dup = ride_sharing.drop_duplicates()

# Aggregation to apply per ride_id: minimum birth year and mean duration
statistics = {'user_birth_year': 'min', 'duration_time': 'mean'}

# Collapse to one row per ride_id using the statistics above
ride_unique = (
    ride_dup
    .groupby('ride_id')
    .agg(statistics)
    .reset_index()
)

# Look for ride_id values that still repeat after aggregation
duplicates = ride_unique.duplicated(subset='ride_id', keep=False)
duplicated_rides = ride_unique[duplicates]

# Assert duplicates are processed
assert len(duplicated_rides) == 0