In [1]:
import pandas as pd
import numpy as np
import datetime as dt

## Data type constraints


### Numeric data or ... ?

In [2]:
ride_sharing = pd.read_csv('./dataset/ride_sharing_tire_sizes.csv', index_col='Unnamed: 0')

In [3]:
# Print the information of ride_sharing
print(ride_sharing.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25760 entries, 0 to 25759
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   duration         25760 non-null  object 
 1   station_A_id     25760 non-null  int64  
 2   station_A_name   25760 non-null  object 
 3   station_B_id     25760 non-null  int64  
 4   station_B_name   25760 non-null  object 
 5   bike_id          25760 non-null  int64  
 6   user_type        25760 non-null  object 
 7   user_birth_year  25760 non-null  int64  
 8   user_gender      25760 non-null  object 
 9   tire_sizes       25760 non-null  float64
dtypes: float64(1), int64(4), object(5)
memory usage: 2.2+ MB
None


In [4]:
# Print summary statistics of user_type column
print(ride_sharing['user_type'].describe())

count          25760
unique             2
top       Subscriber
freq           23209
Name: user_type, dtype: object


In [5]:
# Convert user_type from integer to category
ride_sharing['user_type_cat'] = ride_sharing['user_type'].astype('category')

# Write an assert statement confirming the change
assert ride_sharing['user_type_cat'].dtype =='category'

# Print new summary statistics
print(ride_sharing['user_type_cat'].describe())

count          25760
unique             2
top       Subscriber
freq           23209
Name: user_type_cat, dtype: object


### Summing strings and concatenating numbers


In [6]:
# Strip duration of minutes
ride_sharing['duration_trim'] = ride_sharing['duration'].str.strip('minutes')

# convert duration to integer
ride_sharing['duration_time'] = ride_sharing['duration_trim'].astype('int')

# Write an assert statement making sure of conversion
assert ride_sharing['duration_time'].dtype == 'int'

# Print formed columns and calculate average ride duration
print(ride_sharing[['duration', 'duration_trim', 'duration_time']])
print(ride_sharing['duration_time'].mean())

         duration duration_trim  duration_time
0      12 minutes           12              12
1      24 minutes           24              24
2       8 minutes            8               8
3       4 minutes            4               4
4      11 minutes           11              11
...           ...           ...            ...
25755  11 minutes           11              11
25756  10 minutes           10              10
25757  14 minutes           14              14
25758  14 minutes           14              14
25759  29 minutes           29              29

[25760 rows x 3 columns]
11.389052795031056


## Data range constraints


### Tire size constraints


In [7]:
# convert tire_sizes to integer
ride_sharing['tire_sizes'] = ride_sharing['tire_sizes'].astype('int')

# Set all values above 27 to 27
ride_sharing.loc[ride_sharing['tire_sizes'] > 27, 'tire_sizes'] = 27

# Reconvert tire_sizes back to categorical
ride_sharing['tire_sizes'] = ride_sharing['tire_sizes'].astype('category')

# Print tire size description
print(ride_sharing['tire_sizes'].describe())

count     25760
unique        2
top          27
freq      13274
Name: tire_sizes, dtype: int64


### Back to the future

In [8]:
# No date for tire_sizes: fill it randomly
sz = len(ride_sharing['duration_time'])
ride_sharing['ride_date']  = np.random.choice(
    a=['2020-01-01', '2021-01-01'],  
    size=sz,  
    p=[0.9, 0.1]  
 )

In [9]:
# Convert ride_date to datetime
ride_sharing['ride_dt'] = pd.to_datetime(ride_sharing['ride_date'])

# Save today`s date
today = dt.date.today()

# Set all in the future to today's date
ride_sharing.loc[ride_sharing['ride_dt'] > pd.to_datetime('today'), 'ride_dt'] = pd.to_datetime('today')

# Print maximum of ride_dt column
print(ride_sharing['ride_dt'].max())

2020-05-14 20:58:53.135499


## Uniqueness constraints


### Finding duplicates


In [10]:
# Find duplicates
duplicates = ride_sharing.duplicated(subset='bike_id', keep=False)

# Sort your duplicated rides
duplicated_rides = ride_sharing[duplicates].sort_values('bike_id')

# Print relevant columns of duplicated_rides
print(duplicated_rides[['bike_id', 'duration', 'user_birth_year']])

       bike_id    duration  user_birth_year
3638        11  12 minutes             1988
6088        11   5 minutes             1985
10857       11   4 minutes             1987
10045       27  13 minutes             1989
16104       27  10 minutes             1970
...        ...         ...              ...
8812      6638  10 minutes             1986
6815      6638   5 minutes             1995
8456      6638   7 minutes             1983
8300      6638   6 minutes             1962
8380      6638   8 minutes             1984

[25717 rows x 3 columns]


In [11]:
ride_sharing.columns

Index(['duration', 'station_A_id', 'station_A_name', 'station_B_id',
       'station_B_name', 'bike_id', 'user_type', 'user_birth_year',
       'user_gender', 'tire_sizes', 'user_type_cat', 'duration_trim',
       'duration_time', 'ride_date', 'ride_dt'],
      dtype='object')

### Treating duplicates


In [12]:
# Drop complete duplicates from ride_sharing
ride_dup = ride_sharing.drop_duplicates().copy()

ride_dup['duration'] = ride_dup['duration'].str.replace(" minutes", "").astype('int')

# Create statistics dictionary for aggregation function
statistics = {'user_birth_year': 'min', 'duration': 'mean'}

# Group by ride_id and compute new statistics
ride_unique = ride_dup.groupby('bike_id').agg(statistics).reset_index()

# Find duplicated values again
duplicates = ride_unique.duplicated(subset = 'bike_id', keep = False)
duplicated_rides = ride_unique[duplicates == True]

# Assert duplicates are processed
assert duplicated_rides.shape[0] == 0