# Boston Marathon - Data Cleaning

In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
import warnings
warnings.filterwarnings('ignore')

from config import usr, pwd, url, port, db, table

## Load dataset
Load the dataset into a Pandas `DataFrame`.

In [2]:
df = pd.read_csv('./data/results_2014.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31984 entries, 0 to 31983
Data columns (total 21 columns):
10k          31984 non-null object
name         31984 non-null object
division     31984 non-null int64
25k          31984 non-null object
gender       31984 non-null object
age          31984 non-null int64
official     31984 non-null float64
bib          31984 non-null object
genderdiv    31984 non-null int64
ctz          1244 non-null object
35k          31984 non-null object
overall      31984 non-null int64
pace         31984 non-null float64
state        29408 non-null object
30k          31984 non-null object
5k           31984 non-null object
half         31984 non-null object
20k          31984 non-null object
country      31984 non-null object
city         31983 non-null object
40k          31984 non-null object
dtypes: float64(2), int64(4), object(15)
memory usage: 5.1+ MB


## Missing Values
### Distances
As seen in the above table, many of the distance columns are of type object, meaning they are strings. If we examine the unique values in one of the distance columns, we can see why.

In [4]:
find_filler_string = np.unique([x for x in df['half'].unique() if '.' not in x])
filler_string = find_filler_string[0]
print(f'Filler string: {filler_string}')

Filler string: -


There is the answer! Not all of the runners have recorded every split. Hence, a dash has been added to indicate that a split was not recorded for that runner. But, I would like to be able to deal with numbers instead of strings during the modeling process. So, I will first replace the missing values with nans, convert the columns to floats, and interpolate across the splits.

In [5]:
split_columns = ['5k', '10k', '20k', 'half', '25k', '30k', '35k', '40k']
for split in split_columns:
    dashes_list = [x for x in df[split] if '.' not in x]
    print(f'There are {len(dashes_list)} missing values for {split} split.')

There are 52 missing values for 5k split.
There are 50 missing values for 10k split.
There are 51 missing values for 20k split.
There are 72 missing values for half split.
There are 216 missing values for 25k split.
There are 40 missing values for 30k split.
There are 34 missing values for 35k split.
There are 39 missing values for 40k split.


In [6]:
df_clean = df.copy()

In [7]:
df_clean.loc[:, split_columns] = df_clean.loc[:, split_columns].replace(filler_string, np.nan)

In [8]:
df_clean.loc[:, split_columns] = df_clean.loc[:, split_columns].astype('float')

In [9]:
df_clean['0k'] = [0.0 for _ in range(len(df_clean))]

In [10]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31984 entries, 0 to 31983
Data columns (total 22 columns):
10k          31934 non-null float64
name         31984 non-null object
division     31984 non-null int64
25k          31768 non-null float64
gender       31984 non-null object
age          31984 non-null int64
official     31984 non-null float64
bib          31984 non-null object
genderdiv    31984 non-null int64
ctz          1244 non-null object
35k          31950 non-null float64
overall      31984 non-null int64
pace         31984 non-null float64
state        29408 non-null object
30k          31944 non-null float64
5k           31932 non-null float64
half         31912 non-null float64
20k          31933 non-null float64
country      31984 non-null object
city         31983 non-null object
40k          31945 non-null float64
0k           31984 non-null float64
dtypes: float64(11), int64(4), object(7)
memory usage: 5.4+ MB


In [11]:
split_columns.insert(0, '0k')

In [12]:
split_columns

['0k', '5k', '10k', '20k', 'half', '25k', '30k', '35k', '40k']

In [13]:
string_to_float = {'0k': 0.0, '5k': 5.0, '10k': 10.0, '20k': 20.0, 'half': 21.1, '25k': 25.0, '30k': 30.0, '35k': 35.0, '40k': 40.0}

In [14]:
df_clean.rename(columns=string_to_float, inplace=True)

In [15]:
float_distances = [0.0, 5.0, 10.0, 20.0, 21.1, 25.0, 30.0, 35.0, 40.0]
print(df_clean.loc[:, float_distances].head())

   0.0    5.0    10.0   20.0   21.1   25.0   30.0    35.0    40.0
0   0.0   8.02  17.37  37.65  39.72  47.67  59.18   71.40   80.43
1   0.0  16.22  32.58  65.83  69.47  82.43  99.33  116.37  132.10
2   0.0   7.75  16.62  36.10  38.03  45.80  56.45   67.42   76.10
3   0.0  16.20  32.57  65.83  69.47  82.43  99.33  116.37  132.95
4   0.0   8.02  17.12  36.58  38.60  46.37  57.03   67.83   76.72


In [16]:
df_clean.loc[:, float_distances].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31984 entries, 0 to 31983
Data columns (total 9 columns):
0.0     31984 non-null float64
5.0     31932 non-null float64
10.0    31934 non-null float64
20.0    31933 non-null float64
21.1    31912 non-null float64
25.0    31768 non-null float64
30.0    31944 non-null float64
35.0    31950 non-null float64
40.0    31945 non-null float64
dtypes: float64(9)
memory usage: 2.2 MB


In [17]:
df_clean.loc[:, float_distances] = df_clean.loc[:, float_distances].interpolate(axis=1)

In [18]:
df_clean.loc[:, float_distances].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31984 entries, 0 to 31983
Data columns (total 9 columns):
0.0     31984 non-null float64
5.0     31984 non-null float64
10.0    31984 non-null float64
20.0    31984 non-null float64
21.1    31984 non-null float64
25.0    31984 non-null float64
30.0    31984 non-null float64
35.0    31984 non-null float64
40.0    31984 non-null float64
dtypes: float64(9)
memory usage: 2.2 MB


Drop column 0.0 as I don't need it anymore. It was only used for interpolation.

In [19]:
df_clean.drop(0.0, axis=1, inplace=True)

In [20]:
df_clean.loc[:, float_distances[1:]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
5.0,31984.0,25.760031,4.334164,7.75,22.65,25.1,28.48,86.95
10.0,31984.0,51.819779,8.999654,16.62,45.4,50.35,57.18,112.38
20.0,31984.0,105.702704,19.807137,36.1,91.73,102.13,116.52,224.35
21.1,31984.0,111.676693,21.069342,38.03,96.85,107.85,123.08,236.67
25.0,31984.0,134.186245,26.623735,45.8,115.55,129.05,148.285,289.02
30.0,31984.0,164.59946,33.996376,47.95,140.67,157.77,182.9725,376.38
35.0,31984.0,196.857142,41.993665,47.95,167.13,188.175,220.18,449.33
40.0,31984.0,229.078324,49.280616,47.95,193.82,218.88,257.555,508.35


### Citizenship and State
The only other two columns with missing values are _ctz_ and _state_, which are the participant's citizenship and state, respectively. I will replace the missing values for these columns with _unknown_.

In [21]:
df_clean['ctz'].unique()

array([nan, 'KOR', 'MEX', 'ECU', 'CPV', 'AUS', 'CAN', 'NZL', 'ROU', 'GER',
       'DEN', 'GBR', 'FRA', 'IRL', 'JPN', 'RSA', 'ITA', 'GUA', 'ESP',
       'CHN', 'LAT', 'ARG', 'TUR', 'BRA', 'KEN', 'CRC', 'RUS', 'NED',
       'USA', 'BIH', 'PHI', 'SUI', 'SRI', 'MAR', 'SWE', 'SVK', 'VEN',
       'HUN', 'NEP', 'IND', 'PER', 'GEO', 'DOM', 'CMR', 'COL', 'NOR',
       'TRI', 'AUT', 'KSA', 'TPE', 'HKG', 'ERI', 'SLO', 'PAR', 'CZE',
       'ZIM', 'UKR', 'BEL', 'POR', 'MLT', 'MAS', 'FIN', 'ALG', 'IRI',
       'NCA', 'ETH', 'ISR', 'ISL', 'CHI', 'ESA', 'BLR', 'SIN', 'HAI',
       'POL', 'GUY', 'GRE', 'IRQ', 'BER', 'PAK', 'URU', 'UGA', 'CRO',
       'PAN', 'BAR', 'LIB'], dtype=object)

In [22]:
df_clean['state'].unique()

array([nan, 'CA', 'ON', 'MI', 'QC', 'IL', 'OR', 'AZ', 'CO', 'VA', 'GA',
       'NJ', 'ME', 'MA', 'UT', 'NY', 'MO', 'RI', 'NH', 'MT', 'KY', 'FL',
       'TN', 'AB', 'WA', 'WI', 'DC', 'PA', 'OH', 'IN', 'NS', 'MD', 'IA',
       'MN', 'CT', 'SD', 'TX', 'DE', 'LA', 'NC', 'OK', 'NV', 'AL', 'KS',
       'NE', 'MB', 'HI', 'AK', 'NM', 'BC', 'VT', 'ID', 'SC', 'WV', 'PE',
       'ND', 'MS', 'AR', 'SK', 'NT', 'PR', 'WY', 'NB', 'YT', 'NL', 'AE',
       'AA', 'AP', 'GU'], dtype=object)

In [23]:
df_clean['ctz'].fillna('unknown', inplace=True)

In [24]:
df_clean['state'].fillna('unknown', inplace=True)

In [25]:
df_clean['ctz'].unique()

array(['unknown', 'KOR', 'MEX', 'ECU', 'CPV', 'AUS', 'CAN', 'NZL', 'ROU',
       'GER', 'DEN', 'GBR', 'FRA', 'IRL', 'JPN', 'RSA', 'ITA', 'GUA',
       'ESP', 'CHN', 'LAT', 'ARG', 'TUR', 'BRA', 'KEN', 'CRC', 'RUS',
       'NED', 'USA', 'BIH', 'PHI', 'SUI', 'SRI', 'MAR', 'SWE', 'SVK',
       'VEN', 'HUN', 'NEP', 'IND', 'PER', 'GEO', 'DOM', 'CMR', 'COL',
       'NOR', 'TRI', 'AUT', 'KSA', 'TPE', 'HKG', 'ERI', 'SLO', 'PAR',
       'CZE', 'ZIM', 'UKR', 'BEL', 'POR', 'MLT', 'MAS', 'FIN', 'ALG',
       'IRI', 'NCA', 'ETH', 'ISR', 'ISL', 'CHI', 'ESA', 'BLR', 'SIN',
       'HAI', 'POL', 'GUY', 'GRE', 'IRQ', 'BER', 'PAK', 'URU', 'UGA',
       'CRO', 'PAN', 'BAR', 'LIB'], dtype=object)

In [26]:
df_clean['state'].unique()

array(['unknown', 'CA', 'ON', 'MI', 'QC', 'IL', 'OR', 'AZ', 'CO', 'VA',
       'GA', 'NJ', 'ME', 'MA', 'UT', 'NY', 'MO', 'RI', 'NH', 'MT', 'KY',
       'FL', 'TN', 'AB', 'WA', 'WI', 'DC', 'PA', 'OH', 'IN', 'NS', 'MD',
       'IA', 'MN', 'CT', 'SD', 'TX', 'DE', 'LA', 'NC', 'OK', 'NV', 'AL',
       'KS', 'NE', 'MB', 'HI', 'AK', 'NM', 'BC', 'VT', 'ID', 'SC', 'WV',
       'PE', 'ND', 'MS', 'AR', 'SK', 'NT', 'PR', 'WY', 'NB', 'YT', 'NL',
       'AE', 'AA', 'AP', 'GU'], dtype=object)

In [27]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31984 entries, 0 to 31983
Data columns (total 21 columns):
10.0         31984 non-null float64
name         31984 non-null object
division     31984 non-null int64
25.0         31984 non-null float64
gender       31984 non-null object
age          31984 non-null int64
official     31984 non-null float64
bib          31984 non-null object
genderdiv    31984 non-null int64
ctz          31984 non-null object
35.0         31984 non-null float64
overall      31984 non-null int64
pace         31984 non-null float64
state        31984 non-null object
30.0         31984 non-null float64
5.0          31984 non-null float64
21.1         31984 non-null float64
20.0         31984 non-null float64
country      31984 non-null object
city         31983 non-null object
40.0         31984 non-null float64
dtypes: float64(10), int64(4), object(7)
memory usage: 5.1+ MB


## Save the dataset
Now that the dataset is clean and the missing values have been dealt with, I will save it to a SQL database.

In [28]:
# Create the database if it doesn't exist
db_url = f"postgresql+psycopg2://{usr}:{pwd}@{url}:{port}/{db}"
if database_exists(db_url):
    pass
else:
    create_database(db_url)

In [29]:
engine = create_engine(f"postgresql+psycopg2://{usr}:{pwd}@{url}:{port}/{db}")
df_clean.to_sql(name=table, con=engine, index=False, if_exists='replace')
engine.dispose()