# DATA CLEANING

#### MODULES IMPORT

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math

#### RAW DATA IMPORT

In [5]:
# Load one raw results file per race year; the paths are relative to the
# notebook's working directory.
raw_result_paths = (
    './raw/marathon_results_2015.csv',
    './raw/marathon_results_2016.csv',
    './raw/marathon_results_2017.csv',
)
df_2015, df_2016, df_2017 = [pd.read_csv(csv_path) for csv_path in raw_result_paths]

#### ADDING YEAR COLUMNS TO EACH DATASET

In [7]:
# Tag every row with its race year so the source file stays identifiable
# once the yearly frames are combined.
for race_year, yearly_frame in ((2015, df_2015), (2016, df_2016), (2017, df_2017)):
    yearly_frame['Year'] = race_year

#### COMBINING IMPORTED DATASETS INTO ONE

In [9]:
# Stack the three yearly frames into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# yearly frame keeps its own row labels, so the combined frame would contain
# repeated labels and label-based access/alignment would be ambiguous.
df_all = pd.concat([df_2015, df_2016, df_2017], ignore_index=True)

#### SELECTING COLUMNS

In [11]:
# Keep only the columns used by the rest of the cleaning steps; everything
# else that came from the raw files is dropped here.
selected_columns = [
    'Name', 'M/F', 'Age', 'Country', 'City',
    'Pace',
    '5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K',
    'Official Time',
    'Year',
]
df_all = df_all[selected_columns]

#### CHANGING TYPE OF COLUMNS THAT ARE USING TIME VALUES

In [13]:
# Columns holding durations (pace, split times, finish time).
time_columns = ['Pace', '5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K', 'Official Time']

# Convert each duration column to timedelta, one column at a time.
# errors='coerce' turns any value that cannot be parsed into NaT instead of
# raising, so unparseable entries show up as missing values afterwards.
for duration_column in time_columns:
    df_all[duration_column] = pd.to_timedelta(df_all[duration_column], errors='coerce')

#### CONVERTING 'Pace' COLUMN FROM TIME PER MILE TO TIME PER KILOMETER

In [15]:
# Kilometres in one mile; dividing a per-mile pace by this yields a
# per-kilometre pace.
KM_PER_MILE = 1.609344

# NOTE: this overwrites 'Pace' in place, so re-running this cell without
# re-running the cells above divides the column a second time.
pace_per_km = df_all['Pace'] / KM_PER_MILE
df_all['Pace'] = pace_per_km

#### SORTING DATAFRAME BY OFFICIAL TIME

In [17]:
# Order runners from fastest to slowest finish time and renumber the rows
# 0..n-1 in that order (the old row labels are discarded).
df_all = df_all.sort_values('Official Time', ignore_index=True)

#### MAKING NEW 'Rank' COLUMN THAT REPRESENTS THE RANK OF EACH RUNNER BY OFFICIAL TIME

In [19]:
# Rank is the 1-based row position. This relies on the frame having been
# sorted by 'Official Time' and re-indexed 0..n-1 in the previous step.
# Rows with equal finish times still receive distinct, consecutive ranks.
one_based_positions = pd.Series(df_all.index + 1)
df_all['Rank'] = one_based_positions

#### CHANGING COLUMNS ORDER

In [21]:
# Final column layout: runner details, then pace/splits/finish time, then
# the derived rank and the race year.
runner_columns = ['Name', 'M/F', 'Age', 'Country', 'City']
timing_columns = ['Pace', '5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K', 'Official Time']
trailing_columns = ['Rank', 'Year']
df_all = df_all[runner_columns + timing_columns + trailing_columns]

#### CHECKING THAT EVERYTHING LOOKS GOOD

In [23]:
# Print a summary of the cleaned frame: column names, non-null counts and
# dtypes, to confirm the time columns were converted to timedelta.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79638 entries, 0 to 79637
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   Name           79638 non-null  object         
 1   M/F            79638 non-null  object         
 2   Age            79638 non-null  int64          
 3   Country        79638 non-null  object         
 4   City           79637 non-null  object         
 5   Pace           79638 non-null  timedelta64[ns]
 6   5K             79409 non-null  timedelta64[ns]
 7   10K            79524 non-null  timedelta64[ns]
 8   15K            79587 non-null  timedelta64[ns]
 9   20K            79553 non-null  timedelta64[ns]
 10  25K            79557 non-null  timedelta64[ns]
 11  30K            79550 non-null  timedelta64[ns]
 12  35K            79552 non-null  timedelta64[ns]
 13  40K            79562 non-null  timedelta64[ns]
 14  Official Time  79638 non-null  timedelta64[ns]
 15  Ra

In [24]:
# Display the first five rows; because the frame was sorted by
# 'Official Time', these are the fastest finishers across all three years.
df_all.head()

Unnamed: 0,Name,M/F,Age,Country,City,Pace,5K,10K,15K,20K,25K,30K,35K,40K,Official Time,Rank,Year
0,"Desisa, Lelisa",M,25,ETH,Ambo,0 days 00:03:03.925872902,0 days 00:14:43,0 days 00:29:43,0 days 00:44:57,0 days 01:00:29,0 days 01:16:07,0 days 01:32:00,0 days 01:47:59,0 days 02:02:39,0 days 02:09:17,1,2015
1,"Kirui, Geoffrey",M,24,KEN,Keringet,0 days 00:03:04.547244094,0 days 00:15:25,0 days 00:30:28,0 days 00:45:44,0 days 01:01:15,0 days 01:16:59,0 days 01:33:01,0 days 01:48:19,0 days 02:02:53,0 days 02:09:37,2,2017
2,"Tsegay, Yemane Adhane",M,30,ETH,Addis Ababa,0 days 00:03:05.168615286,0 days 00:14:43,0 days 00:29:43,0 days 00:44:58,0 days 01:00:28,0 days 01:16:07,0 days 01:31:59,0 days 01:47:59,0 days 02:02:42,0 days 02:09:48,3,2015
3,"Rupp, Galen",M,30,USA,Portland,0 days 00:03:05.168615286,0 days 00:15:24,0 days 00:30:27,0 days 00:45:44,0 days 01:01:15,0 days 01:16:59,0 days 01:33:01,0 days 01:48:19,0 days 02:03:14,0 days 02:09:58,4,2017
4,"Chebet, Wilson",M,29,KEN,Marakwet,0 days 00:03:05.789986478,0 days 00:14:43,0 days 00:29:43,0 days 00:44:57,0 days 01:00:29,0 days 01:16:07,0 days 01:32:00,0 days 01:47:59,0 days 02:03:01,0 days 02:10:22,5,2015


#### EXPORTING CLEAN DATASET TO NEW FILE

In [26]:
# Write the cleaned frame to disk in the current working directory.
# index=False leaves the row index out of the file.
clean_output_path = 'boston_clean.csv'
df_all.to_csv(clean_output_path, index=False)