In [14]:
import pandas as pd

In [15]:
df = pd.read_csv('athletes.csv')

In [16]:
#---------------------------------------------------------------------------------------------------------------------

In [17]:
# STEP 1: CLEAN THE DATA

In [18]:
#Check data for missing columns (height, weight and info have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11538 entries, 0 to 11537
Data columns (total 12 columns):
id               11538 non-null int64
name             11538 non-null object
nationality      11538 non-null object
sex              11538 non-null object
date_of_birth    11538 non-null object
height           11208 non-null float64
weight           10879 non-null float64
sport            11538 non-null object
gold             11538 non-null int64
silver           11538 non-null int64
bronze           11538 non-null int64
info             131 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 1.1+ MB


In [19]:
#Drop insignificant columns or columns with so little data they won't be of help to us
df.drop(['info', 'gold', 'silver', 'bronze'], inplace=True, axis=1)
# Change the current 'df' object and look for our drop columns in the first row/axis=1

In [20]:
#We want to compare 'similar' entries based on height, weight

df = df.loc[df['sex'] == 'male']

options = ['basketball', 'football', 'golf', 'cycling', 'rugby sevens']
df = df.loc[df['sport'].isin(options)]

In [21]:
df

Unnamed: 0,id,name,nationality,sex,date_of_birth,height,weight,sport
4,33922579,Aaron Gate,NZL,male,1990-11-26,1.81,71.0,cycling
17,256673338,Abbubaker Mobara,RSA,male,1994-02-18,1.75,64.0,football
27,677622742,Abdelghani Demmou,ALG,male,1989-01-29,1.85,75.0,football
29,904808208,Abdelhakim Amokrane,ALG,male,1994-05-10,1.86,70.0,football
31,133974151,Abdelkadir Salhi,ALG,male,1993-03-19,1.85,79.0,football
38,514096508,Abdelraouf Benguit,ALG,male,1996-04-05,1.70,65.0,football
40,285603057,Abderrahmane Mansouri,ALG,male,1995-01-13,1.72,66.0,cycling
41,545134894,Abderrahmane Meziane,ALG,male,1994-03-07,1.68,62.0,football
49,958967643,Abdul Khalili,SWE,male,1992-06-07,1.81,71.0,football
57,153457,Abdullahi Shehu,NGR,male,1993-03-12,1.70,,football


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 4 to 11476
Data columns (total 8 columns):
id               1029 non-null int64
name             1029 non-null object
nationality      1029 non-null object
sex              1029 non-null object
date_of_birth    1029 non-null object
height           1004 non-null float64
weight           982 non-null float64
sport            1029 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 72.4+ KB


In [23]:
#Fill in missing values for height and weight with the mean height and weight
avg_height = df['height'].mean()
avg_weight = df['weight'].mean()
print(avg_height)
print(avg_weight)

df['height'].fillna(avg_height, inplace=True)
df['weight'].fillna(avg_weight, inplace=True)

1.8289840637450168
80.31059063136456


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 4 to 11476
Data columns (total 8 columns):
id               1029 non-null int64
name             1029 non-null object
nationality      1029 non-null object
sex              1029 non-null object
date_of_birth    1029 non-null object
height           1029 non-null float64
weight           1029 non-null float64
sport            1029 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 72.4+ KB


In [25]:
#Change date of birth to current age
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])

#NOTE:
# - Handle outliers

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 4 to 11476
Data columns (total 8 columns):
id               1029 non-null int64
name             1029 non-null object
nationality      1029 non-null object
sex              1029 non-null object
date_of_birth    1029 non-null datetime64[ns]
height           1029 non-null float64
weight           1029 non-null float64
sport            1029 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 72.4+ KB


In [27]:
df['age'] = df['date_of_birth'].apply(lambda x : (pd.datetime.now().year - x.year))

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 4 to 11476
Data columns (total 9 columns):
id               1029 non-null int64
name             1029 non-null object
nationality      1029 non-null object
sex              1029 non-null object
date_of_birth    1029 non-null datetime64[ns]
height           1029 non-null float64
weight           1029 non-null float64
sport            1029 non-null object
age              1029 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 80.4+ KB


In [29]:
df.drop(['date_of_birth'], inplace=True, axis=1)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 4 to 11476
Data columns (total 8 columns):
id             1029 non-null int64
name           1029 non-null object
nationality    1029 non-null object
sex            1029 non-null object
height         1029 non-null float64
weight         1029 non-null float64
sport          1029 non-null object
age            1029 non-null int64
dtypes: float64(2), int64(2), object(4)
memory usage: 72.4+ KB


In [None]:
#---------------------------------------------------------------------------------------------------------------------

In [None]:
# STEP 2: Fit Our Model

In [31]:
from sklearn.model_selection import train_test_split
#Train and Test data sets
train, test = train_test_split(df, test_size=0.10)

In [32]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 926 entries, 7692 to 9636
Data columns (total 8 columns):
id             926 non-null int64
name           926 non-null object
nationality    926 non-null object
sex            926 non-null object
height         926 non-null float64
weight         926 non-null float64
sport          926 non-null object
age            926 non-null int64
dtypes: float64(2), int64(2), object(4)
memory usage: 65.1+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 11422 to 4
Data columns (total 8 columns):
id             103 non-null int64
name           103 non-null object
nationality    103 non-null object
sex            103 non-null object
height         103 non-null float64
weight         103 non-null float64
sport          103 non-null object
age            103 non-null int64
dtypes: float64(2), int64(2), object(4)
memory usage: 7.2+ KB


In [33]:
#Separate our variables into 'attributes' (x) and 'labels' (y)
x_train = train[['height', 'weight']]
y_train = train.loc[:,'sport']

x_test = test[['height', 'weight']]
y_test = test.loc[:,'sport']

In [34]:
#Scale the height, weight and age values so they are proportinal to one another

#Here we switch from pandas to numpy! Easier for sklearn to use

from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()

x_train = scaler.fit_transform(x_train[['height', 'weight']])
x_test = scaler.fit_transform(x_test[['height', 'weight']])

#Conver from Pandas dataframe to numpy array (older version)
y_train = y_train.values
y_test = y_test.values

In [35]:
x_train

array([[0.        , 0.09411765],
       [0.34482759, 0.2       ],
       [0.51724138, 0.32941176],
       ...,
       [0.34482759, 0.22352941],
       [0.34482759, 0.42352941],
       [0.24137931, 0.15294118]])

In [36]:
y_train

array(['football', 'cycling', 'football', 'basketball', 'cycling',
       'cycling', 'cycling', 'basketball', 'cycling', 'cycling',
       'basketball', 'cycling', 'cycling', 'basketball', 'cycling',
       'cycling', 'cycling', 'cycling', 'football', 'golf', 'football',
       'rugby sevens', 'football', 'rugby sevens', 'football', 'cycling',
       'cycling', 'golf', 'football', 'cycling', 'football', 'cycling',
       'basketball', 'cycling', 'cycling', 'cycling', 'football',
       'rugby sevens', 'rugby sevens', 'golf', 'basketball', 'cycling',
       'football', 'basketball', 'football', 'basketball', 'rugby sevens',
       'football', 'football', 'cycling', 'rugby sevens', 'football',
       'football', 'football', 'football', 'basketball', 'cycling',
       'cycling', 'rugby sevens', 'cycling', 'cycling', 'rugby sevens',
       'football', 'cycling', 'cycling', 'football', 'cycling', 'cycling',
       'football', 'basketball', 'cycling', 'rugby sevens', 'football',
       'rugb

In [37]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
#---------------------------------------------------------------------------------------------------------------------

In [38]:
# STEP 3: Make predictions using our train data
prediction = classifier.predict(x_test)

In [None]:
#---------------------------------------------------------------------------------------------------------------------

In [39]:
# STEP 4: Test our predictions against our test data
from sklearn.metrics import classification_report

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

  basketball       0.89      0.62      0.73        13
     cycling       0.44      0.52      0.48        29
    football       0.54      0.58      0.56        38
        golf       0.00      0.00      0.00         4
rugby sevens       0.67      0.63      0.65        19

 avg / total       0.56      0.55      0.55       103



In [None]:
#---------------------------------------------------------------------------------------------------------------------

In [44]:
# STEP #5: Make our own predictions
scaler = MinMaxScaler()
new_prediction = scaler.fit_transform([[170, 120]])
result = classifier.predict(new_prediction)

In [45]:
print(result)

['cycling']
