# Task 1: Data Preparation

### Preliminaries
Import necessary modules.

In [2]:
# Module imports
import warnings
import pandas as pd
import numpy as np
# Silence every Python warning and lift pandas' cap on the number of
# columns it renders, so wide frames are displayed in full.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

### Import data
Load data into notebook for subsequent investigation.

In [15]:
# Read the player statistics CSV directly from its GitHub raw URL.
# The file is comma-delimited, which is already read_csv's default separator.
source_url = 'https://raw.githubusercontent.com/floeck/nba-analysis/master/NBA_players_stats.csv'
df = pd.read_csv(source_url)

Take a look to see it has imported correctly.

In [22]:
# Show five rows drawn with a fixed seed so the preview is the same on every run.
df.sample(5, random_state=9)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
249,244,Cory Joseph,PG,29,SAC,36,2,749,90,201,0.448,24,75,0.32,66,126,0.524,27,35,0.771,20,58,78,85,24,7,36,83,231
411,396,Mitchell Robinson,C,22,NYK,27,27,778,103,156,0.66,0,0,,103,156,0.66,22,46,0.478,97,129,226,14,32,41,22,78,228
452,433,Garrett Temple,SG,34,CHI,33,12,936,107,250,0.428,45,131,0.344,62,119,0.521,31,38,0.816,20,87,107,68,31,18,33,77,290
200,197,Willy Hernangómez,C,26,NOP,20,2,359,60,110,0.545,1,5,0.2,59,105,0.562,24,36,0.667,53,86,139,26,7,10,11,27,145
340,331,Larry Nance Jr.,PF,28,CLE,19,18,635,70,145,0.483,24,63,0.381,46,82,0.561,12,21,0.571,27,102,129,61,37,10,28,41,176


The data appears to have imported correctly, so we can now examine the data frame.

### Data examination & cleaning

Now to examine the shape of the dataframe and detect the presence of any null values. We will also look at summary statistics of the features.

In [20]:
# Report the frame's dimensions, then the dtype pandas inferred for each column.
# A single print call with a newline separator emits the same text as two calls.
print(f"Shape of the dataset is {df.shape} \n", df.dtypes, sep="\n")

Shape of the dataset is (512, 29) 

Rk          int64
Player     object
Pos        object
Age         int64
Tm         object
G           int64
GS          int64
MP          int64
FG          int64
FGA         int64
FG%       float64
3P          int64
3PA         int64
3P%       float64
2P          int64
2PA         int64
2P%       float64
FT          int64
FTA         int64
FT%       float64
ORB         int64
DRB         int64
TRB         int64
AST         int64
STL         int64
BLK         int64
TOV         int64
PF          int64
PTS         int64
dtype: object


In [21]:
# Count the missing entries in each column.
df.isna().sum()

Rk         0
Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        3
3P         0
3PA        0
3P%       33
2P         0
2PA        0
2P%        7
FT         0
FTA        0
FT%       32
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
dtype: int64

In [25]:
# Summarise the text columns: count, number of distinct values, mode and its frequency.
df.describe(include=['object'])

Unnamed: 0,Player,Pos,Tm
count,512,512,512
unique,492,20,55
top,James Harden,SG,BRK
freq,3,117,19


In [27]:
# Summary statistics for the float64 columns only.
df.describe(include=[np.float64])

Unnamed: 0,FG%,3P%,2P%,FT%
count,509.0,479.0,505.0,480.0
mean,0.439961,0.316823,0.501632,0.742929
std,0.120485,0.140463,0.143022,0.171721
min,0.0,0.0,0.0,0.0
25%,0.391,0.2855,0.444,0.667
50%,0.442,0.349,0.51,0.772
75%,0.5,0.394,0.575,0.851
max,1.0,1.0,1.0,1.0


In [28]:
# Summary statistics for the int64 columns only.
df.describe(include=[np.int64])

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,3P,3PA,2P,2PA,FT,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0
mean,248.046875,26.025391,22.515625,10.603516,512.730469,87.326172,187.789062,27.21875,73.910156,60.107422,113.878906,36.658203,47.070312,20.808594,73.439453,94.248047,52.925781,16.050781,10.509766,28.810547,41.898438,333.275391
std,141.539473,12.099932,10.762502,12.697945,374.173372,82.210894,170.428063,29.654749,75.700355,63.802427,116.742818,45.406517,56.053013,24.399771,66.360902,86.459582,60.98396,13.501952,13.058198,28.076169,31.06129,1550.220911
min,1.0,-19.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,126.75,22.0,14.0,0.0,147.75,18.0,43.0,3.0,9.0,10.0,22.0,6.0,8.0,4.0,18.75,23.0,9.0,4.0,2.0,7.0,14.0,50.0
50%,250.5,25.0,25.0,3.0,497.0,64.5,145.0,17.0,50.5,39.0,75.0,21.0,29.0,13.0,57.5,73.0,35.0,14.0,6.0,22.0,39.5,180.0
75%,370.25,28.0,33.0,20.0,814.25,131.25,275.0,45.0,119.25,88.25,171.0,48.0,63.25,27.0,110.0,140.0,69.0,25.0,14.0,40.25,64.0,356.75
max,492.0,280.0,38.0,37.0,1358.0,379.0,764.0,169.0,411.0,329.0,543.0,298.0,363.0,159.0,357.0,472.0,345.0,63.0,111.0,149.0,228.0,28800.0


Here is a summary of the information gathered above.

* The shape (512, 29) matches the assignment spec.

* There are null values detected in the columns of `FG%`, `3P%`, `2P%` and `FT%`. 

* In the `object` description, the `Player` feature looks suitable, but `Pos` and `Tm` do not. As per the spec, there should only be 7 unique positions, whereas 20 are detected. Additionally, there should only be 31 teams, whereas 55 are detected.

* In the `float64` description, the values look ok (every percentage lies between 0 and 1), and the counts below 512 are explained by the null values detected previously.

* In the `int64` description, there are a few curious values. Min age is -19 and max is 280, both of which are illogical. Additionally, looking at the `max` of each column, something else comes up. Max of `PF` is 228, which is unlikely given that it would mean someone committed the maximum of 6 fouls in every one of 38 games (6*38 = 228). Also, max of `PTS` is 28800, which is over the max of 2000 as per the spec (and logic).


* The min age of players is -19, and max is 280, which is illogical.


In [31]:
# Clean the data
# Whitelists of the valid categorical codes: 31 team codes and 7 position codes.
teams = ['MIA', 'MIL', 'NOP', 'SAS', 'PHO', 'MEM', 'TOT', 'BRK', 'CLE', 'ORL', 'LAL', 'POR', 'TOR', 'CHI', 'WAS', 'UTA', 'SAC', 'CHO', 'NYK', 'DEN', 'LAC', 'GSW', 'OKC', 'MIN', 'DET', 'DAL', 'IND', 'ATL', 'PHI', 'BOS', 'HOU']
positions = ['PF', 'PG', 'C', 'SG', 'SF', 'PG-SG', 'SF-PF']

# Upper bound on total points quoted from the spec in the examination notes.
MAX_PTS = 2000
# Deliberately generous sanity bounds for a player's age; these are NOT taken
# from the spec -- TODO confirm the intended range.
AGE_RANGE = (15, 60)

# Normalise the two categorical columns before validating them. Every
# whitelisted code is already trimmed and upper-case, so stripping whitespace
# and upper-casing can repair entry noise but can never turn a valid value
# into an invalid one. `assign` builds a new frame, so re-running this cell
# gives the same result.
df = df.assign(
    Pos=df['Pos'].str.strip().str.upper(),
    Tm=df['Tm'].str.strip().str.upper(),
)

# Flag (rather than silently "fix") the rows that still break a rule, because
# the right replacement for a typo or an impossible number needs a human
# decision. Inspect the offending rows with `df[validation_flags.any(axis=1)]`.
validation_flags = pd.DataFrame({
    'bad_Pos': ~df['Pos'].isin(positions),
    'bad_Tm': ~df['Tm'].isin(teams),
    'bad_Age': ~df['Age'].between(*AGE_RANGE),
    'bad_PTS': df['PTS'] > MAX_PTS,
})

# Number of rows violating each rule; all zeros means the frame passes validation.
validation_flags.sum()

In [30]:
# Write the working frame to disk without its positional index.
# NOTE(review): the frame is exported exactly as it currently stands -- confirm
# that the cleaning steps have actually been applied to `df` before relying on
# this file as the cleaned dataset.
export_path = 'cleaned_NBA_players_stats.csv'
df.to_csv(export_path, index=False)

# Task 2: Data Exploration

## Task 2.1 
Explore the players' total points: Please analyze the composition of the total points of the top five players with the most points.

In [None]:
# Code goes after this line by adding cells


## Task 2.2 
Assuming that the data collector makes an entry error when collecting data, it can be ensured that the error occurred in the 3P, 3PA and 3P% columns, but it is not sure which player's information the error lies on. Please try to explore the error by visualization to identify how many errors there are and try to fix it.


In [None]:
# Code goes after this line by adding cells


## Task 2.3 
Please analyze the relationship between the player's total points and the rest features (columns). Please use at least three other columns.


In [29]:
# Code goes after this line by adding cells
# Spearman rank correlation between every pair of numeric features; the PTS
# row/column shows how total points relate to the other statistics.
# The numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops text columns (Player, Pos, Tm) automatically and raises on them instead.
df.select_dtypes(include='number').corr(method='spearman')

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Rk,1.0,-0.025736,-0.058665,-0.073919,-0.058704,-0.042869,-0.041182,-0.041641,-0.033817,-0.024668,-0.015127,-0.031083,-0.025877,-0.011532,-0.049495,-0.049085,-0.000862,-0.061588,-0.073015,-0.070758,-0.037472,-0.019495,-0.026091,-0.020275,-0.016473,-0.064446
Age,-0.025736,1.0,0.211046,0.235098,0.254296,0.235297,0.235538,0.086634,0.249994,0.245656,0.157331,0.19271,0.191151,0.073318,0.227811,0.215865,0.166031,0.16985,0.247201,0.233963,0.305442,0.253925,0.151144,0.234701,0.198073,0.245981
G,-0.058665,0.211046,1.0,0.646779,0.892085,0.820286,0.818044,0.296663,0.657724,0.669796,0.221392,0.761974,0.763834,0.210734,0.728512,0.733142,0.148403,0.721925,0.823301,0.818652,0.739162,0.789995,0.621862,0.792014,0.868279,0.806848
GS,-0.073919,0.235098,0.646779,1.0,0.838896,0.804722,0.800294,0.301045,0.610883,0.623465,0.217764,0.767841,0.772153,0.181397,0.755797,0.764316,0.103087,0.655145,0.784936,0.77052,0.737407,0.73514,0.609741,0.783616,0.750188,0.795295
MP,-0.058704,0.254296,0.892085,0.838896,1.0,0.956823,0.957656,0.328799,0.765731,0.779545,0.270529,0.894932,0.902947,0.213845,0.874162,0.873783,0.215014,0.759391,0.91219,0.895631,0.886228,0.898916,0.688493,0.922189,0.901741,0.945236
FG,-0.042869,0.235297,0.820286,0.804722,0.956823,1.0,0.989092,0.41626,0.750832,0.760617,0.284226,0.961354,0.964981,0.263431,0.921898,0.921168,0.229925,0.743591,0.89754,0.880344,0.879445,0.859383,0.680662,0.931424,0.857073,0.984668
FGA,-0.041182,0.235538,0.818044,0.800294,0.957656,0.989092,1.0,0.297593,0.808104,0.824525,0.286331,0.928662,0.947174,0.163168,0.910947,0.902797,0.269971,0.694198,0.873924,0.847958,0.895767,0.868907,0.637264,0.92902,0.840622,0.981199
FG%,-0.041641,0.086634,0.296663,0.301045,0.328799,0.41626,0.297593,1.0,0.010235,-0.031881,0.199141,0.522078,0.434011,0.786804,0.389451,0.427114,-0.139483,0.538483,0.431531,0.475298,0.218675,0.256539,0.479711,0.32979,0.389438,0.373086
3P,-0.033817,0.249994,0.657724,0.610883,0.765731,0.750832,0.808104,0.010235,1.0,0.989232,0.578311,0.578392,0.613192,-0.02996,0.651754,0.61167,0.415511,0.35257,0.6065,0.548572,0.733497,0.703218,0.345979,0.679928,0.605044,0.771696
3PA,-0.024668,0.245656,0.669796,0.623465,0.779545,0.760617,0.824525,-0.031881,0.989232,1.0,0.474294,0.595546,0.631257,-0.028416,0.661871,0.6227,0.397979,0.362821,0.617054,0.559453,0.747734,0.721181,0.352366,0.696665,0.619383,0.780614
