# Isolation Forest - Outlier Detection

## 1. Allstar Players

In [1]:
from sklearn.ensemble import IsolationForest
import pandas as pd
import matplotlib.pyplot as plt

csv_file_path = '/Users/Pashlene/Desktop/Modified_player_allstar.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,ilkid,year,firstname,lastname,conference,leag,gp,minutes,pts,dreb,...,stl,blk,turnover,pf,fga,fgm,fta,ftm,tpa,tpm
0,AbdulKa01,1978,Karem,Abdul-Jabbar,west,N,1,28,11.0,0.0,...,0.0,0.0,0.0,0.0,12.0,5.0,2.0,1.0,0.0,0.0
1,AbdulKa01,1969,Karem,Abdul-Jabbar,east,N,1,18,10.0,0.0,...,0.0,0.0,0.0,0.0,8.0,4.0,2.0,2.0,0.0,0.0
2,AbdulKa01,1988,Kareem,Abdul-Jabbar,west,N,1,13,4.0,0.0,...,0.0,0.0,0.0,0.0,6.0,1.0,2.0,2.0,0.0,0.0
3,AbdulKa01,1987,Kareem,Abdul-Jabbar,west,N,1,14,10.0,0.0,...,0.0,0.0,0.0,0.0,9.0,4.0,2.0,2.0,0.0,0.0
4,AbdulKa01,1986,Kareem,Abdul-Jabbar,west,N,1,27,10.0,0.0,...,0.0,0.0,0.0,0.0,9.0,4.0,2.0,2.0,0.0,0.0


In [2]:
performance_metrics = ['pts', 'oreb', 'dreb', 'asts', 'stl', 'blk']

# We will handle missing values by filling them with the median of the column
df_filled = df[performance_metrics].fillna(0)

iforest = IsolationForest(n_estimators = 100, contamination = 0.03, max_samples ='auto')
prediction = iforest.fit_predict(df_filled)
print(prediction)
print("Number of outliers detected: {}".format(prediction[prediction < 0].sum()))
print("Number of normal samples detected: {}".format(prediction[prediction > 0].sum()))
df['anomaly_score']=prediction

[1 1 1 ... 1 1 1]
Number of outliers detected: -44
Number of normal samples detected: 1418


In [10]:
df

Unnamed: 0,ilkid,year,firstname,lastname,conference,leag,gp,minutes,pts,dreb,...,blk,turnover,pf,fga,fgm,fta,ftm,tpa,tpm,anomaly_score
0,AbdulKa01,1978,Karem,Abdul-Jabbar,west,N,1,28,11.0,0.0,...,0.0,0.0,0.0,12.0,5.0,2.0,1.0,0.0,0.0,1
1,AbdulKa01,1969,Karem,Abdul-Jabbar,east,N,1,18,10.0,0.0,...,0.0,0.0,0.0,8.0,4.0,2.0,2.0,0.0,0.0,1
2,AbdulKa01,1988,Kareem,Abdul-Jabbar,west,N,1,13,4.0,0.0,...,0.0,0.0,0.0,6.0,1.0,2.0,2.0,0.0,0.0,1
3,AbdulKa01,1987,Kareem,Abdul-Jabbar,west,N,1,14,10.0,0.0,...,0.0,0.0,0.0,9.0,4.0,2.0,2.0,0.0,0.0,1
4,AbdulKa01,1986,Kareem,Abdul-Jabbar,west,N,1,27,10.0,0.0,...,0.0,0.0,0.0,9.0,4.0,2.0,2.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,YardlGe01,1956,George,Yardley,west,N,1,25,9.0,0.0,...,0.0,0.0,0.0,10.0,4.0,1.0,1.0,0.0,0.0,1
1458,YardlGe01,1955,George,Yardley,west,N,1,19,8.0,0.0,...,0.0,0.0,0.0,7.0,3.0,3.0,2.0,0.0,0.0,1
1459,YardlGe01,1954,George,Yardley,west,N,1,22,11.0,0.0,...,0.0,0.0,0.0,11.0,4.0,4.0,3.0,0.0,0.0,1
1460,YardlGe01,1959,George,Yardley,east,N,1,16,11.0,0.0,...,0.0,0.0,0.0,9.0,5.0,2.0,1.0,0.0,0.0,1


In [4]:
outstanding_players = df[df['anomaly_score'] == -1]
outstanding_players

Unnamed: 0,ilkid,year,firstname,lastname,conference,leag,gp,minutes,pts,dreb,...,blk,turnover,pf,fga,fgm,fta,ftm,tpa,tpm,anomaly_score
28,AllenRa02,1999,Ray,Allen,east,N,1,17,14.0,0.0,...,1.0,3.0,2.0,13.0,4.0,6.0,5.0,6.0,1.0,-1
81,BarryRi01,1971,Rick,Barry,east,A,1,26,4.0,8.0,...,0.0,1.0,2.0,10.0,2.0,1.0,0.0,0.0,0.0,-1
161,BriskJo01,1970,John,Brisker,east,A,1,27,15.0,11.0,...,0.0,2.0,3.0,19.0,5.0,7.0,5.0,1.0,0.0,-1
172,BryanKo01,2003,Kobe,Bryant,west,N,1,36,20.0,3.0,...,1.0,6.0,3.0,12.0,9.0,1.0,0.0,3.0,2.0,-1
173,BryanKo01,2002,Kobe,Bryant,west,N,1,36,22.0,5.0,...,2.0,5.0,5.0,17.0,8.0,6.0,3.0,5.0,3.0,-1
285,DanieMe01,1967,Mel,Daniels,east,A,1,29,22.0,7.0,...,0.0,3.0,1.0,18.0,9.0,11.0,4.0,0.0,0.0,-1
288,DanieMe01,1970,Mel,Daniels,west,A,1,30,29.0,10.0,...,0.0,4.0,3.0,19.0,12.0,7.0,5.0,0.0,0.0,-1
290,DanieMe01,1972,Mel,Daniels,west,A,1,33,25.0,7.0,...,2.0,3.0,3.0,19.0,8.0,12.0,9.0,0.0,0.0,-1
307,DavisAn01,2000,Antonio,Davis,east,N,1,20,8.0,2.0,...,1.0,0.0,0.0,11.0,4.0,0.0,0.0,0.0,0.0,-1
356,DuncaTi01,2002,Tim,Duncan,west,N,1,40,19.0,10.0,...,0.0,3.0,2.0,18.0,8.0,3.0,3.0,0.0,0.0,-1
