# Local Outlier Factor - Player Regular Season

In [None]:
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer

# Read the regular-season player table from the unzipped data folder
data_folder_path = "/content/unzipped_data"
file_path = f"{data_folder_path}/player_regular_season.txt"
player_data = pd.read_csv(file_path)

# Stat columns handed to the outlier detector
features = [
    'stl', 'blk', 'turnover', 'pf', 'fga',
    'fgm', 'fta', 'ftm', 'tpa', 'tpm',
]

# Split the frame into the feature matrix and the name columns used for reporting
X = player_data.loc[:, features]
names = player_data.loc[:, ['firstname', 'lastname']]

# Fill missing values with the mean of their column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Fit LOF and label every row in one step: -1 marks an outlier, 1 an inlier
lof_model = LocalOutlierFactor(contamination=0.01)
outliers = lof_model.fit_predict(X_imputed)

# Boolean mask over the rows that were labelled as outliers
outlier_indices = (outliers == -1)

# Names on the flagged rows, with each (firstname, lastname) pair listed once
outlier_players_names = names.loc[outlier_indices].drop_duplicates()

# Report the number of distinct names and the names themselves
print(
    f"Count of Potential Outlier Players (LOF): {len(outlier_players_names)}",
    "Names of Potential Outlier Players:",
    outlier_players_names,
    sep="\n",
)



Count of Potential Outlier Players (LOF): 147
Names of Potential Outlier Players:
      firstname  lastname
2          Norm     Baker
50          Joe     Fulks
90          Ken    Keller
105      Donald    Martin
235      Wyndol      Gray
...         ...       ...
18703      Luis    Flores
18705     Danny   Fortson
18826   Brandin    Knight
18827      Kyle    Korver
18889    Alonzo  Mourning

[147 rows x 2 columns]


# Local Outlier Factor - Player Allstar

In [None]:
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer

# Read the all-star player table from the unzipped data folder
data_folder_path = "/content/unzipped_data"
file_path = f"{data_folder_path}/player_allstar.txt"
player_data = pd.read_csv(file_path)

# Stat columns handed to the outlier detector
features = [
    'stl', 'blk', 'turnover', 'pf', 'fga',
    'fgm', 'fta', 'ftm', 'tpa', 'tpm',
]

# Split the frame into the feature matrix and the name columns used for reporting
X = player_data.loc[:, features]
names = player_data.loc[:, ['firstname', 'lastname']]

# Fill missing values with the mean of their column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Fit LOF and label every row in one step: -1 marks an outlier, 1 an inlier
lof_model = LocalOutlierFactor(contamination=0.01)
outliers = lof_model.fit_predict(X_imputed)

# Boolean mask over the rows that were labelled as outliers
outlier_indices = (outliers == -1)

# Names on the flagged rows, with each (firstname, lastname) pair listed once
outlier_players_names = names.loc[outlier_indices].drop_duplicates()

# Report the number of distinct names and the names themselves
print(
    f"Count of Potential Outlier Players (LOF): {len(outlier_players_names)}",
    "Names of Potential Outlier Players:",
    outlier_players_names,
    sep="\n",
)

Count of Potential Outlier Players (LOF): 14
Names of Potential Outlier Players:
     firstname      lastname
3       Kareem  Abdul-Jabbar
22        Mark       Aguirre
45        Paul        Arizin
77        Rick         Barry
151       Carl         Braun
267      Terry      Cummings
561      Elvin         Hayes
691      Jimmy         Jones
746      Jason          Kidd
872        Bob        McAdoo
881     George      McGinnis
904       Bill    Melchionni
944     Alonzo      Mourning
1220      Bill       Sharman


# Local Outlier Factor - Player Playoffs (Career)

In [None]:
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer

# Read the career playoff player table from the unzipped data folder
data_folder_path = "/content/unzipped_data"
file_path = f"{data_folder_path}/player_playoffs_career.txt"
player_data = pd.read_csv(file_path)

# Stat columns handed to the outlier detector
features = [
    'stl', 'blk', 'turnover', 'pf', 'fga',
    'fgm', 'fta', 'ftm', 'tpa', 'tpm',
]

# Split the frame into the feature matrix and the name columns used for reporting
X = player_data.loc[:, features]
names = player_data.loc[:, ['firstname', 'lastname']]

# Fill missing values with the mean of their column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Fit LOF and label every row in one step: -1 marks an outlier, 1 an inlier
lof_model = LocalOutlierFactor(contamination=0.01)
outliers = lof_model.fit_predict(X_imputed)

# Boolean mask over the rows that were labelled as outliers
outlier_indices = (outliers == -1)

# Names on the flagged rows, with each (firstname, lastname) pair listed once
outlier_players_names = names.loc[outlier_indices].drop_duplicates()

# Report the number of distinct names and the names themselves
print(
    f"Count of Potential Outlier Players (LOF): {len(outlier_players_names)}",
    "Names of Potential Outlier Players:",
    outlier_players_names,
    sep="\n",
)

Count of Potential Outlier Players (LOF): 21
Names of Potential Outlier Players:
     firstname     lastname
94        Drew        Barry
186    Michael      Bradley
288    Maurice       Carter
290        Ron       Carter
504       Doug      Edwards
652     Travis        Grant
700       Jeff  Halliburton
718       Skip     Harlicka
905        Ken      Johnson
920      Alvin        Jones
1155  Fernando       Martin
1173       Mel      Mccants
1319     Barry       Nelson
1321       Ron       Nelson
1345   Moochie       Norris
1400    Andrae    Patterson
1472       Bob       Priddy
1482       Ray      Ragelis
1781     Billy       Thomas
1868     David       Vaughn
2037       Tom      Workman


# Random Forest - Team Season

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Load the team data
file_path = "/content/unzipped_data/team_season.txt"
team_data = pd.read_csv(file_path)

# Predictor columns: the o_* and d_* box-score totals of each row
features = ['o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_oreb', 'o_dreb', 'o_reb', 'o_asts', 'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_3pm', 'o_3pa', 'o_pts',
            'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_oreb', 'd_dreb', 'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk', 'd_3pm', 'd_3pa', 'd_pts']

# Regression target: the number of wins recorded in the 'won' column
target = 'won'

# Prepare the data for prediction
X = team_data[features]
y = team_data[target]

# Create a Random Forest regressor (fixed seed so reruns give the same forest)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the entire dataset
rf_model.fit(X, y)

# Input the team names for prediction
team1_name = "INJ"
team2_name = "BOS"

# Extract the feature rows of each team. A team code may match several rows
# (presumably one per season - confirm against the data), so each selection
# is kept as a DataFrame of zero or more rows.
team1_stats = team_data.loc[team_data['team'] == team1_name, features]
team2_stats = team_data.loc[team_data['team'] == team2_name, features]

# Fail early with a readable message instead of an IndexError further down
for team_name, team_stats in ((team1_name, team1_stats), (team2_name, team2_stats)):
    if team_stats.empty:
        raise ValueError(f"No rows found for team '{team_name}' in {file_path}")

# Stack both teams' rows: team 1 rows first, then team 2 rows
teams_combined = pd.concat([team1_stats, team2_stats], axis=0)

# One prediction per stacked row
row_predictions = rf_model.predict(teams_combined)

# Reduce to exactly one value per team by averaging that team's row predictions.
# Indexing row_predictions[0] / [1] directly would compare the first two stacked
# rows, which both belong to team 1 whenever team 1 has more than one row.
team1_row_count = len(team1_stats)
predicted_wins = [
    row_predictions[:team1_row_count].mean(),
    row_predictions[team1_row_count:].mean(),
]

# Determine the winning team based on the higher predicted number of wins
# (a tie goes to team 2, as the comparison is strict)
winning_team = team1_name if predicted_wins[0] > predicted_wins[1] else team2_name

# Print the predicted number of wins and the winning team
print(f"Predicted Number of Wins (Random Forest): {team1_name}: {predicted_wins[0]}, {team2_name}: {predicted_wins[1]}")
print(f"Predicted Winning Team: {winning_team}")

# R-squared on the same rows the model was fitted on; this measures fit to the
# training data only and is not an estimate of out-of-sample performance
training_accuracy = rf_model.score(X, y)

# Print the training accuracy
print(f"Training Accuracy (R-squared): {training_accuracy}")




Predicted Number of Wins (Random Forest): INJ: 21.6, BOS: 23.04
Predicted Winning Team: BOS
Training Accuracy (R-squared): 0.971510113337355
