In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

#### Reading in the CSV file and cleaning the data set is the first step of the machine learning analysis.

Here we will create a list of the column names and a list with the target column. These lists will be used after we load in the csv file in order to keep the columns that are needed in the analysis.

In [7]:
# Names of the columns intended to be kept as model inputs.
# NOTE(review): "Division" does not appear among the columns of the frame
# displayed after loading the CSV -- confirm it exists in the data.
columns = [
    "Age",
    "Year",
    "Team",
    "Division",
    "League",
    "Percent_Season_Played",
    "Num_DL_Movements",
    "Percent_Contract_Complete",
    "Player_Salary",
    "Team_Payroll",
    "Team_Win_Percentage",
]

# Label column, kept as a one-element list so that selecting with it
# returns a DataFrame rather than a Series.
target = ["WAR_PSP"]


Loading the data and cleaning it are the next steps in this analysis. Once the data is clean, we can separate it into training and testing sets; the resampling methods are then applied to the training data.

In [10]:
MLB_data = Path('Resources/all_players.csv')

# Read the raw player table, discard every row that contains a missing value
# (only rows are dropped; columns are left untouched), and renumber the index
# from zero so the removed rows leave no gaps.
MLB_df = (
    pd.read_csv(MLB_data)
    .dropna()
    .reset_index(drop=True)
)

# TODO: the target column still has to be converted into "high_WAR" /
# "low_WAR" labels using a boundary value that has not been chosen yet.
# The sketched approach was to build a mapping dict (e.g. with dict.fromkeys)
# and pass it to MLB_df.replace(...) once the final csv file has been inspected.

# Preview the first rows of the cleaned frame.
MLB_df.head()

Unnamed: 0,Name,Age,Year,Team,League,Player_Salary,Percent_Season_Played,WAR_PSP,Num_DL_Movements,Team_Win_Percentage,Team_Payroll,Percent_Contract_Complete
0,Bobby Abreu,26,2000,PHI,NL,4354977,0.994676,6.213081,0,0.401235,69700930.0,33.333333
1,Bobby Abreu,27,2001,PHI,NL,7193330,1.029782,5.039903,0,0.530864,60144840.0,66.666667
2,Bobby Abreu,28,2002,PHI,NL,9000338,1.001989,5.818425,0,0.493827,82364470.0,100.0
3,Bobby Abreu,29,2003,PHI,NL,12643905,1.016617,5.282226,0,0.530864,98344580.0,16.666667
4,Bobby Abreu,30,2004,PHI,NL,14346025,1.042947,6.289872,0,0.530864,126162700.0,33.333333


#### Now we will split our data into training and test sets 


In [None]:
# Creating features
#
# Only the columns named in `columns` are used as model inputs. Dropping just
# the target would also keep "Unnamed: 0" (presumably a row index saved in the
# CSV) and "Name", and get_dummies would then create one indicator column per
# distinct player name.
# Names listed in `columns` but absent from the frame (e.g. "Division", which
# is not among the columns displayed after loading) are skipped rather than
# raising a KeyError.
feature_columns = [name for name in columns if name in MLB_df.columns]

# One-hot encode the remaining text columns; numeric columns pass through.
X = pd.get_dummies(MLB_df[feature_columns])

# Creating our target (selected with a list, so y is a one-column DataFrame)
y = MLB_df.loc[:, target].copy()

In [None]:
# Display summary statistics of the feature matrix as a quick sanity check.
X.describe()

In [None]:
# Tally how often each distinct target value occurs.
# NOTE(review): the rows displayed after loading show WAR_PSP as continuous
# floats, so this only reflects class balance once the high_WAR / low_WAR
# conversion has been implemented.
y.loc[:, "WAR_PSP"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation. No test_size is given, so the
# library default split applies; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)


### Method 1: Oversampling
Comparing two oversampling algorithms, naive random oversampling and SMOTE, will show us which of the two performs better on this data.

The steps within both oversampling methods are as follows

1. View the count of the target classes using Counter from the collections library.
    1. Viewing the count of the target classes shows how many samples of each class are used in this approach.
2. Use the resampled data to train a logistic regression model.
    1. Resampling the training data before fitting the logistic regression model is an attempt to give each class a more balanced representation.
3. Calculate the balanced accuracy score from sklearn.metrics.
    1. The balanced accuracy score (the average of the recall obtained on each class) tells us how well the model performs across all classes, not whether the result is statistically significant. In this analysis, a score of 95% or higher is treated as an accurate model.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn (`imblearn.metrics`).
