In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Load the historical team-stats dataset (includes the Position target) and preview it
stats_df = pd.read_csv(filepath_or_buffer='stats_target_clean2.csv')
stats_df.head(n=5)

Unnamed: 0,Position,Season,Tm,Team_Name,Lg,G,W,L,W/L%,MOV,...,eFG%,TS%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TS%,opp_TOV%,opp_ORB%,opp_FT/FGA
0,0,2020-21,WAS,Washington Wizards,NBA,72,34,38,0.472,-1.83,...,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,0,2020-21,UTA,Utah Jazz,NBA,72,52,20,0.722,9.25,...,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,0,2020-21,TOR,Toronto Raptors,NBA,72,27,45,0.375,-0.47,...,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,0,2020-21,SAS,San Antonio Spurs,NBA,72,33,39,0.458,-1.74,...,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,0,2020-21,SAC,Sacramento Kings,NBA,72,31,41,0.431,-3.68,...,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


Definition of Stats: 

    1. Position -- 1 = winner, 0 = not winner --- TARGET
    2. Season -- Season --- Save for reference
    3. Tm -- Team name abbreviation --- Save for reference
    4. Team_Name -- Name --- Save for reference
    5. Lg -- League --- Not needed
    6. G -- Games --- Not needed
    7. W -- Wins --- Not needed
    8. L -- Losses --- Not needed
    9. W/L% -- Win-Loss Percentage --- Needed
    10. MOV -- Margin of Victory --- Needed
    11. SOS -- Strength of Schedule; a rating of strength of schedule. The rating is denominated in points above/below average, where zero is average. --- Needed
    12. SRS -- Simple Rating System; a team rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average. --- Needed
    13. Pace -- Pace Factor: An estimate of possessions per 48 minutes --- Needed
    14. ORtg -- Offensive Rating An estimate of points produced (players) or scored (teams) per 100 possessions --- Needed
    15. DRtg -- Defensive Rating: An estimate of points allowed per 100 possessions --- Needed
    16. eFG% -- Effective Field Goal Percentage: This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal. --- Needed
    17. TS% -- True Shooting Percentage: A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws. --- Needed
    18. TOV% -- Turnover Percentage: An estimate of turnovers committed per 100 plays. --- Needed
    19. ORB% -- Offensive Rebound Percentage: An estimate of the percentage of available offensive rebounds a player grabbed while they were on the floor. --- Needed
    20. FT/FGA -- Free Throws Per Field Goal Attempt --- Needed
    21. opp_eFG% -- Opponent Effective Field Goal Percentage --- Needed
    22. opp_TS% -- Opponent True Shooting Percentage --- Needed
    23. opp_TOV% -- Opponent Turnover Percentage --- Needed
    24. opp_ORB% -- Opponent Offensive Rebound Percentage --- Needed
    25. opp_FT/FGA -- Opponent Free Throws Per Field Goal Attempt --- Needed

In [10]:
# Keep the identifying columns (season, team abbreviation, team name) in their own frame
reference_columns = ['Season', 'Tm', 'Team_Name']
name_df = stats_df[reference_columns]
name_df.head()

Unnamed: 0,Season,Tm,Team_Name
0,2020-21,WAS,Washington Wizards
1,2020-21,UTA,Utah Jazz
2,2020-21,TOR,Toronto Raptors
3,2020-21,SAS,San Antonio Spurs
4,2020-21,SAC,Sacramento Kings


In [11]:
# Remove the league, identifier and raw game-count columns from the modelling frame.
# NOTE: the drop is in place, so re-running this cell raises KeyError once the columns are gone.
stats_df.drop(
    columns=['Lg', 'Season', 'Tm', 'Team_Name', 'G', 'W', 'L'],
    inplace=True,
)

In [12]:
# Preview the first rows of stats_df after the column drop
stats_df.head()

Unnamed: 0,Position,W/L%,MOV,SOS,SRS,Pace,ORtg,DRtg,eFG%,TS%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TS%,opp_TOV%,opp_ORB%,opp_FT/FGA
0,0,0.472,-1.83,-0.01,-1.85,104.1,111.2,113.0,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,0,0.722,9.25,-0.29,8.97,98.5,117.6,108.3,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,0,0.375,-0.47,-0.07,-0.54,99.2,112.0,112.5,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,0,0.458,-1.74,0.15,-1.58,98.9,111.0,112.8,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,0,0.431,-3.68,0.23,-3.45,100.0,113.6,117.2,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [13]:
# Separate the Position target from the feature columns
y = stats_df["Position"]
X = stats_df.drop("Position", axis=1)

# Hold out a test set, stratified on the target to keep the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Fit a StandardScaler on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits
# NOTE(review): the fit/score cells visible in this notebook use X_train / X_test,
# not these scaled arrays -- confirm whether the scaled data was meant to be used.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

In [14]:
# Create a logistic regression model; max_iter is set to 1000 iterations
# (the last line displays the unfitted estimator)
classifier = LogisticRegression(max_iter=1000)
classifier

LogisticRegression(max_iter=1000)

In [15]:
# Fit (train) our model by using the training data
# NOTE(review): this fits on the unscaled X_train; X_train_scaled is computed in an
# earlier cell but is not used here -- confirm which one was intended.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [16]:
# Report mean accuracy on the training split and on the held-out test split
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9685264663805436
Testing Data Score: 0.9743589743589743


## Import new data set

In [17]:
# Load the current-season team stats and preview them
cy_df = pd.read_csv(filepath_or_buffer='cy_stats.csv')
cy_df.head(n=5)

Unnamed: 0,Position,Season,Tm,Lg,G,W,L,W/L%,MOV,SOS,...,eFG%,TS%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TS%,opp_TOV%,opp_ORB%,opp_FT/FGA
0,0,2021-22,MIA,NBA,6,5,1,0.833,17.0,1.07,...,0.525,0.565,13.6,27.5,0.197,0.464,0.501,13.4,15.9,0.181
1,0,2021-22,WAS,NBA,7,5,2,0.714,2.57,1.11,...,0.505,0.549,11.5,20.4,0.213,0.492,0.535,11.0,20.5,0.197
2,0,2021-22,UTA,NBA,6,5,1,0.833,12.83,-3.08,...,0.526,0.574,14.2,24.6,0.236,0.45,0.485,10.8,23.1,0.164
3,0,2021-22,TOR,NBA,8,5,3,0.625,4.63,1.25,...,0.487,0.518,12.1,29.8,0.158,0.521,0.554,16.8,20.6,0.178
4,0,2021-22,SAS,NBA,7,2,5,0.286,-0.43,-1.5,...,0.526,0.542,12.2,22.2,0.113,0.549,0.568,15.2,21.4,0.13


In [18]:
# Keep the current-season identifiers (season, team abbreviation) in their own frame
cy_reference_columns = ['Season', 'Tm']
cy_name_df = cy_df[cy_reference_columns]
cy_name_df.head()

Unnamed: 0,Season,Tm
0,2021-22,MIA
1,2021-22,WAS
2,2021-22,UTA
3,2021-22,TOR
4,2021-22,SAS


In [19]:
# Remove the league and identifier columns from the current-season frame (in place)
cy_df.drop(columns=['Lg', 'Season', 'Tm'], inplace=True)

In [21]:
# Remove the raw games / wins / losses counts as well (in place)
cy_df.drop(columns=['G', 'W', 'L'], inplace=True)

In [22]:
# Preview the first rows of cy_df after the column drops
cy_df.head()

Unnamed: 0,Position,W/L%,MOV,SOS,SRS,Pace,ORtg,DRtg,eFG%,TS%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TS%,opp_TOV%,opp_ORB%,opp_FT/FGA
0,0,0.833,17.0,1.07,18.07,99.9,112.2,95.5,0.525,0.565,13.6,27.5,0.197,0.464,0.501,13.4,15.9,0.181
1,0,0.714,2.57,1.11,3.68,99.0,108.8,106.3,0.505,0.549,11.5,20.4,0.213,0.492,0.535,11.0,20.5,0.197
2,0,0.833,12.83,-3.08,9.76,99.3,111.9,99.0,0.526,0.574,14.2,24.6,0.236,0.45,0.485,10.8,23.1,0.164
3,0,0.625,4.63,1.25,5.88,97.6,107.4,102.7,0.487,0.518,12.1,29.8,0.158,0.521,0.554,16.8,20.6,0.178
4,0,0.286,-0.43,-1.5,-1.93,101.7,106.6,107.0,0.526,0.542,12.2,22.2,0.113,0.549,0.568,15.2,21.4,0.13


In [23]:
# Separate the Position column from the current-season feature columns
y1 = cy_df["Position"]
X1 = cy_df.drop("Position", axis=1)

In [24]:
# Preview the current-season feature frame
X1.head()

Unnamed: 0,W/L%,MOV,SOS,SRS,Pace,ORtg,DRtg,eFG%,TS%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TS%,opp_TOV%,opp_ORB%,opp_FT/FGA
0,0.833,17.0,1.07,18.07,99.9,112.2,95.5,0.525,0.565,13.6,27.5,0.197,0.464,0.501,13.4,15.9,0.181
1,0.714,2.57,1.11,3.68,99.0,108.8,106.3,0.505,0.549,11.5,20.4,0.213,0.492,0.535,11.0,20.5,0.197
2,0.833,12.83,-3.08,9.76,99.3,111.9,99.0,0.526,0.574,14.2,24.6,0.236,0.45,0.485,10.8,23.1,0.164
3,0.625,4.63,1.25,5.88,97.6,107.4,102.7,0.487,0.518,12.1,29.8,0.158,0.521,0.554,16.8,20.6,0.178
4,0.286,-0.43,-1.5,-1.93,101.7,106.6,107.0,0.526,0.542,12.2,22.2,0.113,0.549,0.568,15.2,21.4,0.13


In [25]:
# Display the current-season Position column
y1

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: Position, dtype: int64

In [26]:
# Score the current-year teams with the classifier already fitted on the historical data.
# cy_stats.csv has no champion yet (every Position is 0), so fitting a new
# LogisticRegression on it fails with "needs samples of at least 2 classes";
# the current-year rows are inference-only input.
# The historical model was fit on unscaled features, so X1 is passed unscaled too,
# with its columns selected in the same order as the training frame
# (a missing column raises KeyError here instead of silently misaligning features).
cy_features = X1[X.columns]

# Column of predict_proba that corresponds to class 1 (1 = winner)
winner_column = list(classifier.classes_).index(1)

# Attach the predictions to the season / team identifiers saved earlier
cy_results_df = cy_name_df.assign(
    Prediction=classifier.predict(cy_features),
    Win_Probability=classifier.predict_proba(cy_features)[:, winner_column],
)

# Show the teams the model rates most likely to win first
cy_results_df.sort_values("Win_Probability", ascending=False).head()

In [None]:
# joined_df = test_df.join(stats_df)
# joined_df.head(100)

In [None]:
# full_df = joined_df.join(name_df)
# full_df.head()

In [None]:
# full_df.columns

In [None]:
# full2_df = full_df[['Season',
#        'Tm', 'Team_Name','Prediction', 'Actual', 'Position', 'G', 'W', 'L', 'W/L%', 'MOV', 'SOS',
#        'SRS', 'Pace', 'ORtg', 'DRtg', 'eFG%', 'TS%', 'TOV%', 'ORB%', 'FT/FGA',
#        'opp_eFG%', 'opp_TS%', 'opp_TOV%', 'opp_ORB%', 'opp_FT/FGA' 
# ]]
# full2_df.head()

In [None]:
# full2_df.groupby('Season').sum()

In [None]:
# y = name_df.Team_Name
# X = name_df.drop(columns=["Team_Name"])
# y

In [None]:
# # Split training/test datasets
# X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# y1_test