# Replikacija rezultata
### Fran Maznik 

#### Ucitavanje prethodno pripremljenih podataka

In [112]:
import pandas as pd
import numpy as np

In [113]:
data = pd.read_csv('data/my_shot_logs.csv')
data.shape

(125751, 19)

In [114]:
y = data["FGM"]  

In [115]:
data.drop(["FGM", "SHOT_RESULT"], axis=1, inplace=True)


Koristeni su algoritmi su: logistička regresija, svm, neuronska mreža, naivni bayesov klasifikator, random forests i boosting.
Za logističku regresiju, svm i neurosnku mrežu odbacujemo nebrojčane značajke


In [116]:
X = data.drop(
["GAME_ID", "MATCHUP", "LOCATION", "W", "CLOSEST_DEFENDER" ,"CLOSEST_DEFENDER_PLAYER_ID", "player_name", "player_id"]
, axis=1)


In [117]:
X.head()

Unnamed: 0,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST
0,24,1,1,69,10.8,2,7.7,2,1.3
1,24,2,1,14,3.4,0,28.2,3,6.1
2,24,3,1,0,0.0,3,10.1,2,0.9
3,24,4,2,707,10.3,2,17.2,2,3.4
4,24,5,2,634,10.9,2,3.7,2,1.1


### Priprema podataka 
Podatci su podijeljeni u omjeru 95/5 za ucenje i ispitvanje

In [118]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.05)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


### 1. Logisticka regresija

In [119]:
from sklearn.metrics import  confusion_matrix

def my_confusion_matrix(y_true, y_predicted):
    labels = ["p", "n"]
    columns = [f"predicted {label}" for label in labels]
    index = [f"actual {label}" for label in labels]
    table = pd.DataFrame(confusion_matrix(y_true, y_predicted), columns=columns, index=index)
    return table


In [120]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(penalty="l2", C = 1 / 0.0001 ).fit(X_train,y_train)


In [121]:
accuracy_score(y_test, model.predict(X_test))

0.6049618320610687

Točnost u članku za logističku regresiju iznosi 59%

In [122]:
my_confusion_matrix(y_test, model.predict(X_test))

Unnamed: 0,predicted p,predicted n
actual p,2481,1001
actual n,1483,1323


### 2. SVM

In [123]:
from sklearn.model_selection import train_test_split


selected_indices = np.random.choice(len(data), size=40000, replace=False)

X_selected = X.iloc[selected_indices]
y_selected = y[selected_indices]

X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=0.05, random_state=42)


In [124]:
from sklearn.svm import SVC
model = SVC()

In [125]:
model.fit(X_train, y_train)
accuracy_score(y_test, model.predict(X_test))

0.5935

Točnost u članku za SVM iznosi 55%


In [126]:
my_confusion_matrix(y_test, model.predict(X_test))


Unnamed: 0,predicted p,predicted n
actual p,729,406
actual n,407,458


### 3. Neuronske mreze

In [72]:
from sklearn.model_selection import train_test_split


selected_indices = np.random.choice(len(data), size=90000, replace=False)

X_selected = X.iloc[selected_indices]
y_selected = y[selected_indices]

X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=0.05, random_state=42)


In [73]:
from sklearn.neural_network import MLPClassifier

In [74]:
clf = MLPClassifier(activation="relu",solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(50), max_iter = 3000)

In [75]:
clf.fit(X_train, y_train)
accuracy_score(y_test, model.predict(X_test))


0.6002222222222222

54% tocnost u članku

In [78]:
my_confusion_matrix(y_test, model.predict(X_test))


Unnamed: 0,predicted p,predicted n
actual p,1643,768
actual n,1031,1058


Za bayesov kalsifikator, random forest, XGB boost koriste kategoričke značajke

In [91]:
data.head()

X = data.drop(
[ "CLOSEST_DEFENDER" , "player_name", ], axis=1)
X.head()

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,SHOT_DIST,PTS_TYPE,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,69,10.8,2,7.7,2,101187,1.3,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,14,3.4,0,28.2,3,202711,6.1,203148
2,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,3,1,0,0.0,3,10.1,2,202711,0.9,203148
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,707,10.3,2,17.2,2,203900,3.4,203148
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,634,10.9,2,3.7,2,201152,1.1,203148


In [95]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


categorical_columns = ["GAME_ID", 'MATCHUP', 'LOCATION', "W", 'CLOSEST_DEFENDER_PLAYER_ID',"player_id"]
#["GAME_ID", "MATCHUP", "LOCATION", "W", "CLOSEST_DEFENDER" ,"CLOSEST_DEFENDER_PLAYER_ID", "player_name", "player_id"]


# Extract categorical columns and convert to numpy array
X_categorical = X[categorical_columns].values

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False)

# Fit and transform the categorical data
X_encoded = onehot_encoder.fit_transform(X_categorical)

# Create a DataFrame with the encoded data
df_encoded = pd.DataFrame(X_encoded, columns=onehot_encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns from the original DataFrame
X = X.drop(columns=categorical_columns)

# Concatenate the encoded DataFrame with the original DataFrame
X = pd.concat([X, df_encoded], axis=1)

# Display the resulting DataFrame with one-hot encoded variables
X.head()




Unnamed: 0,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST,GAME_ID_21400001,...,player_id_203925,player_id_203932,player_id_203935,player_id_203952,player_id_203957,player_id_203960,player_id_203994,player_id_204014,player_id_204037,player_id_204060
0,24,1,1,69,10.8,2,7.7,2,1.3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,2,1,14,3.4,0,28.2,3,6.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,3,1,0,0.0,3,10.1,2,0.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,24,4,2,707,10.3,2,17.2,2,3.4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,24,5,2,634,10.9,2,3.7,2,1.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4. Bayes

In [96]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.05)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [97]:
model = MultinomialNB()

model.fit(X_train, y_train)
print()




In [98]:
accuracy_score(y_test, model.predict(X_test))

0.5446882951653944

54% u clanku

In [99]:
my_confusion_matrix(y_test, model.predict(X_test))


Unnamed: 0,predicted p,predicted n
actual p,2287,1098
actual n,1765,1138


### 5. Random forest


In [127]:
from sklearn.model_selection import train_test_split


selected_indices = np.random.choice(len(data), size=100000, replace=False)

X_selected = X.iloc[selected_indices]
y_selected = y[selected_indices]

X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=0.05, random_state=42)


In [101]:
from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier()
model.fit(X_train, y_train)


In [102]:
accuracy_score(y_test, model.predict(X_test))


0.6091111111111112

55% u clanku

### 6. XGB boosting


In [106]:
import xgboost as xgb

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

clf = xgb.XGBClassifier(max_depth = 3,learning_rate=0.00001 ,objective='binary:logistic',min_child_weight= 0.0001, random_state=42)

clf.fit(X_train, y_train)

accuracy_score(y_test, clf.predict(X_test))


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


0.6111641221374046

68% u clanku