In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

In [2]:
df = pd.read_csv("kobe.csv")
data = df.dropna() # Suppression des lignes(les tirs) dont nous ne connaissons pas l'issue(marqué ou manqué)
qualitative_vars = ["action_type", "combined_shot_type", "game_event_id", "game_id", 
                    "period", "playoffs", "season", "shot_type",
                    "shot_zone_area", "shot_zone_basic", "shot_zone_range", "team_name", "game_date", "matchup", "opponent"]
quantitative_vars = [col for col in df.columns if col not in qualitative_vars + ["shot_made_flag", "shot_id", 'team_id']]  # garder le shot_id ? 
X = data[quantitative_vars+qualitative_vars]  # Features : éléments ayant un impact lors de la prise du tir
y = data["shot_made_flag"]  # Target : issue du tir

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
label_encoder = LabelEncoder()

encoded_vars = []
for categorical_var in data[qualitative_vars].T.values:
    print(categorical_var.shape)
    encoded_vars.append(label_encoder.fit_transform(categorical_var))
    print(f"===== Encoded classes: {label_encoder.classes_}")

(25697,)
===== Encoded classes: ['Alley Oop Dunk Shot' 'Alley Oop Layup shot' 'Cutting Layup Shot'
 'Driving Bank shot' 'Driving Dunk Shot' 'Driving Finger Roll Layup Shot'
 'Driving Finger Roll Shot' 'Driving Floating Bank Jump Shot'
 'Driving Floating Jump Shot' 'Driving Hook Shot' 'Driving Jump shot'
 'Driving Layup Shot' 'Driving Reverse Layup Shot'
 'Driving Slam Dunk Shot' 'Dunk Shot' 'Fadeaway Bank shot'
 'Fadeaway Jump Shot' 'Finger Roll Layup Shot' 'Finger Roll Shot'
 'Floating Jump shot' 'Follow Up Dunk Shot' 'Hook Bank Shot' 'Hook Shot'
 'Jump Bank Shot' 'Jump Hook Shot' 'Jump Shot' 'Layup Shot'
 'Pullup Bank shot' 'Pullup Jump shot' 'Putback Dunk Shot'
 'Putback Layup Shot' 'Putback Slam Dunk Shot' 'Reverse Dunk Shot'
 'Reverse Layup Shot' 'Reverse Slam Dunk Shot' 'Running Bank shot'
 'Running Dunk Shot' 'Running Finger Roll Layup Shot'
 'Running Finger Roll Shot' 'Running Hook Shot' 'Running Jump Shot'
 'Running Layup Shot' 'Running Pull-Up Jump Shot'
 'Running Reverse Lay

In [5]:
label_encoders = {}

# Pour chaque feature catégorielle, créez un LabelEncoder et stockez-le dans le dictionnaire.
for feature in qualitative_vars:
    le = LabelEncoder()
    # Ajuster le LabelEncoder sur les données textuelles.
    data.loc[:, feature] = le.fit_transform(data[feature])
    
    # Stocker le LabelEncoder dans le dictionnaire en utilisant le nom de la feature comme clé.
    label_encoders[feature] = le

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoders = {feature: LabelEncoder().fit(data[feature]) for feature in qualitative_vars}
# Supposons que vous avez un DataFrame 'data', une liste 'categorical_features'
# et un dictionnaire 'label_encoders' qui contient chaque LabelEncoder utilisé pour chaque feature.
# 'target' est la variable cible continue.
data_copy = data.copy() # Créer une copie explicite du DataFrame
# Pour chaque feature catégorielle:
for feature in qualitative_vars:
    # Obtenir l'objet LabelEncoder pour la feature.
    le = label_encoders[feature]
    
    # Inverser le LabelEncoder pour récupérer les catégories originales.
    data_copy[feature + '_original'] = le.inverse_transform(data[feature].astype(int))
    
    # Calculer les moyennes de la variable cible pour chaque catégorie originale.
    means = data_copy.groupby(feature + '_original')['shot_made_flag'].mean()
    
    # Mappez les moyennes sur les valeurs encodées en utilisant les catégories originales.
    data_copy[feature + '_encoded'] = data_copy[feature + '_original'].map(means)

# Après cela, 'data' contiendra de nouvelles colonnes avec les encodages basés sur la réponse.
# Vous pouvez maintenant utiliser ces nouvelles colonnes dans vos calculs de corrélation ou dans les modèles prédictifs.

# Assurez-vous de retirer les colonnes intermédiaires si elles ne sont pas nécessaires.
data_copy.drop([feature + '_original' for feature in qualitative_vars], axis=1, inplace=True)

In [7]:
# Calculer la matrice de corrélation
correlation_matrix = data_copy.corr()

# Obtenir la corrélation de 'shot_made_flag' avec toutes les autres variables
target_correlation = correlation_matrix['shot_made_flag']

# Afficher les corrélations entre 'shot_made_flag' et les colonnes encodées
for feature in qualitative_vars:
    encoded_feature_name = feature + '_encoded'
    if encoded_feature_name in data_copy.columns:
        print(f"shot_made_flag et {encoded_feature_name}: {target_correlation[encoded_feature_name]}")

shot_made_flag et action_type_encoded: 0.37654915475487005
shot_made_flag et combined_shot_type_encoded: 0.24652540101945739
shot_made_flag et game_event_id_encoded: 0.15738032249532216
shot_made_flag et game_id_encoded: 0.24289114153332728
shot_made_flag et period_encoded: 0.03836425742636405
shot_made_flag et playoffs_encoded: 0.0012568981415939943
shot_made_flag et season_encoded: 0.049762515365669266
shot_made_flag et shot_type_encoded: 0.12146225672937402
shot_made_flag et shot_zone_area_encoded: 0.14862473586264693
shot_made_flag et shot_zone_basic_encoded: 0.20732578770279805
shot_made_flag et shot_zone_range_encoded: 0.18721452309462025
shot_made_flag et team_name_encoded: nan
shot_made_flag et game_date_encoded: 0.24289114153332728
shot_made_flag et matchup_encoded: 0.05439574857945151
shot_made_flag et opponent_encoded: 0.035722169287886456


In [8]:
X_categorical = np.array(encoded_vars).T

In [5]:
# Fit the OneHotEncoder to the categorical variables.
one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder.fit(data[qualitative_vars])



In [6]:
# Transform the categorical variables and convert to DataFrame.
one_hot_encoded_data = one_hot_encoder.transform(data[qualitative_vars])

In [8]:
one_hot_encoded_data.shape

(25697, 3952)

In [5]:
# Make sure to properly align the indices and columns
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data, 
                                  index=data.index, 
                                  columns=one_hot_encoder.get_feature_names_out(qualitative_vars))

In [6]:
# Now join the one-hot encoded DataFrame with the target variable `y`.
# Ensure that `y` has the same index as `one_hot_encoded_df`.
merged_df = pd.concat([one_hot_encoded_df, data['shot_made_flag'].reset_index(drop=True)], axis=1)

In [8]:
# Now calculate the correlation matrix.
correlation_matrix = merged_df.corr()

In [None]:
# Get the correlation of all variables with 'shot_made_flag'.
correlation_with_target = correlation_matrix.loc[:, 'shot_made_flag']

# Show the correlation with the target variable.
print(correlation_with_target)

In [None]:
# Reset the index of `data` to align with the new DataFrame.
data.reset_index(drop=True, inplace=True)

# Join the one-hot encoded DataFrame with the target variable `y`.
one_hot_encoded_df = pd.concat([one_hot_encoded_df, data[['shot_made_flag']]], axis=1)

# Now calculate the correlation matrix.
correlation_matrix = one_hot_encoded_df.corr()

# Get the correlation of all variables with 'shot_made_flag'.
correlation_with_target = correlation_matrix['shot_made_flag']

# Show the correlation with the target variable.
print(correlation_with_target)

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
correlation_matrix = data[quantitative_vars+["shot_made_flag"]].corr()
correlation_matrix['shot_made_flag']

lat                  0.148070
loc_x               -0.000848
loc_y               -0.148070
lon                 -0.000848
minutes_remaining    0.028342
seconds_remaining    0.030804
shot_distance       -0.198242
shot_made_flag       1.000000
Name: shot_made_flag, dtype: float64

On observe que les variables qui jouent le plus sont 'lat' 'loc_y' et 'shot distance' On les inscrit dans qualitative_vars

In [13]:
one_hot_encoded_X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
one_hot_encoder.get_feature_names_out()

array(['action_type_Alley Oop Dunk Shot',
       'action_type_Alley Oop Layup shot',
       'action_type_Cutting Finger Roll Layup Shot', ..., 'opponent_UTA',
       'opponent_VAN', 'opponent_WAS'], dtype=object)

In [16]:
# Utilisation de OneHotEncoder pour les variables catégorielles
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), qualitative_vars)
    ],
    remainder='passthrough'
)

In [15]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [14]:
def apprentissage(preprocessor, classifier, X_train, y_train, X_test, y_test):
    # Création du pipeline
    pipeline = Pipeline([
        ('classifier', classifier)
    ])
    # Entraînement du modèle
    pipeline.fit(X_train, y_train)

    # Prédiction sur l'ensemble de test
    prediction = pipeline.predict(X_test)

    # Évaluation du modèle
    accuracy = accuracy_score(y_test, prediction)
    precision = precision_score(y_test, prediction, zero_division=0)
    recall = recall_score(y_test, prediction)    
    print(f"a = {accuracy}, p = {precision}, r = {recall}")

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Choix et optimisation de l'algorithme d'apprentissage

In [39]:
# Choix de l'algorithme de classification et de ses paramètres
for k in range(41, 91, 10):
    #classifier = LogisticRegression(max_iter=10000)
    classifier = KNeighborsClassifier(n_neighbors=k, p=1)
    #classifier = RandomForestClassifier(max_depth=6, random_state=0)
    print(f"========= Accuracy of our model for i = {k} ============")
    apprentissage(preprocessor, classifier)

a = 0.5979180854168693, p = 0.5700063211125158, r = 0.3941215034965035
a = 0.598307228329604, p = 0.5728725138571894, r = 0.3839597902097902
a = 0.6005448000778286, p = 0.5812586445366529, r = 0.3673513986013986
a = 0.6014203716314817, p = 0.5854136947218259, r = 0.35871940559440557
a = 0.6018581574083082, p = 0.5881816523800839, r = 0.352381993006993


In [18]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Supposons que X_train et y_train contiennent vos données d'entraînement

# Créer un objet de régression Lasso
lasso = Lasso(alpha=0.1)  # Vous pouvez ajuster alpha selon vos besoins

# Diviser les données en ensembles d'entraînement et de test
X = data_copy[qualitative_vars+quantitative_vars]
y = data_copy["shot_made_flag"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.8, random_state=0)


# Sélectionner les fonctionnalités les plus importantes
selector = SelectFromModel(lasso)
selector.fit(X_train, y_train)

# Obtenir les indices des fonctionnalités sélectionnées
selected_features_indices = selector.get_support(indices=True)

# Obtenir les noms des fonctionnalités sélectionnées
selected_features_names = X_train.columns[selected_features_indices]

# Créer un nouveau DataFrame avec seulement les fonctionnalités sélectionnées
X_train_selected = X_train[selected_features_names]


In [22]:
for k in range(1):
    #classifier = LogisticRegression(max_iter=10000)
    classifier = KNeighborsClassifier(n_neighbors=41, p=1)
    #classifier = RandomForestClassifier(max_depth=6, random_state=0)
    print(f"========= Accuracy of our model for i = {k} ============")
    apprentissage(preprocessor, classifier, X_train, y_train, X_test, y_test)

a = 0.5879462982780426, p = 0.5447849533079048, r = 0.45257867132867136
