In [149]:
import pandas as pd
import numpy as np
import pycaret.classification as pc
import matplotlib.pyplot as plt
import mlflow

In [150]:
# Configurar mlflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

experiment_name = 'Projeto Kobe'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment(experiment_id)
experiment_id = experiment.experiment_id

In [151]:
df_dev = pd.read_parquet('D:\\infnet\\engenharia_ml\\kobe_dataset\\data\\raw\\dataset_kobe_dev.parquet')
data_cols = ['lat','lon','minutes_remaining','period','playoffs','shot_distance','shot_made_flag']
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24271 entries, 0 to 30696
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         24271 non-null  object 
 1   combined_shot_type  24271 non-null  object 
 2   game_event_id       24271 non-null  int64  
 3   game_id             24271 non-null  int64  
 4   lat                 24271 non-null  float64
 5   loc_x               24271 non-null  int64  
 6   loc_y               24271 non-null  int64  
 7   lon                 24271 non-null  float64
 8   minutes_remaining   24271 non-null  int64  
 9   period              24271 non-null  int64  
 10  playoffs            24271 non-null  int64  
 11  season              24271 non-null  object 
 12  seconds_remaining   24271 non-null  int64  
 13  shot_distance       24271 non-null  int64  
 14  shot_made_flag      20285 non-null  float64
 15  shot_type           24271 non-null  object 
 16  shot_zone

In [152]:
df_dev = df_dev[data_cols]
df_dev.head()

Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance,shot_made_flag
0,33.9723,-118.1028,10,1,0,18,
1,34.0443,-118.4268,10,1,0,15,0.0
2,33.9093,-118.3708,7,1,0,16,1.0
3,33.8693,-118.1318,6,1,0,22,0.0
4,34.0443,-118.2698,6,2,0,0,1.0


In [153]:
df_dev.dropna(inplace=True)

In [154]:
df_dev.shape

(20285, 7)

In [155]:
df_dev.head()

Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance,shot_made_flag
1,34.0443,-118.4268,10,1,0,15,0.0
2,33.9093,-118.3708,7,1,0,16,1.0
3,33.8693,-118.1318,6,1,0,22,0.0
4,34.0443,-118.2698,6,2,0,0,1.0
5,34.0553,-118.4148,9,3,0,14,0.0


In [165]:
from sklearn.model_selection import train_test_split
# Criar fluxo mlflow
# executa mlflow.db -> mlflow ui --backend-store-uri sqlite///mlruns.db
with mlflow.start_run(experiment_id=experiment_id,run_name='PreparacaoDados'):
    df_dev=df_dev[data_cols].copy()
    train_perc = 0.8

    xtrain, xtest, ytrain, ytest = train_test_split(
        df_dev[['lat','lon','minutes_remaining','period','playoffs','shot_distance']],
        df_dev['shot_made_flag'],
        train_size = train_perc,
        stratify=df_dev['shot_made_flag']
    )
    
    xtrain['shot_distance'] = ytrain
    xtest['shot_distance'] = ytest
    
    xtrain.to_parquet("D:\\infnet\\engenharia_ml\\kobe_dataset\\data\\processed\\base_train.parquet")
    xtest.to_parquet("D:\\infnet\\engenharia_ml\\kobe_dataset\\data\\processed\\base_test.parquet")
    
    df_dev.to_parquet("D:\\infnet\\engenharia_ml\\kobe_dataset\\data\\processed\\data_filtered.parquet")

    mlflow.log_params({
        'perc_test': 1-train_perc,
        'colunas_filtro': data_cols
    })
    mlflow.log_metrics({
        'treino': xtrain.shape[0],
        'teste': xtest.shape[0]
        })

In [169]:
# Treinamento
exp = pc.setup(
    data=xtrain,
    target=xtest,
    test_data=xtest,
    normalize=True,
    log_experiment=False
)
list_models = exp.compare_models(['lr','dt'], n_select=2,sort='f1')
list_models

Unnamed: 0,Description,Value
0,Session id,3778
1,Target,shot_distance
2,Target type,Binary
3,Original data shape,"(20285, 6)"
4,Transformed data shape,"(20285, 6)"
5,Transformed train set shape,"(16228, 6)"
6,Transformed test set shape,"(4057, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.5319,0.5017,0.5768,0.5085,0.5404,0.0672,0.0678,0.026
lr,Logistic Regression,0.5566,0.5669,0.5053,0.5378,0.521,0.109,0.1092,0.641


  master_display_.apply(


[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        monotonic_cst=None, random_state=3778, splitter='best'),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=3778, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False)]

In [176]:
def plot_parameter_validation_curve(X,Y,param_name,grid_search,model,model_name_scoring,logx):
    print('Parameter:', param_name)
    print('GridSearch:', grid_search[param_name])
    print('Scoring:', scoring)
    plt.figure(figsize=(6,4))
    train_scores,test_scores=validation_curve(model,
                                             X=X,
                                             y=Y,
                                             param_name=param_name)

'Validation Curve.png'

In [175]:
import os
from sklearn.metrics import log_loss,f1_score

yhat_test=exp.predict_model(list_models[0])

 mlflow.log_metrics({
     'lr_log_loss':log_loss(yhat_test.xtest, yhat_test.prediction_label),
     'lr_f1':log_loss(yhat_test.xtest, yhat_test.prediction_label),
 })

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.5368,0.5167,0.5792,0.5133,0.5443,0.0769,0.0775


Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance,prediction_label,prediction_score
25685,33.908298,-118.365799,9,1,0,1.0,0,1.0000
6396,33.847301,-118.328796,9,3,0,0.0,1,1.0000
4791,34.024300,-118.156799,6,4,0,0.0,0,1.0000
18758,34.035301,-118.254799,7,3,0,1.0,1,1.0000
99,34.035301,-118.230797,11,3,0,0.0,0,1.0000
...,...,...,...,...,...,...,...,...
8451,34.044300,-118.152802,4,1,0,1.0,1,1.0000
3094,34.044300,-118.269798,4,1,0,1.0,1,0.6667
30343,33.940300,-118.268799,9,4,1,1.0,1,1.0000
2340,34.029301,-118.401802,8,3,0,0.0,0,1.0000
