# Kobe Bryant - Análise Preditiva de Acertos 

#### Importação das bibliotecas e dataset

In [1]:
import pandas as pd
import pycaret.classification as pc
import mlflow
import mlflow.sklearn
import os
import requests
from sklearn import linear_model, preprocessing, metrics, model_selection
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

In [2]:
# Switch the working directory so the relative '../Data/...' paths used below resolve.
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
# Raw string: the backslashes are path separators, not escape sequences
# (the non-raw form only worked because '\G', '\O', ... are invalid escapes).
os.chdir(r'/Users\Guilherme\OneDrive - Firjan\Estudo\Infnet\Módulo_2\Projeto\kobeb_shot_prediction\Code')

In [3]:
# Load the raw shot log and preview the first rows.
# Forward slashes match the other '../Data/...' paths in this notebook and avoid
# the invalid '\D' / '\k' escape sequences of the backslash form.
df = pd.read_csv('../Data/kobe_dataset.csv', sep=',')
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [4]:
# Reload the raw data and keep only the shots whose outcome is known
# (rows with a missing 'shot_made_flag' are dropped), then show the resulting shape.
# Forward slashes avoid the invalid '\D' / '\k' escapes of the backslash form.
df_full = pd.read_csv('../Data/kobe_dataset.csv', sep=',')
df = df_full.dropna(subset=['shot_made_flag'])
df.shape

(25697, 25)

In [5]:
# Model-registry and approval settings consumed by the approval cell further down.
nexamples = 4                                  # number of rows taken for the MLmodel input example
model_version = -1                             # -1 means: pick up the latest registered version
min_precision = 0.6                            # holdout precision the model must exceed to be approved
registered_model_name = 'modelo_kobeb_shots'   # name under which the model is registered in MLflow

#### Preparação dos dados

In [6]:
# Use a local SQLite database as the MLflow tracking store
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Look the experiment up by name; create it on first use and fetch the new record.
experiment_name = 'kobe_shot_prediction'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment = mlflow.get_experiment(mlflow.create_experiment(experiment_name))

# Every run started below is attached to this experiment id.
experiment_id = experiment.experiment_id

2022/04/24 10:16:48 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2022/04/24 10:16:48 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [7]:
# Data-preparation run
# Parameters: features, test percentage, split seed
# Metrics: number of rows of each dataset
# Artifacts: none

features = ['lat','lon','minutes_remaining', 'period', 'playoffs', 'shot_distance']
target_col = 'shot_made_flag'
test_size = 0.2
split_seed = 41  # fixed seed so the train/test split is identical on every re-run

# The parquet writes below fail if their target folders are missing.
for out_dir in ('../Data/processed', '../Data/operalization', '../Data/novelty'):
    os.makedirs(out_dir, exist_ok=True)

with mlflow.start_run(experiment_id=experiment_id, run_name = 'PreparacaoDados'):

    # Raw data -> shots with a known outcome -> 2-point shots only
    df_full = pd.read_csv('../Data/kobe_dataset.csv', sep=',')
    df = df_full.dropna(subset=[target_col])
    df_2pt = df[df['shot_type'] == '2PT Field Goal'].copy()

    # Persist the filtered 2-point data (all columns)
    df_2pt.to_parquet('../Data/processed/data_filtered.parquet')

    # Stratified train/test split on the selected features.
    # Uses the same `test_size` that is logged below, so the logged value
    # can never drift from the one actually applied.
    Y  = df_2pt[target_col]
    df_2pt = df_2pt[features]
    data_train, data_test, ytrain, ytest = model_selection.train_test_split(
        df_2pt, Y, test_size=test_size, stratify=Y, random_state=split_seed)
    # Re-attach the target to each split by row index
    data_train = pd.merge(data_train, ytrain, left_index=True, right_index=True)
    data_test = pd.merge(data_test, ytest, left_index=True, right_index=True)

    # Persist the train and test sets
    data_train.to_parquet('../Data/operalization/data_train.parquet')
    data_test.to_parquet('../Data/operalization/data_test.parquet')

    # 3-point shots are kept apart as the "novelty" dataset
    data_novelty = df[df['shot_type'] == '3PT Field Goal'].copy()
    data_novelty = data_novelty[features + [target_col]]
    data_novelty.to_parquet('../Data/novelty/data_novelty.parquet')

    # Log the run parameters and dataset sizes
    mlflow.log_param("features", features)
    mlflow.log_param("percent_teste", test_size)
    mlflow.log_param("split_seed", split_seed)
    mlflow.log_metric("data_dev", data_train.shape[0])
    mlflow.log_metric("data_operation", data_test.shape[0])
    mlflow.log_metric("data_novelty", data_novelty.shape[0])

# No explicit mlflow.end_run() needed: the `with` block closes the run on exit.

print('== Bases de Dados ==')
print(f'data_dev {data_train.shape}')
print(f'data_operation {data_test.shape}')
print(f'data_novelty {data_novelty.shape}')
print(f'Columns: {data_train.columns}')

== Bases de Dados ==
data_dev (16228, 7)
data_operation (4057, 7)
data_novelty (5412, 7)
Columns: Index(['lat', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'shot_distance', 'shot_made_flag'],
      dtype='object')


#### Treinamento dos modelos

In [None]:
# Setup run: configure the PyCaret classification experiment
# Parameters: none
# Metrics: none
# Artifacts: none

# Reload the persisted splits so this cell does not depend on in-memory frames
data_train = pd.read_parquet('../Data/operalization/data_train.parquet')
data_test = pd.read_parquet('../Data/operalization/data_test.parquet')

reg = pc.setup(data=data_train,
                target=target_col,
                test_data=data_test,
                preprocess=True,
                normalize=True,
                log_experiment = True,
                log_plots = True,
                experiment_name = experiment_name,
                normalize_method='minmax',
                transformation=True,
                remove_multicollinearity=True,
                multicollinearity_threshold=0.9,
                fold_strategy='stratifiedkfold',
                fold = 5,
                silent = True,
                session_id=41
              )

# Log loss is defined on predicted probabilities and is a loss (lower is better).
# The add_metric defaults score hard labels and treat the value as
# greater-is-better, so both are set explicitly here.
pc.add_metric('logloss', 'Log Loss', metrics.log_loss,
              target='pred_proba', greater_is_better=False)

# Plots generated and logged as artifacts by the training cells
classification_plots = ['auc',
                        'pr',
                        'confusion_matrix',
                        'threshold',
                        'learning',
                        'vc',
                        'feature']

# Close any MLflow run that is still active so the next cell starts from a clean state
while mlflow.active_run() is not None:
    mlflow.end_run()

In [None]:
# Logistic-regression training run
# Parameters: probability_threshold, cross_validation
# Metrics: metrics computed by PyCaret + logloss
# Artifacts: plots

probability_threshold = 0.6
cross_validation = True

with mlflow.start_run(experiment_id=experiment_id, run_name = 'Treinamento Lr'):

    # Record the parameters announced in the header on this run
    # (they were applied to the model before but never logged).
    mlflow.log_param("probability_threshold", probability_threshold)
    mlflow.log_param("cross_validation", cross_validation)

    model_name = 'lr'
    best_model = pc.create_model(model_name,
                                cross_validation = cross_validation,
                                probability_threshold=probability_threshold)

    for plot_type in classification_plots:
        print('=> Aplicando plot ', plot_type)
        try:
            artifact = pc.plot_model(best_model, plot=plot_type, save=True, use_train_data=False)
            mlflow.log_artifact(artifact)
        except Exception as exc:
            # Best effort: a failing plot must not abort the run, but report why.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the cell.
            print('=> Nao possivel plotar: ', plot_type, '-', exc)

    # File name is derived from the repr of the wrapped estimator
    # (the text before the first "(").
    # NOTE(review): relies on the returned model exposing `.classifier` — confirm
    # for the installed PyCaret version.
    pc.save_model(best_model, f'./{str(best_model.classifier).split("(")[0]}')

# Close any MLflow run that is still active
while mlflow.active_run() is not None:
    mlflow.end_run()

In [None]:
# Selection and training run for the second classification model
# Parameters: probability_threshold, cross_validation
# Metrics: metrics computed by PyCaret + logloss
# Artifacts: plots

# NOTE(review): probability_threshold is declared here but not applied by
# compare_models below — confirm whether it should be.
probability_threshold = 0.6
cross_validation = True

with mlflow.start_run(experiment_id=experiment_id, run_name = 'Treinamento 2º modelo'):

    # Pick the better of decision tree and SVM, ranked by F1
    best_model = pc.compare_models(n_select = 1, sort='f1', include=['dt', 'svm'],
                                   cross_validation = cross_validation)

    for plot_type in classification_plots:
        print('=> Aplicando plot ', plot_type)
        try:
            artifact = pc.plot_model(best_model, plot=plot_type, save=True, use_train_data=False)
            mlflow.log_artifact(artifact)
        except Exception as exc:
            # Best effort: a failing plot must not abort the run, but report why.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the cell.
            print('=> Nao possivel plotar: ', plot_type, '-', exc)

    # File name is the estimator's repr up to the first "("
    pc.save_model(best_model, f'./{str(best_model).split("(")[0]}')

# Close any MLflow run that is still active
while mlflow.active_run() is not None:
    mlflow.end_run()

#### Aprovação do modelo

In [None]:
# Model-approval run
# Parameters: min_precision
# Metrics: new_version, precision
# Artifacts: none

# Load the saved pipeline + model
model_pipe = pc.load_model('./LogisticRegression')

with mlflow.start_run(experiment_id=experiment_id, run_name = 'AprovacaoModelo'):
    pred_holdout = pc.predict_model(model_pipe)
    # NOTE(review): pos_label is the *string* '1.0'; this assumes the target and
    # 'Label' columns returned by predict_model hold string labels — confirm.
    pr = metrics.precision_score(pred_holdout[target_col], pred_holdout['Label'], pos_label='1.0')

    # Version reported for this run. Kept in a local so the configured
    # `model_version` (-1 = latest) is not overwritten: re-running the cell
    # must resolve the latest version again instead of reusing a stale one.
    approved_version = model_version

    if pr > min_precision:
        print(f'=> Aceito o modelo com precisão {pr} (min: {min_precision})')
        pred_holdout.to_parquet('modelo_kobebshot_teste.parquet')
        # Model signature inferred by MLflow from the training features and predictions
        model_features = list(data_train.drop(target_col, axis=1).columns)
        inf_signature = infer_signature(data_train[model_features], model_pipe.predict(data_train))
        # Input example stored with the MLmodel
        input_example = {x: data_train[x].values[:nexamples] for x in model_features}
        # Log the sklearn modelling pipeline and register it as a new version
        mlflow.sklearn.log_model(
            sk_model=model_pipe,
            artifact_path="sklearn-model",
            registered_model_name=registered_model_name,
            signature = inf_signature,
            input_example = input_example
        )
        # MLflow client used to resolve and promote the model version
        client = MlflowClient()
        if model_version == -1:
            # get_latest_versions yields the latest version of each stage, so the
            # last list element is not necessarily the newest; the version just
            # registered is the one with the highest number.
            approved_version = max(
                int(v.version) for v in client.get_latest_versions(registered_model_name)
            )
        # Promote the chosen version to Staging
        client.transition_model_version_stage(
            name=registered_model_name,
            version=approved_version,
            stage="Staging"
        )
    else:
        print(f'=> Rejeitado o modelo com precisão {pr} (min: {min_precision})')

    # Run parameters
    mlflow.log_param("precisao_minima", min_precision)

    # Run metrics (the version is cast because metrics must be numeric)
    mlflow.log_metric("new_version", int(approved_version))
    mlflow.log_metric("precisao", pr)

# Close any MLflow run that is still active
while mlflow.active_run() is not None:
    mlflow.end_run()

In [None]:
#client.delete_model_version(name=registered_model_name, version=2)

In [None]:
#client.delete_registered_model(name=registered_model_name)

In [None]:
#!mlflow ui --backend-store-uri sqlite:///mlruns.db --port 5005

In [None]:
# Point the MLflow CLI at the same SQLite tracking store used by this notebook,
# then serve the Staging version of the registered model over HTTP on port 5005.
# NOTE(review): a `!` shell command runs in the foreground, so this cell presumably
# keeps the kernel busy while the server is up — confirm how the request cell
# below is meant to be executed.
os.environ['MLFLOW_TRACKING_URI'] = 'sqlite:///mlruns.db'
!mlflow models serve -m "models:/modelo_kobeb_shots/Staging" --no-conda -p 5005

In [None]:
# Send the novelty (3-point) dataset to the locally served model for scoring
host = 'localhost'
port = '5005'
url = f'http://{host}:{port}/invocations'
headers = {'Content-Type': 'application/json',}
request_timeout = 30  # seconds; without a timeout the request can wait forever

data_novelty = pd.read_parquet('../Data/novelty/data_novelty.parquet')

# Features only (target dropped), serialized in pandas "split" orientation
http_data = data_novelty.drop(target_col,axis=1).to_json(orient='split')
r = requests.post(url=url, headers=headers, data=http_data, timeout=request_timeout)
# Fail loudly on a 4xx/5xx answer instead of silently keeping an error response
r.raise_for_status()

#data_novelty.loc[:, 'operation_label'] = pd.read_json(r.text).values[:,0]

#data_novelty.to_parquet('modelo_kobebshot_operacao.parquet')