In [28]:
import numpy as np 
import pandas as pd 

from sklearn.metrics import cohen_kappa_score, accuracy_score,balanced_accuracy_score

from plotly import express as px

from UA_MDM_LDI_II.tutoriales.utils import plot_confusion_matrix, get_artifact_filename

import os

from json import loads

from joblib import load, dump

import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

In [29]:
# Filesystem locations used by the notebook, all expressed relative to BASE_DIR.
BASE_DIR = '../'

# Training CSV of the PetFinder adoption-prediction data.
PATH_TO_TRAIN = os.path.join(
    BASE_DIR,
    "input/petfinder-adoption-prediction/train/train.csv",
)

# Optuna working folders: temporary files and stored artifacts.
PATH_TO_TEMP_FILES = os.path.join(
    BASE_DIR,
    "work/optuna_temp_artifacts",
)
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(
    BASE_DIR,
    "work/optuna_artifacts",
)

In [30]:
# Open the existing LightGBM study. load_study fails loudly when the name is
# not present in the storage, whereas create_study(..., load_if_exists=True)
# would silently register a new, empty study in the shared SQLite database.
study_lgb = optuna.load_study(study_name="04 - LGB Multiclass CV",
                              storage="sqlite:///../work/db.sqlite3")  # Specify the storage URL here.

# Look up the file registered for this study under the 'test' key and load it.
# NOTE(review): get_artifact_filename is a project helper; presumably 'test'
# selects the held-out predictions frame - confirm against utils.
# joblib.load unpickles the file: only use it on artifacts you produced yourself.
lgb_dataset = load(os.path.join(PATH_TO_OPTUNA_ARTIFACTS,
                                get_artifact_filename(study_lgb, 'test')))

[I 2024-07-02 20:51:03,143] Using an existing study with name '04 - LGB Multiclass CV' instead of creating a new one.


In [31]:
# Show the frame loaded from the LightGBM study artifact; its `pred` column holds a score vector per row.
lgb_dataset

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,pred
14696,1,Dione & Elora,1,307,307,2,1,0,0,2,...,2,0,41327,61b07b54adb97d4b5f3c2dec06a9943b,0,Dione and Elora are puppies of Rambo. Both are...,8f20e24ef,9.0,4,"[0.08666475096475047, 0.878111520198947, 1.906..."
14823,1,Har-nee,24,103,307,2,1,2,4,2,...,1,0,41330,9cb2e5a10e24e0b09942013b8434c81f,0,We found Har-nee with a swollen and almost sev...,2d72ef0c4,2.0,4,"[0.08183787703801709, 0.9223673968501744, 1.05..."
2838,1,The Gorgeous 5 Beauties,2,307,0,2,2,7,0,2,...,5,0,41326,5c398b2e18b16f0db83c53e682eada42,0,Theses 5 very adorably cute white female puppi...,44cd12263,5.0,4,"[0.0406905043572476, 0.5592956712686047, 1.556..."
1848,2,Mochi,1,265,0,1,2,0,0,1,...,1,0,41401,6905e4fbe5658eef5f560b814898a5ee,2,Hello! My name is Mochi. I was rescued from a ...,210c4a637,6.0,2,"[0.17210865722003071, 1.2575731255969187, 1.78..."
669,2,Nala & Peach,9,266,266,2,2,4,6,2,...,2,0,41326,803457cd3660dda694086b51a11a5a39,0,Nala is a cat that's been born with 7 fingers ...,21493e6ea,8.0,4,"[0.0687026219625729, 0.6396451364675575, 1.460..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,2,Anak Nanya,8,266,0,3,1,2,0,2,...,3,50,41326,f14c2cfebbbafbc9ed1f500d082f3ec3,0,they r ol so cute :) it juz a matter me dun hv...,35f9818a7,14.0,4,"[0.06707134412708712, 0.6867559121795743, 1.15..."
12222,1,Poor Baby,3,307,0,1,5,0,0,2,...,1,0,41401,500c48db7b281eabec3c293160f4a71c,0,On behalf of Exotica Pets Healthy puppy availa...,46e25aa2b,2.0,1,"[0.058994130112302284, 1.4751886366695883, 1.3..."
10538,2,No Name,1,265,0,2,1,6,0,1,...,1,0,41401,ac9a633cf51a70f4a9842e6e1ba91fc9,0,sy jumpa kitten ni mengiau2 kat playground. ra...,d3692d2b2,2.0,1,"[0.1996945436640809, 2.21013620299541, 1.22173..."
11062,1,Pipi,1,307,0,2,1,5,7,2,...,6,0,41326,3ef66c1034bb6dc31314845457079483,0,"Health, cute and active puppies.",3c43b7541,1.0,4,"[0.08085562277686253, 0.850968034981315, 1.742..."


In [32]:
# The BERT study is stored under the name "<model name>_<model version>".
MODEL_NAME = '06 Bert'
MODEL_VERSION = '1.0.1'

# Open the existing BERT study. load_study fails loudly when the name is not
# present in the storage, whereas create_study(..., load_if_exists=True) would
# silently register a new, empty study in the shared SQLite database.
study_bert = optuna.load_study(study_name=f'{MODEL_NAME}_{MODEL_VERSION}',
                               storage="sqlite:///../work/db.sqlite3")  # Specify the storage URL here.

# Look up the file registered for this study under the 'test' key and load it.
# NOTE(review): get_artifact_filename is a project helper; presumably 'test'
# selects the held-out predictions frame - confirm against utils.
# joblib.load unpickles the file: only use it on artifacts you produced yourself.
bert_dataset = load(os.path.join(PATH_TO_OPTUNA_ARTIFACTS,
                                 get_artifact_filename(study_bert, 'test')))

[I 2024-07-02 20:51:04,802] Using an existing study with name '06 Bert_1.0.1' instead of creating a new one.


In [33]:
# Show the frame loaded from the BERT study artifact; its `pred` column holds a score vector per row.
bert_dataset

Unnamed: 0,PetID,pred,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PhotoAmt,AdoptionSpeed,labels
0,8e76c8e39,"[0.0013099476, 0.0024839537, 0.83914113, 0.048...",2,Kali,3,264,0,2,1,2,...,1,1,50,41326,a9caef3f98e67bfac9093cca79e20b93,0,Kali is a super playful kitten who is on the g...,2.0,1,1
1,6436c1a59,"[0.0032551226, 0.0013109609, 0.27809864, 0.043...",1,Godiva,12,307,0,2,2,7,...,1,1,0,41326,a042471e0f43f2cf707104a1a138a7df,0,Godiva was rescued in Serdang residential area...,7.0,2,2
2,988988d5b,"[0.00032985528, 0.04259599, 0.48512816, 0.4703...",2,Cikenet,3,266,0,1,2,7,...,1,1,0,41401,b8853c71b981104f1ef126e51387b616,0,"hello cikenets fans, i just wanna inform that ...",19.0,1,1
3,efbf1703a,"[0.001012655, 0.006620176, 0.7460893, 0.244253...",2,No Name,1,266,0,2,1,0,...,1,1,0,41326,2f846fb8f87a25678374e193559d83c9,0,"Just saved this kitten from the street, but i ...",2.0,2,2
4,543130f60,"[6.8802125e-05, 0.001439368, 0.9881124, 0.0094...",1,BoiBoi,24,307,0,1,5,7,...,1,5,0,41326,2147467fcd35e7a3bc23b9edcffc5702,0,Boiboi is rescued by my daughter 2 years ago f...,1.0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2991,23874f644,"[0.00014980059, 0.02329869, 0.9697697, 0.00543...",1,Patch,8,307,0,2,2,7,...,1,1,0,41326,001e42763024f9d4abe31e79472b1827,0,Patch is for free adoption. If you want to ado...,2.0,3,3
2992,e7f7066b6,"[0.017603166, 0.33848128, 0.12005673, 0.494514...",1,Terry,24,179,307,1,2,3,...,1,1,0,41326,719987dce7aeb027fdfa91b480800199,0,been at my place for a while..am hoping to fin...,0.0,4,4
2993,36e7f8d83,"[0.00051314593, 0.001854559, 0.09163376, 0.903...",2,Pets + Strays : BlueEyed BlackWhite,1,266,0,2,5,6,...,1,1,0,41401,90569c3f7cb0af35cba5dac82c0ac9d7,0,1 month old white + grey kitten for adoption n...,1.0,3,3
2994,4d163b731,"[0.0018222176, 0.15038805, 0.8187083, 0.018344...",1,Snowy,6,195,0,2,1,7,...,1,1,0,41401,79309f4027f2fedb4349a298c69fe56f,0,ooooo,1.0,0,0


In [34]:
# Number of AdoptionSpeed classes; length of the placeholder score vector below.
N_CLASSES = 5

lgb_scores = (lgb_dataset[['PetID', 'pred', 'AdoptionSpeed']]
              .rename(columns={'pred': 'lgb_pred_score'}))
bert_scores = (bert_dataset[['PetID', 'pred']]
               .rename(columns={'pred': 'bert_pred_score'}))

# The LightGBM frame is the reference: it carries the AdoptionSpeed labels.
# A left join keeps exactly those rows; an outer join could add BERT-only rows
# with NaN labels and NaN LightGBM scores, which break the argmax and kappa
# steps further down. sort=True keeps the PetID ordering the outer join produced.
merged_datasets = lgb_scores.merge(bert_scores, on='PetID', how='left', sort=True)

# Pets without a BERT prediction come out of the join as NaN (a float).
# Give them an all-zero score vector so they contribute nothing to the blend.
# isinstance also catches NaN instances of float subclasses (e.g. np.float64).
merged_datasets['bert_pred_score'] = [
    np.zeros(N_CLASSES) if isinstance(score, float) else score
    for score in merged_datasets['bert_pred_score']
]

In [35]:
# Show the joined frame: PetID, both models' score vectors and the true AdoptionSpeed.
merged_datasets

Unnamed: 0,PetID,lgb_pred_score,AdoptionSpeed,bert_pred_score
0,002230dea,"[0.17953245512077848, 1.5370255010913683, 1.87...",1,"[0.00012052871, 0.02271323, 0.9751661, 0.00168..."
1,0063f83c9,"[0.31385070416347927, 0.9413531297435047, 1.42...",1,"[0.0033665393, 0.11322637, 0.13855062, 0.73853..."
2,0073c33d0,"[0.034448279059874576, 0.8524923657157204, 1.6...",3,"[0.028002514, 0.052483853, 0.47461298, 0.42190..."
3,00bfa5da9,"[0.1044221352583703, 0.6287732143334426, 1.616...",4,"[0.00016035326, 0.00011455811, 0.00014174775, ..."
4,00c19f4fa,"[0.05758597966484946, 0.5872041351838008, 1.31...",2,"[3.9621285e-05, 0.0014406826, 0.9893489, 0.008..."
...,...,...,...,...
2994,ffa5c6c35,"[0.2595168461322954, 0.9332214533483751, 1.010...",4,"[0.0014198591, 0.0025184832, 0.042296257, 0.11..."
2995,ffd697903,"[0.23995192611547367, 1.155661854207091, 1.539...",3,"[0.0007647461, 0.015272322, 0.9000884, 0.08191..."
2996,ffe0f06ab,"[0.09952314394914771, 1.4011868358924444, 1.76...",2,"[0.00077634945, 0.008837888, 0.38415042, 0.605..."
2997,ffe5a0271,"[0.10262161330629418, 1.9471575933466534, 1.36...",3,"[0.00037448868, 0.004284962, 0.044694904, 0.23..."


In [36]:
# Blend = element-wise sum of the LightGBM and BERT score vectors, row by row.
# NOTE(review): the displayed LightGBM scores exceed 1 while the BERT scores
# look like probabilities, so this unweighted sum may lean on LightGBM -
# confirm that this weighting is intended.
merged_datasets['blend_pred_score'] = [
    lgb_score + bert_score
    for lgb_score, bert_score in zip(merged_datasets['lgb_pred_score'],
                                     merged_datasets['bert_pred_score'])
]

In [37]:
# Inspect the LightGBM score column (object dtype: each cell holds a whole score vector).
merged_datasets['lgb_pred_score']

0       [0.17953245512077848, 1.5370255010913683, 1.87...
1       [0.31385070416347927, 0.9413531297435047, 1.42...
2       [0.034448279059874576, 0.8524923657157204, 1.6...
3       [0.1044221352583703, 0.6287732143334426, 1.616...
4       [0.05758597966484946, 0.5872041351838008, 1.31...
                              ...                        
2994    [0.2595168461322954, 0.9332214533483751, 1.010...
2995    [0.23995192611547367, 1.155661854207091, 1.539...
2996    [0.09952314394914771, 1.4011868358924444, 1.76...
2997    [0.10262161330629418, 1.9471575933466534, 1.36...
2998    [0.1127265853265893, 1.3740931869018997, 1.803...
Name: lgb_pred_score, Length: 2999, dtype: object

In [38]:
# Hard class prediction per model: index of the largest entry in its score vector.
score_column_by_pred_column = {
    'lgb_pred': 'lgb_pred_score',
    'bert_pred': 'bert_pred_score',
    'blended_pred': 'blend_pred_score',
}
for pred_column, score_column in score_column_by_pred_column.items():
    merged_datasets[pred_column] = [scores.argmax() for scores in merged_datasets[score_column]]

In [39]:
# The three *_pred columns are already computed in the preceding cell, so
# recomputing them here would be redundant. Spot-check them against the true
# labels instead.
merged_datasets[['AdoptionSpeed', 'lgb_pred', 'bert_pred', 'blended_pred']].head()

In [40]:
# LightGBM alone: confusion matrix titled with its quadratic-weighted kappa.
lgb_kappa = cohen_kappa_score(merged_datasets['AdoptionSpeed'],
                              merged_datasets['lgb_pred'],
                              weights='quadratic')
plot_confusion_matrix(merged_datasets['AdoptionSpeed'],
                      merged_datasets['lgb_pred'],
                      title='LGB Model Kappa: ' + str(lgb_kappa))

In [41]:
# BERT alone: confusion matrix titled with its quadratic-weighted kappa.
bert_kappa = cohen_kappa_score(merged_datasets['AdoptionSpeed'],
                               merged_datasets['bert_pred'],
                               weights='quadratic')
plot_confusion_matrix(merged_datasets['AdoptionSpeed'],
                      merged_datasets['bert_pred'],
                      title='Bert Model Kappa: ' + str(bert_kappa))



In [42]:
# Blended scores: confusion matrix titled with the quadratic-weighted kappa.
blended_kappa = cohen_kappa_score(merged_datasets['AdoptionSpeed'],
                                  merged_datasets['blended_pred'],
                                  weights='quadratic')
plot_confusion_matrix(merged_datasets['AdoptionSpeed'],
                      merged_datasets['blended_pred'],
                      title='Blended Model Kappa: ' + str(blended_kappa))
