# Demo Train

This notebook trains a model that answers questions, using the dataset and embeddings files as training data.

In [1]:
# setup environment
import os
import numpy as np
# Publish the parent of the current working directory as PROJECT_ROOT;
# the model save/load cell further down builds its file path from it.
os.environ["PROJECT_ROOT"] = os.path.split(os.getcwd())[0]

In [2]:
from prepareutils.Embeddings import embeddings
from prepareutils.Dataset import dataset
from commons.OpenAIClient import openaiClient
from sklearn.linear_model import LogisticRegression

# Load the Q&A dataset (kept in qaDataset, which ask() indexes later) and the
# embeddings. loadEmbeddings() is called for its side effect only: its return
# value is not used here.
qaDataset = dataset.loadDataset()
embeddings.loadEmbeddings()

# Build the feature matrix x and label vector y from the loaded embeddings.
# Notes recorded by the author for other noiseMin/noiseMax magnitudes
# (presumably the resulting test scores - TODO confirm):
#   noiseMin, noiseMax (-0.000003) gives 0.974
#   noiseMin, noiseMax (-0.00003) gives 0.978
#   noiseMin, noiseMax (-0.0003) gives 0.978
#   noiseMin, noiseMax (-0.0001) gives 0.  <- value truncated in the original note
x, y = embeddings.getAsXy(numberOfAugmentations=15, noiseMin=-0.00001, noiseMax=0.00001)
# Author's note: X = questionEmbeddings + answerEmbeddings, y = labels.
# NOTE(review): presumably y holds indexes into qaDataset, since ask() uses the
# predicted column index to look up qaDataset entries - confirm in prepareutils.

x, y shapes:  (251, 1536) (251,)
new x, y shapes:  (3951, 1536) (3951,)


In [3]:
# Test which solver is the best
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for scoring; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): x, y were built with numberOfAugmentations=15, so augmented
# variants of the same source sample may land on both sides of this random
# split and inflate the held-out score - TODO confirm.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Candidate solvers and, position by position, the max_iter used for each.
# newton-cg is omitted because the model never converges
solvers = ["lbfgs", "liblinear", "sag", "saga"]
maxIters = [150, 100, 100, 100]

# Best classifier found so far, the solver that produced it, and its score.
clf, clfSolver, clfScore = None, None, 0

# train and return a single clf
def trainAndScore(solver, maxIter):
    """Fit a LogisticRegression with one solver and score it on the held-out split.

    Fits on the module-level x_train / y_train, scores on x_test / y_test and
    prints the solver name together with the score.

    Args:
        solver: passed to LogisticRegression as ``solver``.
        maxIter: passed to LogisticRegression as ``max_iter``.

    Returns:
        Tuple ``(fittedClassifier, score)`` where ``score`` is the value
        returned by ``fittedClassifier.score(x_test, y_test)``.
    """
    estimator = LogisticRegression(max_iter=maxIter, random_state=42, solver=solver)
    fitted = estimator.fit(x_train, y_train)
    # Scores noted by the author for different augmentation counts:
    #   none -> 0.12 | 4 -> 0.81 | 6 -> 0.89 | 12 -> 0.96
    #   15 -> 0.979 and 0.981 (two separate notes) | 20 -> 0.974
    heldOutScore = fitted.score(x_test, y_test)
    print(f"Solver: {solver}, score: {heldOutScore}")
    return fitted, heldOutScore

# finds the best solver
def searchTheBestSolver(solvers, maxIters):
    """Train one classifier per solver and keep the best-scoring one.

    Each solver is paired with the max_iter at the same position in
    ``maxIters`` and trained through trainAndScore(). Whenever a score beats
    the current module-level ``clfScore``, the module-level ``clf``,
    ``clfSolver`` and ``clfScore`` are replaced with the new winner.

    Args:
        solvers: iterable of solver names.
        maxIters: iterable of iteration limits, aligned with ``solvers``
            (zip() stops at the shorter of the two).

    Returns:
        None. The result is published through the module-level ``clf``,
        ``clfSolver`` and ``clfScore``, which must already be defined.
    """
    # Without this declaration the assignments below make all three names
    # local to the function, so the first `currentScore > clfScore`
    # comparison raises UnboundLocalError and the module-level "best"
    # values are never updated.
    global clf, clfSolver, clfScore
    for solver, maxIter in zip(solvers, maxIters):
        currentClf, currentScore = trainAndScore(solver, maxIter)
        if currentScore > clfScore:
            clf = currentClf
            clfSolver = solver
            clfScore = currentScore

# The solver search is switched off because lbfgs always wins; to re-run it:
# searchTheBestSolver(solvers, maxIters)
clfSolver = "lbfgs"
clf, clfScore = trainAndScore(solver=clfSolver, maxIter=150)

print(f"Best solver: {clfSolver}")

Solver: lbfgs, score: 1.0
Best solver: lbfgs


In [21]:
import joblib
import os

# Write the trained classifier under PROJECT_ROOT, then rebind clf to the
# copy read back from that same file.
modelPath = f"{os.environ['PROJECT_ROOT']}/io/generated/model-jupyter.sklearn"
joblib.dump(clf, modelPath)
clf = joblib.load(modelPath)

In [22]:
def ask(question, topN=5, minConfidence=0.08):
    """Print the highest-probability dataset entries for a question.

    The question is embedded with openaiClient, scored with
    ``clf.predict_proba`` and the ``topN`` most probable columns are printed
    with their confidence, topic, up to three sample questions and answer.

    Candidates are ranked by column index (argsort) instead of looking each
    probability value up again with ``list.index``. The value lookup returned
    the first matching column, so tied probabilities resolved to the same
    entry, which was printed twice while the other was never shown.

    Assumes column ``i`` of predict_proba corresponds to ``qaDataset[i]`` -
    TODO confirm against ``clf.classes_``.

    Args:
        question: text of the question to classify.
        topN: number of best candidates to print (default 5).
        minConfidence: a candidate is tagged 'OK' when its probability is
            strictly greater than this value, otherwise 'FAIL' (default 0.08).

    Returns:
        None. All results are printed.
    """
    questionEmbedding = openaiClient.generateEmbeddings([question])[0]
    # one probability per classifier output column
    probas = np.asarray(clf.predict_proba([questionEmbedding])[0])
    # Highest probability first; the stable sort on the negated values keeps
    # tied columns in ascending index order, so every column appears once.
    rankedIndexes = np.argsort(-probas, kind="stable")[:topN].tolist()
    bestProbability = probas.max()
    # ratio between the lowest and the highest probability
    probabilityDiff = probas.min() / bestProbability
    print(f"{question}")
    print("---")
    print(f"probabilityDiff: {probabilityDiff:.2f}")
    print(f"bestProbability: {bestProbability:.2f}")
    for probaAnswerIndex in rankedIndexes:
        probabilityPerIndex = probas[probaAnswerIndex]
        confidenceOk = probabilityPerIndex > minConfidence
        # how far this candidate trails the best one
        leadDiff = bestProbability - probabilityPerIndex
        entry = qaDataset[probaAnswerIndex]
        print("---")
        print(f"confidence: {probabilityPerIndex:.4f} -{leadDiff:.2f} - {'OK' if confidenceOk else 'FAIL'}")
        probaQuestions = entry['questions'][:3]
        print(f"topic: {entry['topic']} - questions: {len(probaQuestions)} {probaQuestions}")
        print(f"answer:     {entry['answer']}")
    print("")
    return None

In [23]:
# This question appears verbatim among the 'number of goals' topic questions
# in the recorded output; that topic ranks first there, but at 0.0623 it stays
# below ask()'s 0.08 confidence threshold and is printed as FAIL.
ask('How many goals did Pelé score?')

How many goals did Pelé score?
---
probabilityDiff: 0.09
bestProbability: 0.06
---
confidence: 0.0623 -0.00 - FAIL
topic: number of goals - questions: 3 ['How many goals did Pelé score?', 'Did Pelé score over 1,000 goals?', 'What was the total number of goals Pelé scored in his career?']
answer:     Pelé scored a total of 1,279 goals in his career, recognized by the Guinness World Record.
---
confidence: 0.0392 -0.02 - FAIL
topic: specific skill - goalscoring record - questions: 3 ['How many goals did Pelé score throughout his career?', "What was Pelé's goalscoring record?", 'How did Pelé achieve the Guinness World Record?']
answer:     Pelé scored 1,279 goals in 1,363 games, which earned him the Guinness World Record for goalscoring.
---
confidence: 0.0359 -0.03 - FAIL
topic: memorable goals - questions: 3 ['What was the most memorable goal of Pelé?', 'When did Pelé score the goal de placa?', 'Against which team did Pelé score the goal de placa?']
answer:     Pelé's most memorable goa

In [24]:
# OK
# Off-topic query: in the recorded run the top match is the 'unknown' topic
# (confidence 0.1630, above ask()'s 0.08 threshold).
# NOTE(review): the "OK" marker presumably flags a manually verified result - confirm.
# NOTE(review): 'wheather' is misspelled; left untouched because it is the
# query text sent for embedding.
print("!!! OK !!!!")
ask('How is the wheather?')

!!! OK !!!!
How is the wheather?
---
probabilityDiff: 0.01
bestProbability: 0.16
---
confidence: 0.1630 -0.00 - OK
topic: unknown - questions: 3 ['Who wrote the Harry Potter series', 'How many planets are there in the solar system?', 'Which continent is Antarctica part of?']
answer:     I'm sorry, I have limited knowledge and just know things about the Rei Pelé.
---
confidence: 0.0471 -0.12 - FAIL
topic: Nigerian Civil War - questions: 2 ['What was the Nigerian Civil War?', 'Did any football game cause a ceasefire during the Nigerian Civil War?']
answer:     The Nigerian Civil War was a conflict from 1967-1970 between Nigerian government forces and the new state of Biafra seeking to secede from Nigeria; among the many events one of the ceasefire agreements was made in 1969 for a 48-hours period in which both sides are willing to watch Pelé play an exhibition game.
---
confidence: 0.0395 -0.12 - FAIL
topic: specific tournament - Campeonato Paulista - questions: 3 ['What is the Campeonat

In [12]:
# OK
# In the recorded run the top match is the 'early life' topic at 0.0443, which
# is below ask()'s 0.08 threshold and therefore printed as FAIL.
# NOTE(review): the "OK" marker presumably refers to the matched topic being
# acceptable rather than to the confidence check - confirm.
print("!!! OK !!!!")
ask('what did Pelé worked before becoming a football player?')

!!! OK !!!!
what did Pelé worked before becoming a football player?
---
probabilityDiff: 0.13
bestProbability: 0.04
---
confidence: 0.0443 -0.00 - FAIL
topic: early life - questions: 3 ['Where did Pelé grow up?', 'Was Pelé born into a wealthy family?', 'How did Pelé make extra money growing up?']
answer:     Pelé grew up in poverty in Bauru and played for several amateur teams in his youth, winning two São Paulo state youth championships and a futsal championship.
---
confidence: 0.0401 -0.00 - FAIL
topic: earnings - questions: 3 ['Was Pelé the highest-paid athlete in the world during his playing days?', 'How much did Pelé earn during his playing days?', 'What did Pelé do after retiring from football?']
answer:     Pelé was for a period the best-paid athlete in the world during his playing days. After retiring in 1977, he became a worldwide ambassador for football, and ventured into acting and commercial endeavors.
---
confidence: 0.0381 -0.01 - FAIL
topic: Futsal - questions: 3 ['What

In [8]:
# OK
# In the recorded run the top match is the 'deathdate' topic at 0.0967, above
# ask()'s 0.08 confidence threshold.
print("!!! OK !!!!")
ask("when pele died?")

!!! OK !!!!
when pele died?
---
probabilityDiff: 0.06
bestProbability: 0.10
---
confidence: 0.0967 -0.00 - OK
topic: deathdate - questions: 3 ['When did Pelé pass away?', "What was the date of Pelé's death?", 'Did Pelé die? When did it happen?']
answer:     Pelé passed away on December 29, 2022.
---
confidence: 0.0343 -0.06 - FAIL
topic: unknown - questions: 3 ['Who wrote the Harry Potter series', 'How many planets are there in the solar system?', 'Which continent is Antarctica part of?']
answer:     I'm sorry, I have limited knowledge and just know things about the Rei Pelé.
---
confidence: 0.0270 -0.07 - FAIL
topic: Early career - questions: 3 ['When did Pelé become a top scorer in the league?', 'At what age did Pelé become a top scorer in the league?', 'At what age did Pelé get called up to the Brazil national team?']
answer:     Pelé became the top scorer in the league at the age of 16 and ten months after signing professionally, he was called up to the Brazil national team. Wealth

In [9]:
# OK
# In the recorded run the top match is the 'memorable goals' topic at 0.0565,
# below ask()'s 0.08 threshold (printed as FAIL); 'unknown' ranks second.
print("!!! OK !!!!")
ask('what is your favorite Pelé memory?')

!!! OK !!!!
what is your favorite Pelé memory?
---
probabilityDiff: 0.12
bestProbability: 0.06
---
confidence: 0.0565 -0.00 - FAIL
topic: memorable goals - questions: 3 ['What was the most memorable goal of Pelé?', 'When did Pelé score the goal de placa?', 'Against which team did Pelé score the goal de placa?']
answer:     Pelé's most memorable goal was scored against São Paulo rival Clube Atlético Juventus on 2 August 1959, but it was not recorded, so a computer animation was later made of it. He also scored the goal de placa against Fluminense at the Maracanã in March 1961, which is considered the most beautiful in the history of Maracanã, and a plaque was dedicated to it.
---
confidence: 0.0336 -0.02 - FAIL
topic: unknown - questions: 3 ['Who wrote the Harry Potter series', 'How many planets are there in the solar system?', 'Which continent is Antarctica part of?']
answer:     I'm sorry, I have limited knowledge and just know things about the Rei Pelé.
---
confidence: 0.0281 -0.03 -

In [10]:
# OK
# In the recorded run the top match is the 'number of goals' topic at 0.0662,
# below ask()'s 0.08 threshold (printed as FAIL).
print("!!! OK !!!!")
ask('How many goals did Pelé score in his career?')

!!! OK !!!!
How many goals did Pelé score in his career?
---
probabilityDiff: 0.09
bestProbability: 0.07
---
confidence: 0.0662 -0.00 - FAIL
topic: number of goals - questions: 3 ['How many goals did Pelé score?', 'Did Pelé score over 1,000 goals?', 'What was the total number of goals Pelé scored in his career?']
answer:     Pelé scored a total of 1,279 goals in his career, recognized by the Guinness World Record.
---
confidence: 0.0414 -0.02 - FAIL
topic: specific skill - goalscoring record - questions: 3 ['How many goals did Pelé score throughout his career?', "What was Pelé's goalscoring record?", 'How did Pelé achieve the Guinness World Record?']
answer:     Pelé scored 1,279 goals in 1,363 games, which earned him the Guinness World Record for goalscoring.
---
confidence: 0.0305 -0.04 - FAIL
topic: retirement and records - questions: 3 ['When did Pelé retire from professional soccer?', 'How many goals did Pelé score in his professional career?', 'What record did Pelé hold in the Gu