# Demo Train

This notebook trains a logistic-regression classifier on the precomputed question and answer embeddings, then uses it to map a new question to the matching entry in the Q&A dataset.

In [27]:
# Publish the parent directory of the current working directory as
# PROJECT_ROOT. Presumably the project modules imported in the next cell read
# this variable -- TODO confirm.
import os

projectRoot = os.path.split(os.getcwd())[0]
os.environ["PROJECT_ROOT"] = projectRoot

In [28]:
from prepareutils.Embeddings import embeddings
from prepareutils.Dataset import dataset
from commons.OpenAIClient import openaiClient
from sklearn.linear_model import LogisticRegression

# Load the Q&A dataset and the precomputed embeddings together with their labels.
qaDataset = dataset.loadDataset()
questionEmbeddings, answerEmbeddings, labels = embeddings.loadEmbeddings()

# Combine both embeddings into the training inputs.
# NOTE(review): `+` concatenates when these are Python lists but adds
# element-wise when they are NumPy arrays -- confirm which one
# loadEmbeddings() returns and that this is the intended combination.
X = questionEmbeddings + answerEmbeddings
y = labels

# Fail here, right next to the `+`, instead of later inside fit() when the
# combined samples and the labels do not line up one-to-one.
assert len(X) == len(y), f"{len(X)} samples but {len(y)} labels"

In [29]:
# Fit on every available sample: no hold-out split is made (the earlier
# train_test_split(test_size=0.2, random_state=42, shuffle=True) attempt is
# disabled and train_test_split is not imported above).
classifierOptions = {"solver": "lbfgs", "random_state": 42}
clf = LogisticRegression(**classifierOptions).fit(X, y)

In [30]:
# Mean accuracy on the same X, y the classifier was fitted on, i.e. training
# accuracy rather than a held-out estimate.
trainingAccuracy = clf.score(X, y)
trainingAccuracy

0.9813084112149533

In [31]:
def ask(question):
    """Return the dataset entry the classifier selects for ``question``.

    The question text is embedded with ``openaiClient.generateEmbeddings``,
    the embedding is classified by ``clf``, and the predicted label is used
    as an index into ``qaDataset``.

    Args:
        question: The question text to look up.

    Returns:
        The ``qaDataset`` element stored at the predicted index.
    """
    # The client takes a batch of texts; only one is sent, so keep the first result.
    embeddingBatch = openaiClient.generateEmbeddings([question])
    # predict() expects a batch of samples, so wrap the single embedding in a list.
    predictedLabels = clf.predict([embeddingBatch[0]])
    return qaDataset[predictedLabels.item()]

In [32]:
# Example: informal phrasing; in the recorded output the matched dataset
# question is worded differently (it uses Pelé's full name).
ask("when pele died?")

{'question': 'When did Edson Arantes do Nascimento die?',
 'answer': 'He died on 29 December 2022.'}

In [33]:
# Example: a loosely related question. ask() always returns some dataset
# entry, because the classifier predicts a label for any input.
ask('what is your favorite Pelé memory?')

{'question': "What was Pelé's most memorable goal?",
 'answer': 'He states that his most memorable goal was scored at the Estádio Rua Javari on a Campeonato Paulista match.'}

In [34]:
# Example: a question about Pelé's work before football; the recorded output
# returns the tea-shop entry.
ask('what did Pelé worked before becoming a football player?')

{'question': "What was Pelé's job when he worked in tea shops?",
 'answer': 'He worked as a servant.'}