In [None]:
import ast

import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.test.gensim_fixt import setup_module
setup_module()

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (precision_score, recall_score,
    precision_recall_curve)

In [None]:
# Load the preprocessed dataset; fields in this CSV are separated by ';'.
dataset = pd.read_csv(
    filepath_or_buffer='../data/preprocessed_dataset.csv',
    sep=';',
)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
dataset.head(n=5)

In [None]:
def _as_token_list(raw_tokens):
    """Return ``raw_tokens`` as a list of word strings.

    ``dataset`` is loaded with ``pd.read_csv``, so a token column comes back
    as text rather than as Python lists. Handing that text straight to
    ``TaggedDocument`` makes gensim iterate over it character by character,
    which does not match the word-level tokens used at inference time.

    Args:
        raw_tokens: A list/tuple of tokens, or a string holding either a
            Python-literal list (e.g. ``"['a', 'b']"``) or
            whitespace-separated tokens.

    Returns:
        list[str]: The individual tokens.
    """
    if not isinstance(raw_tokens, str):
        return [str(token) for token in raw_tokens]
    try:
        parsed = ast.literal_eval(raw_tokens)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(token) for token in parsed]
    # Not a serialized list: fall back to whitespace-separated tokens.
    return raw_tokens.split()


# One training document per distinct, non-missing ``clean_tokens`` value;
# the tag is the document's position in that de-duplicated list.
unique_token_docs = dataset['clean_tokens'].dropna().drop_duplicates().to_list()
tagged_data = [
    TaggedDocument(_as_token_list(tokens), [i])
    for i, tokens in enumerate(unique_token_docs)
]

# ``seed`` pins the model's random number generator. NOTE(review): gensim
# only guarantees fully deterministic training with ``workers=1`` (and a
# fixed PYTHONHASHSEED); that is left out here because it slows training.
model = Doc2Vec(
    tagged_data,
    vector_size=20,
    window=2,
    min_count=1,
    epochs=100,
    seed=42,
)

In [None]:
def _infer_text_vector(text):
    """Tokenize ``text`` with NLTK and infer its vector from ``model``."""
    return model.infer_vector(word_tokenize(text))


# Embed every document first, then every query, with the same Doc2Vec model.
documents_features = dataset['clean_text'].apply(_infer_text_vector)
query_features = dataset['query'].apply(_infer_text_vector)

In [None]:
# Concatenate each document vector with the query vector of the same row
# into one flat list (document components first, then query components).
pair_vectors = documents_features.combine(
    query_features,
    func=lambda doc_vec, query_vec: doc_vec.tolist() + query_vec.tolist(),
)

# Build the feature matrix in a single constructor call. The previous
# ``apply(pd.Series)`` created one Series object per row, which is slow on
# large frames, and its result was wrapped in a redundant ``pd.DataFrame``.
# The row index is preserved so ``features`` stays aligned with ``label``.
features = pd.DataFrame(pair_vectors.tolist(), index=pair_vectors.index)

# Target column for the ranking classifier.
label = dataset['rank']

In [None]:
# Hold out 40% of the rows for evaluation. ``random_state`` fixes the shuffle
# so the split -- and therefore the hand-picked probability threshold and the
# reported precision/recall -- is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    train_size=0.6,
    random_state=42,
)

In [None]:
# Elastic-net regularized logistic regression; 'saga' is the solver used here
# with the 'elasticnet' penalty. The solver shuffles the training data, so
# ``random_state`` is set to make the fitted coefficients reproducible.
ranking_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.4,
    random_state=42,
)
ranking_model = ranking_model.fit(X_train, y_train)

In [None]:
# Every predicted probability for class 1 falls below 0.5, so the default
# decision rule would never predict that class. A custom cut-off, chosen by
# looking at the predicted probabilities, is applied instead.
DECISION_THRESHOLD = 0.296

y_proba = ranking_model.predict_proba(X_test)
# Column 1 of ``predict_proba`` is taken as the class-1 probability.
y_proba_threshold = y_proba[:, 1] >= DECISION_THRESHOLD

In [None]:
# Class balance of the held-out labels, most frequent class first.
y_test.value_counts(sort=True, ascending=False)

In [None]:
# Score the thresholded predictions against the held-out labels.
test_precision = precision_score(y_true=y_test, y_pred=y_proba_threshold)
test_recall = recall_score(y_true=y_test, y_pred=y_proba_threshold)

print(
f'''
Precision: {test_precision}
Recall: {test_recall}
'''
)