This notebook is for the genre prediction task.

# Importation

In [7]:
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

In [8]:
#TODO change here the path to your data folder 
data_path='./data'

In [9]:
#Importing the X and y
with open(data_path+'/X.pkl', 'rb') as f:
    X=pickle.load(f)

with open(data_path+'/y.pkl', 'rb') as f:
    y=pickle.load(f)

In [10]:
#Building our train test and validation set
X_pretrain, X_test, y_pretrain, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_pretrain, y_pretrain, test_size=0.2, random_state=42)

# Doc2Vec

In [11]:
# tagged data: each doc number is a label
tagged_data = [TaggedDocument(words=doc,
                              tags=[str(i)]) for i, doc in enumerate(X_train)]
# train the Doc2vec model
model = Doc2Vec(vector_size=20,
                min_count=2, epochs=50)
model.build_vocab(tagged_data)
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

KeyboardInterrupt: 

In [None]:
# get the document vectors
X_train_vec = [model.infer_vector(doc) for doc in tqdm(X_train)]
X_valid_vec = [model.infer_vector(doc) for doc in tqdm(X_valid)]
X_test_vec = [model.infer_vector(doc) for doc in tqdm(X_test)]

In [None]:
doc2Vec = Doc2Vec.pretrained("doc2vec_gigaword_wiki_300", "en")\
.setInputCols("cleanedToken")\
.setOutputCol("sentence_embeddings")

# Model

In [None]:
def relative_difference(pred, true):
    """TODO change to confusion matrix + plot heatmap with seaborn
    Our performance measure is the average of the difference per document 
    between the predicted labels matrix and the true labels matrix, divided by 
    the number of genres for this document """
    return np.mean((np.sum(pred-true, axis=1)/np.sum(true, axis=1)))

### Without doc2vec pre-processing

In [7]:
def dummy(doc):
    return doc

def preprocess_train_valid(X_train, X_valid):
    vectorizer = CountVectorizer(
            tokenizer=dummy,
            preprocessor=dummy,
        ) 
    X_train_new=vectorizer.fit_transform(X_train)
    X_valid_new=vectorizer.transform(X_valid)
    return (X_train_new, X_valid_new)

In [8]:
X_train, X_valid=preprocess_train_valid(X_train, X_valid)



In [9]:
# Binary relevance multi-label classifier with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(X_train, y_train)
# predict
predictions = classifier.predict(X_valid)
# accuracy
print("Accuracy = ", accuracy_score(y_valid,predictions))

### With doc2vec pre-processing

In [53]:
# Binary relevance multi-label classifier with a gaussian naive bayes base classifier
br = BinaryRelevance(GaussianNB())
# train
br.fit(X_train_vec, y_train)
# predict
pred_br = br.predict(X_valid_vec)

In [71]:
# initialize classifier chains multi-label classifier
cc = ClassifierChain(LogisticRegression())
# Training logistic regression model on train data
cc.fit(X_train_vec, y_train.iloc[:, :-1])
# predict
pred_cc = cc.predict(X_test_vec)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [84]:
#ML-KNN which is derived from the traditional K-nearest neighbor (KNN) algorithm
mlk = MLkNN()
# train
mlk.fit(lil_matrix(X_train_vec).toarray(), lil_matrix(y_train).toarray())
# predict
pred_mlk = mlk.predict(lil_matrix(X_test_vec).toarray())

TypeError: NearestNeighbors.__init__() takes 1 positional argument but 2 were given

In [86]:
#Accuracies on validation set
print("Relative Difference for Binary Relevance = ", accuracy_measure(pred_br.A, y_test.to_numpy()))
print("Relative Difference for Classifier Chain = ", accuracy_measure(np.append(pred_cc.A, [[0]]*201, axis=1), y_test.to_numpy()))

Relative Difference for Binary Relevance =  2.693458110435722
Relative Difference for Classifier Chain =  -0.018697720190257512
