In [None]:
import os
import re
import random
import pandas as pd
from collections import Counter
from sklearn.metrics import f1_score

In [None]:
from typing import List, Tuple

In [None]:
from {{package_name}} import utils
from {{package_name}}.models_training import utils_models
from {{package_name}}.models_training.model_tfidf_svm import ModelTfidfSvm 

In [None]:
def text_to_sentence(text: str, min_sentence_size: int, min_sentence_word: int) -> List[str]:
    '''Splits a text into sentences and keeps only the sufficiently long ones.

    Every run of whitespace is collapsed to a single space and the 'strong'
    punctuation marks (!, ? and ;) are treated as periods. The text is then cut
    on periods; each candidate is stripped of its surrounding whitespace before
    the two length filters are applied.

    Args:
        text (str) : The text to cut in sentences
        min_sentence_size (int) : The minimal number of characters in a sentence for it to be considered a sentence
        min_sentence_word (int) : The minimal number of words in a sentence for it to be considered a sentence
    Returns:
        list : A list of sentences, without their final punctuation mark
    '''
    # Collapse every run of whitespace (spaces, tabs, newlines) into one space
    text = re.sub(r'\s+', ' ', text)
    # Changes all 'strong' punctuations to a period
    text = re.sub(r'[!?;]', '.', text)
    list_sentences = []
    for sentence in text.split('.'):
        # Strip first: the space that follows a period must count neither as a
        # character nor (through an empty token) as a word
        sentence = sentence.strip()
        if len(sentence) < min_sentence_size:
            continue
        # split() without argument never yields empty tokens
        if len(sentence.split()) < min_sentence_word:
            continue
        list_sentences.append(sentence)
    return list_sentences

In [None]:
# Load the raw texts from the project's data directory
data_path = utils.get_data_path()
texts_csv_path = os.path.join(data_path, 'texts.csv')
# The file is read with '|' as the column separator
df_texts = pd.read_csv(texts_csv_path, sep='|')

In [None]:
# Thresholds passed to text_to_sentence: minimal number of characters and
# minimal number of words for a fragment to be kept as a sentence
min_sentence_size, min_sentence_word = 50, 10

In [None]:
# Build the sentence-level dataset: one (sentence, author, book) record per sentence
list_phrases = []
for text, author, book in zip(df_texts['text'], df_texts['author'], df_texts['book']):
    sentences = text_to_sentence(text, min_sentence_size, min_sentence_word)
    # extend() grows the list in place; rebuilding it with `+` on every row
    # would copy all the records accumulated so far
    list_phrases.extend((sentence, author, book) for sentence in sentences)
df_phrases = pd.DataFrame(list_phrases, columns=['sentence', 'author', 'book'])
set_author = set(df_texts['author'])

# Share of the sentences written by each author
df_phrases.value_counts('author')/len(df_phrases)

In [None]:
# Shuffle the dataset. A fixed seed and a dedicated generator make the split
# reproducible without touching the global `random` state.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)
df_to_split = df_phrases.sample(frac=1, random_state=SPLIT_SEED)

# For each author, draw one book for the validation set and a different one
# for the test set
dict_books_to_valid = {}
dict_books_to_test = {}
# Sorted iteration: set ordering of strings changes between interpreter runs,
# which would defeat the seed
for author in sorted(set_author):
    # random.sample() no longer accepts a set (deprecated in Python 3.9,
    # TypeError since 3.11), hence the sorted list
    list_books = sorted(set(df_texts[df_texts['author']==author]['book']))
    if len(list_books) < 2:
        raise ValueError(f"Author {author!r} needs at least 2 books (one for validation, one for test), got {len(list_books)}")
    # Two distinct books drawn without replacement
    dict_books_to_valid[author], dict_books_to_test[author] = split_rng.sample(list_books, k=2)

list_books_valid = list(dict_books_to_valid.values())
list_books_test = list(dict_books_to_test.values())
df_valid = df_to_split[df_to_split['book'].isin(list_books_valid)].copy()
df_test = df_to_split[df_to_split['book'].isin(list_books_test)].copy()

# Everything else goes to the training set. The explicit copy() makes it an
# independent frame, so adding columns later does not raise SettingWithCopyWarning
df_train = df_to_split[~df_to_split['book'].isin(list_books_valid + list_books_test)].copy()

# Save the datasets
utils.to_csv(df_train, os.path.join(data_path, 'dataset_texts_train.csv'))
utils.to_csv(df_valid, os.path.join(data_path, 'dataset_texts_valid.csv'))
utils.to_csv(df_test, os.path.join(data_path, 'dataset_texts_test.csv'))

# Share of the training sentences written by each author
df_train.value_counts('author')/len(df_train)

In [None]:
# Light tuning: try every combination of TF-IDF n-gram range and of the `C`
# value passed to the model through svc_params
dict_result = {}
param_grid = [(ngram_range, C) for ngram_range in [(1, 1), (1, 2)] for C in [0.1, 0.5, 1, 2]]
for count, (ngram_range, C) in enumerate(param_grid):
    model = ModelTfidfSvm(tfidf_params={'ngram_range': ngram_range}, svc_params={'C': C})
    model.fit(df_train['sentence'], df_train['author'], x_valid=df_valid['sentence'], y_valid=df_valid['author'])
    # Predictions stay in local variables: writing a 'pred' column on every
    # iteration would mutate frames shared with the other cells
    pred_train = model.predict(df_train['sentence'])
    pred_valid = model.predict(df_valid['sentence'])
    score_train = f1_score(df_train['author'], pred_train, average='macro')
    score_val = f1_score(df_valid['author'], pred_valid, average='macro')
    dict_tmp = {'score_train':round(score_train, 5), 'score_val':round(score_val, 5), 'ngram_range':ngram_range, 'C':C}
    # Results are indexed by the position of the combination in the grid
    dict_result[count] = dict_tmp.copy()
    print(dict_tmp)

In [None]:
# Train the model with the configuration retained as the best one
# (unigrams + bigrams, C=1)
best_tfidf_params = {'ngram_range': (1, 2)}
best_svc_params = {'C':1}
model = ModelTfidfSvm(tfidf_params=best_tfidf_params, svc_params=best_svc_params)
model.fit(
    df_train['sentence'],
    df_train['author'],
    x_valid=df_valid['sentence'],
    y_valid=df_valid['author'],
)

In [None]:
# Predict the author of every sentence of the test set
test_predictions = model.predict(df_test['sentence'])
df_test['pred'] = test_predictions
# Macro-averaged F1 score at sentence level
score = f1_score(df_test['author'], df_test['pred'], average='macro')
print(score)

In [None]:
def predict_author(text: str, model, min_sentence_size: int, min_sentence_word: int, perc_sample: float=1.0) -> Tuple[str, List[Tuple[str, float]], int]:
    '''Predicts the author of a text by majority vote over its sentences.

    The text is cut in sentences, a random subset of them is kept, the model
    predicts an author for each kept sentence and the most frequent prediction
    wins.

    Args:
        text (str) : The text whose author we want to predict
        model : The fitted model; its predict method is called on the list of sentences
        min_sentence_size (int) : The minimal number of characters in a sentence for it to be considered a sentence
        min_sentence_word (int) : The minimal number of words in a sentence for it to be considered a sentence
        perc_sample (float): The proportion (between 0 and 1) of the sentences of the text we consider
    Raises:
        ValueError : If no sentence is left to predict on (text too short or perc_sample too small),
            or if perc_sample is greater than 1 (raised by random.sample)
    Returns:
        tuple :
            str : The predicted author
            list : (author, proportion of sentences attributed to this author) pairs, sorted by decreasing proportion
            int : The number of sentences used for the prediction
    '''
    # Cut the text in sentences
    sentences = text_to_sentence(text, min_sentence_size, min_sentence_word)
    sentences = random.sample(sentences, k=int(perc_sample*len(sentences)))
    # Fail early with a clear message instead of an obscure error from the
    # model or from max() on an empty counter
    if not sentences:
        raise ValueError("No sentence to predict on: the text is too short for the given thresholds or perc_sample is too small")
    # For each sentence, predict an author. Gives the number of sentences predicted for each author
    counter = Counter(list(model.predict(sentences)))
    # The author with the highest number of sentences
    author = max(counter, key=counter.get)
    # Calculates a proportion of sentences instead of raw numbers
    count_sentences = [(key, round(value/len(sentences), 3)) for key, value in counter.items()]
    count_sentences = sorted(count_sentences, key=lambda x:x[1], reverse=True)
    return author, count_sentences, len(sentences)

In [None]:
dict_result = {}
# For each author, predict the author of the whole book held out for the test set
for author, book in dict_books_to_test.items():
    # Retrieve the text; matching on the author as well avoids picking the
    # wrong row if two authors share a book title
    row = df_texts[(df_texts['author']==author) & (df_texts['book']==book)]
    text = row.iloc[0]['text']
    length = len(text)
    # Predict the author
    prediction, counter, nb_sentences = predict_author(text, model, min_sentence_size, min_sentence_word, perc_sample=1.0)
    # Store the result
    dict_result[author] = {'prediction': prediction, 'counter': counter, 'length': length, 'nb_sentences':nb_sentences}
# Check whether the prediction is right for every author (single boolean)
print(all(key==value['prediction'] for key, value in dict_result.items()))
# Show the detailed results
dict_result