In [None]:
# Download the spaCy model `nb_core_news_sm` (loaded later with spacy.load);
# the command's standard output is discarded to keep the cell output short.
!python -m spacy download nb_core_news_sm > /dev/null

In [1]:
import pandas as pd
import spacy
from spacy.lang.nb.examples import sentences 
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords
import fasttext
from nltk import NaiveBayesClassifier
from nltk import classify

In [2]:
def filter_split_dataset(metadata_df = None):
    """Keep only Bokmål reviews and separate them by their `split` column.

    :param metadata_df: pd.DataFrame with (at least) the columns `language`
        and `split`. When omitted, `data/metadata.json` (relative to the
        current working directory) is read and transposed.

    :return: 2-tuple `(train, test)` of pd.DataFrames holding the rows whose
        `language` is "nb" and whose `split` is "train" / "test" respectively.
        Rows with any other `split` value end up in neither frame.
    """
    if metadata_df is None:
        metadata_df = pd.read_json("data/metadata.json").T

    # Drop everything that is not tagged as Bokmål before looking at the split.
    bokmaal_only = metadata_df[metadata_df.language == "nb"]

    split_column = bokmaal_only.split
    train_rows = bokmaal_only[split_column == "train"]
    test_rows = bokmaal_only[split_column == "test"]
    return train_rows, test_rows
    

In [3]:
def process_documents(doc, lemmatizer = None, remove_newlines = False):
    """Run a document through the lemmatizer, optionally flattening line breaks.

    :param doc: String with the raw document text.
    :param lemmatizer: Callable Str -> iterable of tokens. When None, the
        spaCy model `nb_core_news_sm` is loaded on every call, which is slow;
        pass an already loaded pipeline when processing many documents.
    :param remove_newlines: When True, every run of line-break characters is
        replaced by a single space before lemmatizing.

    :return: Whatever `lemmatizer(doc)` returns.
    """
    if lemmatizer is None:
        lemmatizer = spacy.load("nb_core_news_sm")

    if remove_newlines:
        # Substitute a space rather than the empty string: deleting the line
        # break would glue the last word of one line to the first word of the
        # next ("god\nfilm" -> "godfilm") and corrupt the tokens.
        doc = re.sub(r'[\r\n]+', ' ', doc)

    return lemmatizer(doc)

In [4]:
def clean_document(doc, stop_words, **process_kwargs):
    """Lemmatize a document and drop its stop words.

    :param doc: Str, the raw document.
    :param stop_words: Collection of Str; tokens whose lemma is in it are dropped.
    :param **process_kwargs: Forwarded unchanged to process_documents.

    :return: [Str], one entry per kept token. A lemma that occurs inside
        `string.punctuation` is kept as-is; every other lemma gets a single
        leading space, so "".join(result) gives readable running text.
    """
    lemmas = (token.lemma_ for token in process_documents(doc, **process_kwargs))
    return [
        lemma if lemma in string.punctuation else f" {lemma}"
        for lemma in lemmas
        if lemma not in stop_words
    ]

In [5]:
def get_documents(metadata_df=None, path="data", processed=False, ret=("rating", "authors"), encoding="utf-8"):
    """Lazily read the text file belonging to each row of the metadata.

    No language filtering happens here; pass an already filtered
    `metadata_df` if some reviews should be excluded.

    :param metadata_df: pd.DataFrame with one row per review and at least the
        columns `split`, `id` and everything named in `ret`. When None,
        `{path}/metadata.json` is read and transposed.
    :param path: Str path to the folder holding metadata.json and one
        sub-folder per split value.
    :param processed: When True, read `<id>p.txt` instead of `<id>.txt`.
    :param ret: Columns from each row to yield alongside the document text.
    :param encoding: Text encoding used to read the files.
        NOTE(review): the default assumes the corpus is UTF-8 - confirm.

    :return: Generator of tuples `(document_text, *row[col] for col in ret)`.
    """
    if metadata_df is None:
        metadata_df = pd.read_json(f"{path}/metadata.json").T

    suffix = 'p' if processed else ''

    for (_, review) in metadata_df.iterrows():
        # File names are the review id, zero-padded to six digits.
        file_path = f"{path}/{review['split']}/{str(review['id']).zfill(6)}{suffix}.txt"
        # Close each file as soon as it is read; the corpus has many files and
        # a generator can stay alive for a long time.
        with open(file_path, "r", encoding=encoding) as handle:
            document = handle.read()
        yield (document, *[review[col] for col in ret])

In [6]:
def make_processed_datasets():
    """Write fastText-style datasets, one cleaned document per line.

    Creates (overwriting) `../data/processed/train.txt` and
    `../data/processed/test.txt`. Each line has the form
    `__label__<low/medium/high> <a document, without linebreaks>`, where the
    label groups the rating: <= 2 is "low", 3-4 is "medium", anything higher
    is "high".

    Reads the notebook-level names `datasets`, `stop_words` and `lemmatizer`.

    :return: None
    """
    import os

    output_dir = "../data/processed"
    # A fresh checkout may not have the output folder yet.
    os.makedirs(output_dir, exist_ok=True)

    for split in ["train", "test"]:
        num_docs = len(datasets[split])
        documents = get_documents(path="../data", metadata_df = datasets[split])
        # A single "w" open both truncates and writes; the context manager
        # closes the file even if a document fails half-way through.
        with open(f"{output_dir}/{split}.txt", "w", encoding="utf-8") as file:
            for i, (doc, rating, _authors) in enumerate(documents, start=1):
                print(f"\rGoing through document {i}/{num_docs}",
                      flush=True,
                      end='')
                clean_doc = "".join(clean_document(doc, stop_words=stop_words, lemmatizer=lemmatizer, remove_newlines=True))
                if rating <= 2:
                    grouped_rating = "low"
                elif rating <= 4:
                    grouped_rating = "medium"
                else:
                    grouped_rating = "high"
                file.write(f"__label__{grouped_rating} {clean_doc}\n")
        # Finish the carriage-return progress line.
        print("")

In [7]:
# Load the review metadata and transpose it so that each row is one review
# (later cells iterate over rows and read per-review columns such as
# `split`, `id`, `language` and `authors`).
metadata_df = pd.read_json(f"../data/metadata.json").T

In [None]:
# Here we collect all authors and all first names.
# This code does not need to be run every time.

metadata_df["num_authors"] = metadata_df.authors.apply(len)

# Sort the de-duplicated names: plain set iteration order for strings changes
# between kernel restarts, which would make the labelling order and the row
# order of the saved authors file differ from run to run.
all_authors = sorted({author for authors in metadata_df.authors for author in authors})

# The first whitespace-separated part of each full name is taken as the first name.
first_names = sorted({full_name.split()[0] for full_name in all_authors})

In [None]:
# Here we manually label first names with a gender, and then put them into a new
# dataframe of authors so we can see which gender each author has.
# This code does not need to be run every time.

genders = {}
i = 0
while i < len(first_names):
    gender = input(f"Sex of `{first_names[i]} ({i}/{len(first_names)})`: ").strip().lower()
    if gender in ["m", "k", "u"]:
        genders[first_names[i]] = gender
        i += 1
    elif gender == "r":
        # `r` steps back one name so the previous answer can be re-entered.
        # Never step below 0: a negative index would silently wrap around and
        # label the *last* first name instead.
        if i > 0:
            i -= 1
            print("Correcting error, type last gender again.")
        else:
            print("Nothing to correct yet.")
    else:
        print("Type either `m` for male, `k` for female, or `u` for unknown/other.")

# One row per author, indexed by (and also containing) the full name.
authors_df = pd.DataFrame(index=all_authors, data={"name": all_authors})

def apply_firstname_gender(full_name):
    """Return the gender code that was entered for the first name of `full_name`."""
    return genders[full_name.split()[0]]

authors_df["gender"] = authors_df.name.apply(apply_firstname_gender)
authors_df.to_csv("../data/authors.csv")
display(authors_df)

In [14]:
# Here we add the gender information to metadata_df.

# Load the author/gender table from ../data/authors.csv, indexed by the
# `name` column so a gender can be looked up by full author name.
authors_df = pd.read_csv("../data/authors.csv", index_col="name")
def set_gender(authors):
    """Collapse the genders of a review's authors into one code.

    Looks every author up in the notebook-level `authors_df` (indexed by
    full name, with a `gender` column).

    :param authors: Sequence of full author names.

    :return: The shared gender code when all authors have the same one,
        otherwise "u". An empty author list also gives "u" instead of
        raising IndexError.
    """
    if len(authors) == 0:
        return "u"

    gender = authors_df.gender[authors[0]]
    # Any disagreement between co-authors makes the review's gender unknown.
    if any(authors_df.gender[author] != gender for author in authors[1:]):
        return "u"
    return gender
            
# One gender code per review, derived from that review's list of authors.
metadata_df["gender"] = metadata_df.authors.apply(set_gender)

In [15]:
# Keep only the Bokmål ("nb") reviews and separate them into train and test metadata.
train_metadata_df, test_metadata_df = filter_split_dataset(metadata_df = metadata_df)
# Index the two frames by split name so later cells can look them up as datasets[split].
datasets = {'train': train_metadata_df, 'test': test_metadata_df}
# Load the spaCy pipeline once and pass it to the processing functions, which
# would otherwise load it again on every call.
lemmatizer = spacy.load("nb_core_news_sm")

In [16]:
# Extract stop words in Bokmål from http://snowball.tartarus.org/algorithms/norwegian/stop.txt
# Lines handled here look like `<word>   | <explanation>`. Lines with nothing
# before the bar are comments, and an explanation ending in `*` marks a word
# that should be left out (the list uses it for words unique to Nynorsk).
stop_words = []
# NOTE(review): assumes the saved copy of the list is UTF-8 - confirm.
with open("../data/stop_words.txt", "r", encoding="utf-8") as stop_words_file:
    for line in stop_words_file:
        # partition() never raises, unlike unpacking split("|") into two
        # names, which fails on lines with no bar or more than one bar.
        stop_word, _, explanation = line.partition("|")
        stop_word = stop_word.strip()
        if not stop_word:
            # Blank line or comment-only line.
            continue
        if explanation.rstrip().endswith("*"):
            # Starred entry; also works when the last line has no newline.
            continue
        stop_words.append(stop_word)
print(stop_words)

['og', 'i', 'jeg', 'det', 'at', 'en', 'et', 'den', 'til', 'er', 'som', 'på', 'de', 'med', 'han', 'av', 'ikke', 'der', 'så', 'var', 'meg', 'seg', 'men', 'ett', 'har', 'om', 'vi', 'min', 'mitt', 'ha', 'hadde', 'hun', 'nå', 'over', 'da', 'ved', 'fra', 'du', 'ut', 'sin', 'dem', 'oss', 'opp', 'man', 'kan', 'hans', 'hvor', 'eller', 'hva', 'skal', 'selv', 'sjøl', 'her', 'alle', 'vil', 'bli', 'ble', 'blitt', 'kunne', 'inn', 'når', 'være', 'kom', 'noen', 'noe', 'ville', 'dere', 'som', 'deres', 'kun', 'ja', 'etter', 'ned', 'skulle', 'denne', 'for', 'deg', 'si', 'sine', 'sitt', 'mot', 'å', 'meget', 'hvorfor', 'dette', 'disse', 'uten', 'hvordan', 'ingen', 'din', 'ditt', 'blir', 'samme', 'hvilken', 'hvilke', 'sånn', 'inni', 'mellom', 'vår', 'hver', 'hvem', 'vors', 'hvis', 'både', 'bare', 'enn', 'fordi', 'før', 'mange', 'også', 'slik', 'vært', 'være', 'begge', 'siden', 'henne', 'hennar', 'hennes']


In [None]:
# This builds two datasets, a training set and a test set, for the fastText model.
# It does not need to be run every time, only the first time on a given machine.
#make_processed_datasets()

In [None]:
# Train a supervised fastText classifier on the processed training file.
# The file must already exist; it is written by make_processed_datasets().
model = fasttext.train_supervised(input="../data/processed/train.txt", epoch=15, lr=1.0, wordNgrams=2, verbose=2)

In [None]:
# Persist the trained model as fasttext_model.bin in the current working directory.
model.save_model("fasttext_model.bin")

In [None]:
# Evaluate the trained model on the processed test file (written by
# make_processed_datasets()); the result is shown as the cell output.
model.test("../data/processed/test.txt")

In [None]:
def predict(doc, model, lemmatizer = None):
    """Clean a text the same way as the training data and predict its label.

    :param doc: Str, raw text to classify.
    :param model: Model with a `predict` method that accepts one string.
    :param lemmatizer: Function Str -> iter(Str,). When None, the default of
        process_documents is used, which loads the spaCy model on each call.

    :return: Whatever `model.predict` returns for the cleaned text.
    """
    # Mirror make_processed_datasets: same lemmatizer, line breaks removed,
    # tokens joined into ONE string. Passing the token list itself would make
    # the model classify every token as a separate document.
    tokens = clean_document(doc, stop_words=stop_words, lemmatizer=lemmatizer, remove_newlines=True)
    return model.predict("".join(tokens))

In [None]:
# Quick manual check on a clearly positive sentence
# ("This was a fantastic film, completely flawless").
predict("Dette var en fantastisk film, helt feilfri", model, lemmatizer=lemmatizer)

In [None]:
# Naive Bayes

def bayes_maker(split="train"):
    """Yield (features, label) pairs for NLTK's NaiveBayesClassifier.

    Reads the notebook-level names `datasets`, `stop_words` and `lemmatizer`.

    :param split: Key into `datasets` selecting which documents to use.
        Defaults to "train"; pass "test" to build a held-out set.

    :return: Generator of 2-tuples `({token: True, ...}, "__label__<rating>")`,
        one per document. The tokens are exactly what clean_document returns,
        and the label uses the raw (ungrouped) rating.
    """
    num_docs = len(datasets[split])
    documents = get_documents(path="../data", metadata_df = datasets[split])
    for i, (doc, rating, _authors) in enumerate(documents, start=1):
        print(f"\rGoing through document {i}/{num_docs}",
              flush=True,
              end='')
        text_list = clean_document(doc, stop_words=stop_words, lemmatizer=lemmatizer, remove_newlines=True)

        # Bag-of-words presence features: each distinct token maps to True.
        yield {token: True for token in text_list}, f"__label__{rating}"

In [None]:
# Materialise the generator into a list so the same (features, label) pairs
# can be reused by later cells; this runs the cleaning step over every
# document yielded by bayes_maker().
bayes_dataset = list(bayes_maker())

In [None]:
# Sanity check: show the container type, the type of one entry, and the first entry itself.
print(type(bayes_dataset), type(bayes_dataset[0]), bayes_dataset[0])

In [None]:
# Fit NLTK's Naive Bayes classifier on the (feature dict, label) pairs built above.
classifier = NaiveBayesClassifier.train(bayes_dataset)

In [None]:
# NOTE: `bayes_dataset` is the same data the classifier was trained on, so
# this is training-set accuracy. It does not estimate performance on unseen
# reviews; evaluate on a held-out set for that.
print("Training-set accuracy is:", classify.accuracy(classifier, bayes_dataset))