In [None]:
!python -m spacy download nb_core_news_sm > /dev/null

In [185]:
import os
import pandas as pd
import spacy
from spacy.lang.nb.examples import sentences 
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords
import fasttext
from nltk import NaiveBayesClassifier
from nltk import classify
import random
import tqdm
from itertools import groupby

In [72]:
def filter_split_dataset(metadata_df = None):
    """Filter out nynorsk, and split into train and test datasets.
    
    :param metadata_df: pd.DataFrame object with metadata.
    
    :return: 2-tuple of train and test pd.DataFrames
    """
    if metadata_df is None:
        metadata_df = pd.read_json(f"data/metadata.json").T
        
    bokmål_mask = metadata_df.language == "nb"
    
    metadata_df = metadata_df[bokmål_mask]
    
    train_mask = metadata_df.split == "train"
    test_mask = metadata_df.split == "test"
        
    return metadata_df[train_mask], metadata_df[test_mask]
    

In [172]:
def process_documents(doc, lemmatizer=None, remove_newlines=False):
    """Tokenize and lemmatize.
    
    :param document: String.
    :param remove_newlines: Bool, whether to remove newline characters.
    :param lemmatizer: Function Str -> iter(Str,)
    
    :return: [Spacy Doc? object,]
    """
    
    assert isinstance(doc, str), "doc has to be of type str, {type(doc)} is not supported."
    
    if lemmatizer is None:
        lemmatizer = spacy.load("nb_core_news_sm")
    
    if remove_newlines:
        doc = re.sub('\n', '', doc)
    
    return lemmatizer(doc)

In [173]:
def clean_document(doc, stop_words, **process_kwargs):
    """Cleans up document, normalising words and removing stop_words.
    
    :param doc: [Str]
    :param stop_words: [Str] to exclude
    :param **process_kwargs: Passed to process_documents
    
    :return: [[Str]], list of tokens for each doc
    """
    full_list = []
    for token in process_documents(doc, **process_kwargs):
        if token.lemma_ not in stop_words:
            if token.lemma_ in string.punctuation:
                full_list.append(token.lemma_)
            else:
                full_list.append(f" {token.lemma_}")
    
    return full_list

In [75]:
def get_documents(metadata_df=None, path="data", processed=False, ret=["rating", "authors"]):
    """Get documents of a specific type.
    
    Filter out reviews in nynorsk.
    
    :param path: Str path to folder with test and train folders.
    :param dataset: Determines which type to look for. Either `train` or `test`.
    :param processed: Whether to look for already processed data or not.
    :param ret: Columns from each document to return alongside the document itself. [Rating and authors]
    
    :return: iter(Str,) of documents
    """
    
    if metadata_df is None:
        metadata_df = pd.read_json(f"{path}/metadata.json").T
    
    full_path = f"{path}/%s/%s{'p' if processed else ''}.txt"
    
    for (_, review) in metadata_df.iterrows():
        document = open(f"{path}/{review['split']}/{str(review['id']).zfill(6)}{'p' if processed else ''}.txt", "r").read()
        yield document, *[review[col] for col in ret]

In [93]:
def make_processed_datasets(debug=False, **datasets):
    """Make fasttext-style dataset, with each line being a text.
    
    :param debug: Set to true to only process first doc.
    :param **datasets: pd.DataFrames of the metadata format
    
    Create files `../data/processed/<kwarg_key>.txt`, with each line
    being on the form `__label__<1-6> <a document, without linebreaks>.
    
    :return: None
    """
    for split, dataset in datasets.items():
        num_docs = len(dataset)
        
        text_list = []
        file = open(f'../data/processed/{split}.txt', 'a')
        for doc, rating in tqdm.tqdm(get_documents(path="../data", metadata_df = dataset, ret=["rating"]), total=len(dataset)):
            
            clean_str = "".join(clean_document(doc, stop_words=stop_words, lemmatizer=lemmatizer, remove_newlines=True))
            text_list.append(f"__label__{rating} {clean_str}\n")
            
            if debug:
                break
                    
        random.shuffle(text_list)
        
        with open(f"../data/processed/{split}.txt", "w") as file:
            file.write("\n".join(text_list))
        

In [77]:
metadata_df = pd.read_json(f"../data/metadata.json").T

In [78]:
def set_gender(authors, authors_df):
    """Gives gender of a list of authors based on labeled names in a df.
    
    :param authors: [Str,] where each element is an author
    :param authors_df: pd.DataFrame with names as index and a column with gender info.
    
    :return: `m` for male, `k` for female, or `u` for unknown or ambigous.
    """
    if len(authors) == 1:
        return authors_df.gender[authors[0]]
    else:
        gender = authors_df.gender[authors[0]]
        for author in authors[1:]:
            if gender != authors_df.gender[author]:
                return "u"
        return gender

def label_author_gender(metadata_df):
    """Finds all author names, and prompts user to label their gender.
    
    Stores author genders in `../data/authors.csv`.
    
    :param metadata_df: pd.DataFrame
    
    :return: metadata_df with gender column.
    """
    
    all_authors = []
    for authors in metadata_df.authors:
        all_authors.extend(authors)
    all_authors = list(set(all_authors))

    first_names = list(set([full_name.split()[0] for full_name in all_authors]))

    genders = {}
    i = 0
    while i < len(first_names):
        gender = input(f"Sex of `{first_names[i]} ({i}/{len(first_names)})`: ")
        if gender.lower() in ["m", "k", "u"]:
            genders[first_names[i]] = gender.lower()
            i += 1
        elif gender.lower() == "r":
            i -= 1
            print("Correcting error, type last gender again.")
        else:
            print("Type either `m` for male, `k` for female, or `u` for unknown/other.")

    authors_df = pd.DataFrame(index=all_authors, data={"name": all_authors})

    def apply_firstname_gender(full_name):
        return genders[full_name.split()[0]]

    authors_df["gender"] = authors_df.name.apply(apply_firstname_gender)
    authors_df.to_csv("../data/authors.csv")

    metadata_df["gender"] = metadata_df.authors.apply(lambda authors: set_gender(authors, authors_df))
    
    return metadata_df

if os.path.exists("../data/authors.csv") or "gender" in metadata_df.columns:
    overwrite = input("Vil du overskrive dataen om kjønnet til forfattere? [Y/n] ").lower()
    
    if overwrite in ["y", "yes"]:
        metadata_df = label_author_gender(metadata_df)
    
    if not "gender" in metadata_df.columns:
        authors_df = pd.read_csv("../data/authors.csv", index_col="name")
        metadata_df["gender"] = metadata_df.authors.apply(lambda authors: set_gender(authors, authors_df))
else:
    metadata_df = label_author_gender(metadata_df)

Vil du overskrive dataen om kjønnet til forfattere? [Y/n]  n


In [79]:
train_metadata_df, test_metadata_df = filter_split_dataset(metadata_df = metadata_df)
datasets = {'train': train_metadata_df, 'test': test_metadata_df}
lemmatizer = spacy.load("nb_core_news_sm")

In [80]:
# Extract stop words in bokmål from http://snowball.tartarus.org/algorithms/norwegian/stop.txt
stop_words = []
with open("../data/stop_words.txt", "r") as stop_words_file:
    for line in stop_words_file:
        if len(line) >= 2 and line[2] != "|":
            stop_word, explanation, = line.split("|")
            if len(stop_word) > 1 and explanation[-2] != "*":
                stop_words.append(stop_word.strip())
print(stop_words)

['og', 'i', 'jeg', 'det', 'at', 'en', 'et', 'den', 'til', 'er', 'som', 'på', 'de', 'med', 'han', 'av', 'ikke', 'der', 'så', 'var', 'meg', 'seg', 'men', 'ett', 'har', 'om', 'vi', 'min', 'mitt', 'ha', 'hadde', 'hun', 'nå', 'over', 'da', 'ved', 'fra', 'du', 'ut', 'sin', 'dem', 'oss', 'opp', 'man', 'kan', 'hans', 'hvor', 'eller', 'hva', 'skal', 'selv', 'sjøl', 'her', 'alle', 'vil', 'bli', 'ble', 'blitt', 'kunne', 'inn', 'når', 'være', 'kom', 'noen', 'noe', 'ville', 'dere', 'som', 'deres', 'kun', 'ja', 'etter', 'ned', 'skulle', 'denne', 'for', 'deg', 'si', 'sine', 'sitt', 'mot', 'å', 'meget', 'hvorfor', 'dette', 'disse', 'uten', 'hvordan', 'ingen', 'din', 'ditt', 'blir', 'samme', 'hvilken', 'hvilke', 'sånn', 'inni', 'mellom', 'vår', 'hver', 'hvem', 'vors', 'hvis', 'både', 'bare', 'enn', 'fordi', 'før', 'mange', 'også', 'slik', 'vært', 'være', 'begge', 'siden', 'henne', 'hennar', 'hennes']


In [94]:
# Dette lager to datasett, et treningssett og et testsett, for fasttext-modellen
# Vi trenger ikke kjøre dette hver gang, bare første gang vi bruker det på en spesifikk datamaskin
if any (os.path.exists(f"../data/processed/{split}.txt") for split in ["train", "test"]):
    overwrite = input(f"Vil du overskrive de prosesserte datasettetene ditt? [Y/n] ").lower()
    if overwrite in ["y", "yes"]:
        make_processed_datasets(**datasets)
else:
    make_processed_datasets(**datasets)

Vil du overskrive de prosesserte datasettetene ditt? [Y/n]  y


100%|██████████| 34478/34478 [27:03<00:00, 21.24it/s]
100%|██████████| 4281/4281 [03:54<00:00, 18.26it/s]


In [101]:
model = fasttext.train_supervised(input="../data/processed/train.txt", epoch=30, lr=1.0, wordNgrams=3, verbose=2)

In [127]:
model.save_model("fasttext_model.bin")

In [102]:
model.test("../data/processed/test.txt")

(4281, 0.5706610604998832, 0.5706610604998832)

In [122]:
def predict(doc, model, lemmatizer = None):
    """Process text, and use the model to predict a label.
    
    :param doc: Str
    :param model: Model with predict method.
    :lemmatizer: Function Str -> iter(Str,)
    """
    if lemmatizer is None:
        lemmatizer = spacy.load("nb_core_news_sm")
        
    return model.predict("".join(clean_document(doc, stop_words=stop_words)))

In [124]:
predict("Dette var en fantastisk film", model, lemmatizer=lemmatizer)

(('__label__1',), array([0.99594736]))

## Gjøreliste:

For å finne mønster vil vi sjekke de enkeltordene, og gruppene på to og tre ord, hvor det er størst forskjell på bruken hos kvinner og menn, og hvor ordene totalt er brukt nok til at vi kan tro på at det er sannsynlig at dette er et reelt mønster.

Mer konkret, må vi altså implementere følgende:  
~~1) En tokenizer og wordnet-greie som lager tokens.~~  
2) En funksjon som henter ut grupper på 1, 2 og 3 ord, sortert etter frekvens.
3) Hente ut de ordene eller ordgruppene som er brukt mer enn for eksempel 50 ganger totalt.  
4) Dele datasettet inn i menn og kvinner, og gjøre ei vurdering på hvor sikker vi her må være på kjønn (0.9 ser bra ut)  
5) Sjekke hvor ofte hver av ordgruppene forekommer hos kvinner og menn  
6) Sortere ordgruppene etter den betingede sannsynligheten for at noen er kvinne gitt at de har brukt denne ordgruppen  
  
7) Manuelt søke etter ordgrupper som brukes mye av menn eller kvinner, men som har et synonym hos det andre kjønnet  
8) Manuelt konstruere setninger som bruker disse ordene, og se om vår sentimentalgoritme gir disse rent kjønnede ordene forskjellig positivitets-verdi  

9) Sammenligne den relative frekvensen av ordgruppene hos menn, kvinner, og positive og negative sentimenter

Sjekke antall ord per setning, og stavelser per ord

Kontrollere for kjønn i forskjellige sjangre

Fjerne ordgrupper som brukes mye, men av veldig få?

Her ser vi altså ikke på mer avanserte mønster, som om setningsoppbyggingen er forskjellig.

In [281]:
def group_words(length, **sentence_sets):
    """Groups words in lists of length == length
    
    :param length: Int. Number of words in each string to return.
    :param **sentence_sets: [Str], where each string is a sentence.
    
    :return: Dictionary with all unique groups, string : int number of occurences.
    """
    word_groups = {}
    for split, sentence_set in sentence_sets.items():
        #print(split, sentence_set[:10])
        word_groups[split] = []
        for sentence in sentence_set:
            for i in range(len(sentence)):
                word_groups[split].append(" ".join([sentence[i+ii] if i + ii < len(sentence) else "" for ii in range(length)]))
        word_groups[split].sort()
        #print(type(word_groups[split]))
        word_groups[split] = {key: len(list(group)) for key, group in groupby(word_groups[split])}
        word_groups[split] = {k: v for k, v in sorted(word_groups[split].items(), key=lambda item: item[1], reverse=True)}
        #break
    return word_groups

In [231]:
def sentencize(lemmatizer=None, **datasets):
    """Take in datasets, and return all text, split on sentences.
    
    :param **datsets: pd.DataFrame metadata-style object.
    
    :return: Dict where each dataset has a list of strings, each being a sentence.
    """
    
    if lemmatizer is None:
        lemmatizer = spacy.load("nb_core_news_sm")
    
    ret = {}
    for split, dataset in datasets.items():
        ret[split] = []
        for doc, in tqdm.tqdm(get_documents(metadata_df=dataset, path="../data", ret=[]), total=len(dataset)):
            for sentence in process_documents(doc, lemmatizer=lemmatizer, remove_newlines=True).sents:
                ret[split].append(sentence.text.split())
    return ret
    

In [182]:
def generate_gendered_dataset(dataset):
    """Split dataset based on gender.
    
    :param dataset: pd.DataFrame metadata-style object.
    
    :return: Dict with male, female and unknown pd.DataFrame metadata-style objects
    """
        
    return {gender: dataset[dataset.gender == gender] for gender in ["m", "k", "u"]}

In [286]:
gender_dataset = generate_gendered_dataset(datasets["train"])
sentence_set = sentencize(lemmatizer=lemmatizer, **gender_dataset)
word_groups = group_words(2, **sentence_set)