In [2]:
!python -m spacy download nb_core_news_sm > /dev/null

/opt/anaconda3/bin/python: No module named spacy


In [3]:
import os
import pandas as pd
import spacy
from spacy.lang.nb.examples import sentences 
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords
import fasttext
from nltk import NaiveBayesClassifier
from nltk import classify
import random
import tqdm
from itertools import groupby
import pickle

ModuleNotFoundError: No module named 'spacy'

In [None]:
def filter_split_dataset(metadata_df = None):
    """Keep only the Bokmål ("nb") rows and partition them by their `split` value.

    :param metadata_df: pd.DataFrame with `language` and `split` columns. When
        omitted, it is read from `data/metadata.json` and transposed.

    :return: 2-tuple (train rows, test rows) of pd.DataFrames.
    """
    if metadata_df is None:
        metadata_df = pd.read_json("data/metadata.json").T

    # Drop every row that is not Bokmål before partitioning.
    bokmal_df = metadata_df[metadata_df["language"] == "nb"]

    train_df = bokmal_df[bokmal_df["split"] == "train"]
    test_df = bokmal_df[bokmal_df["split"] == "test"]
    return train_df, test_df
    

In [None]:
def process_documents(doc, lemmatizer=None, remove_newlines=False):
    """Run a document through a lemmatizer / NLP pipeline.

    :param doc: Str, the raw document text.
    :param lemmatizer: Function Str -> iter(Str,). When None, the spaCy pipeline
        `nb_core_news_sm` is loaded on every call, so callers that process many
        documents should pass an already loaded pipeline.
    :param remove_newlines: Bool, whether to remove newline characters first.

    :return: Whatever `lemmatizer(doc)` returns.
    :raises AssertionError: If `doc` is not a str.
    """
    # The message has to be an f-string, otherwise `{type(doc)}` is shown literally.
    assert isinstance(doc, str), f"doc has to be of type str, {type(doc)} is not supported."

    if lemmatizer is None:
        lemmatizer = spacy.load("nb_core_news_sm")

    if remove_newlines:
        # Newlines are deleted outright (not replaced by a space), so the text
        # on either side of a line break ends up directly adjacent.
        doc = doc.replace("\n", "")

    return lemmatizer(doc)

In [None]:
def clean_document(doc, stop_words, **process_kwargs):
    """Lemmatize a document and drop the stop words.

    Every kept lemma becomes one string in the result. Lemmas found in
    `string.punctuation` are emitted as-is, all other lemmas get a single
    leading space, so `"".join(result)` attaches punctuation to the
    preceding word.

    :param doc: Str, the document to clean.
    :param stop_words: [Str] lemmas to exclude.
    :param **process_kwargs: Passed to process_documents.

    :return: [Str], one entry per kept token.
    """
    pieces = []
    for token in process_documents(doc, **process_kwargs):
        lemma = token.lemma_
        if lemma in stop_words:
            continue
        # `in string.punctuation` is a substring test against the punctuation characters.
        prefix = "" if lemma in string.punctuation else " "
        pieces.append(f"{prefix}{lemma}")

    return pieces

In [None]:
def get_documents(metadata_df=None, path="data", ret=["rating", "authors"]):
    """Lazily read the review text files listed in a metadata frame.

    Each review is read from `<path>/<split>/<id zero-padded to 6 digits>.txt`.

    :param metadata_df: pd.DataFrame with `split` and `id` columns plus every
        column named in `ret`. Read from `<path>/metadata.json` when None.
    :param path: Str path to the folder that holds the split sub-folders.
    :param ret: Columns from each review to yield alongside the document itself.
        The default list is only read, never mutated.

    :return: Generator of tuples `(document, *values of the ret columns)`.
    """
    if metadata_df is None:
        metadata_df = pd.read_json(f"{path}/metadata.json").T

    for _, review in metadata_df.iterrows():
        doc_path = f"{path}/{review['split']}/{str(review['id']).zfill(6)}.txt"
        # Context manager so the handle is closed as soon as the text is read.
        with open(doc_path, "r") as doc_file:
            document = doc_file.read()
        yield (document, *[review[col] for col in ret])

In [None]:
def make_processed_datasets(debug=False, **datasets):
    """Write fastText-style dataset files, with each line being one text.

    For every keyword argument the file `../data/processed/<kwarg_key>.txt` is
    (over)written. Each line has the form
    `__label__<rating> <a document, without linebreaks>`, and the lines are
    shuffled before writing.

    Relies on the notebook-level globals `stop_words` and `lemmatizer`.

    :param debug: Set to true to only process the first doc of each dataset.
    :param **datasets: pd.DataFrames of the metadata format.

    :return: None
    """
    for split, dataset in datasets.items():
        documents = get_documents(path="../data", metadata_df=dataset, ret=["rating"])

        text_list = []
        for doc, rating in tqdm.tqdm(documents, total=len(dataset)):
            clean_str = "".join(clean_document(doc, stop_words=stop_words, lemmatizer=lemmatizer, remove_newlines=True))
            text_list.append(f"__label__{rating} {clean_str}\n")

            if debug:
                break

        random.shuffle(text_list)

        # Every entry already ends with a newline, so write them back to back;
        # joining them with "\n" would put an empty line between all documents.
        with open(f"../data/processed/{split}.txt", "w") as file:
            file.writelines(text_list)
        

In [None]:
# Load the metadata JSON and transpose it (rows <-> columns).
metadata_df = pd.read_json("../data/metadata.json").transpose()

In [None]:
def set_gender(authors, authors_df):
    """Gives the gender of a list of authors based on labeled names in a df.

    :param authors: [Str,] where each element is an author.
    :param authors_df: pd.DataFrame with names as index and a `gender` column.

    :return: The shared gender label when all authors have the same one
        (`m` for male, `k` for female), otherwise `u` for unknown or
        ambiguous. An empty author list also gives `u`.
    :raises KeyError: If an author that is looked up is missing from `authors_df`.
    """
    # No authors at all means the gender is unknown.
    if len(authors) == 0:
        return "u"

    first_gender = authors_df.gender[authors[0]]
    for author in authors[1:]:
        # Any disagreement between co-authors makes the label ambiguous.
        if authors_df.gender[author] != first_gender:
            return "u"
    return first_gender

def label_author_gender(metadata_df):
    """Finds all author names, and prompts the user to label their gender.

    The user is asked once per distinct first name (the first
    whitespace-separated part of each full name). Valid answers are `m`, `k`
    and `u`; answering `r` steps back to the previous name so it can be redone.

    Stores author genders in `../data/authors.csv` and adds a `gender` column
    to `metadata_df` in place.

    :param metadata_df: pd.DataFrame with an `authors` column holding lists of names.

    :return: metadata_df with gender column.
    """

    all_authors = []
    for authors in metadata_df.authors:
        all_authors.extend(authors)
    all_authors = list(set(all_authors))

    first_names = list(set([full_name.split()[0] for full_name in all_authors]))

    genders = {}
    i = 0
    while i < len(first_names):
        gender = input(f"Sex of `{first_names[i]}` ({i}/{len(first_names)})`: ").lower()
        if gender in ["m", "k", "u"]:
            genders[first_names[i]] = gender
            i += 1
        elif gender == "r":
            # Step back to redo the previous name, but never before the first
            # one: a negative index would wrap around to the end of the list.
            i = max(i - 1, 0)
            print("Correcting error, type last gender again.")
        else:
            print("Type either `m` for male, `k` for female, or `u` for unknown/other.")

    authors_df = pd.DataFrame(index=all_authors, data={"name": all_authors})

    def apply_firstname_gender(full_name):
        # Every full name inherits the label given to its first name.
        return genders[full_name.split()[0]]

    authors_df["gender"] = authors_df.name.apply(apply_firstname_gender)
    authors_df.to_csv("../data/authors.csv")

    metadata_df["gender"] = metadata_df.authors.apply(lambda authors: set_gender(authors, authors_df))

    return metadata_df

# Gender labels are entered by hand, so only redo that work when there are no
# labels yet (neither on disk nor as a column) or when the user asks for it.
if not (os.path.exists("../data/authors.csv") or "gender" in metadata_df.columns):
    metadata_df = label_author_gender(metadata_df)
else:
    overwrite = input("Vil du overskrive dataen om kjønnet til forfattere? [Y/n] ").lower()

    if overwrite in ("y", "yes"):
        metadata_df = label_author_gender(metadata_df)

    if "gender" not in metadata_df.columns:
        # Reuse the labels stored in the CSV instead of asking again.
        authors_df = pd.read_csv("../data/authors.csv", index_col="name")
        metadata_df["gender"] = metadata_df.authors.apply(lambda authors: set_gender(authors, authors_df))

In [None]:
# Cast the ratings to int, then show the mean rating per author-gender label.
metadata_df["rating"] = metadata_df["rating"].astype(int)
metadata_df.groupby("gender")["rating"].mean()

In [None]:
# Bokmål-only train/test metadata keyed by split name, plus one spaCy pipeline
# that is loaded once here and passed to the functions below.
train_metadata_df, test_metadata_df = filter_split_dataset(metadata_df=metadata_df)
datasets = dict(train=train_metadata_df, test=test_metadata_df)
lemmatizer = spacy.load("nb_core_news_sm")

In [None]:
# Extract stop words in Bokmål from http://snowball.tartarus.org/algorithms/norwegian/stop.txt
# Data lines are expected to look like `<word> | <explanation>`. Words whose
# explanation ends in `*` are skipped (NOTE(review): the list's header should
# say these are the Nynorsk-only words - confirm against the file).
stop_words = []
with open("../data/stop_words.txt", "r") as stop_words_file:
    for line in stop_words_file:
        # partition never raises: a line without `|` just gets an empty explanation.
        stop_word, _, explanation = line.partition("|")
        stop_word = stop_word.strip()
        if not stop_word:
            # Blank lines and comment lines have nothing in front of the `|`.
            continue
        # rstrip so the check also works on a last line without a trailing newline.
        if explanation.rstrip().endswith("*"):
            continue
        stop_words.append(stop_word)
print(stop_words)

In [None]:
# This creates two datasets, a training set and a test set, for the fastText model.
# It does not have to run every time, only the first time on a given machine,
# so ask before overwriting files that already exist.
if not any(os.path.exists(f"../data/processed/{split}.txt") for split in ("train", "test")):
    make_processed_datasets(**datasets)
else:
    overwrite = input("Vil du overskrive de prosesserte datasettetene ditt? [Y/n] ").lower()
    if overwrite in ("y", "yes"):
        make_processed_datasets(**datasets)

In [None]:
# Train the supervised fastText classifier on the processed training file.
model = fasttext.train_supervised(
    input="../data/processed/train.txt",
    epoch=30,
    lr=1.0,
    wordNgrams=3,
    verbose=2,
)

In [None]:
# Persist the trained model as `fasttext_model.bin` in the working directory.
model.save_model("fasttext_model.bin")

In [None]:
# Evaluate the trained model on the processed test file.
model.test("../data/processed/test.txt")

In [None]:
def predict(doc, model, lemmatizer=None, stop_words=stop_words):
    """Clean a text the same way as the training data, then classify it.

    :param doc: Str, the raw text.
    :param model: Model with predict method.
    :param lemmatizer: Function Str -> iter(Str,). The spaCy pipeline
        `nb_core_news_sm` is loaded when None.
    :param stop_words: [Str] lemmas to drop. Defaults to the notebook-level
        `stop_words` as it was when this function was defined.

    :return: 2-tuple of (last character of the top predicted label, cleaned text).
    """
    nlp = lemmatizer if lemmatizer is not None else spacy.load("nb_core_news_sm")

    tokens = clean_document(doc, stop_words=stop_words, lemmatizer=nlp, remove_newlines=True)
    clean_doc = "".join(tokens)

    labels = model.predict(clean_doc)[0]
    # Only the final character of the first label is returned.
    return labels[0][-1], clean_doc

In [None]:
# Smoke test: classify a hand-written film review, reusing the already loaded pipeline.
predict("Dette var en fantastisk film, leverte på alle punkter. Min favorittdel av den var den dramatiske scenen, der hovedpersonen får vite om sin skjebne.", model, lemmatizer=lemmatizer)

## Gjøreliste:

For å finne mønster vil vi sjekke de enkeltordene, og gruppene på to og tre ord, hvor det er størst forskjell på bruken hos kvinner og menn, og hvor ordene totalt er brukt nok til at vi kan tro på at det er sannsynlig at dette er et reelt mønster.

Mer konkret, må vi altså implementere følgende:  
~~1) En tokenizer og wordnet-greie som lager tokens.~~  
~~2) En funksjon som henter ut grupper på 1, 2 og 3 ord, sortert etter frekvens~~  
~~3) Hente ut de ordene eller ordgruppene som er brukt mer enn for eksempel 50 ganger totalt.~~  
~~4) Dele datasettet inn i menn og kvinner, og gjøre ei vurdering på hvor sikker vi her må være på kjønn (0.9 ser bra ut)~~
  
5) Sjekke hvor ofte hver av ordgruppene forekommer hos kvinner og menn  
6) Sortere ordgruppene etter den betingede sannsynligheten for at noen er kvinne gitt at de har brukt denne ordgruppen  
  
7) Manuelt søke etter ordgrupper som brukes mye av menn eller kvinner, men som har et synonym hos det andre kjønnet  
8) Manuelt konstruere setninger som bruker disse ordene, og se om vår sentimentalgoritme gir disse rent kjønnede ordene forskjellig positivitets-verdi  

9) Sammenligne den relative frekvensen av ordgruppene hos menn, kvinner, og positive og negative sentimenter

Sjekke antall ord per setning, og stavelser per ord

Kontrollere for kjønn i forskjellige sjangre

Fjerne ordgrupper som brukes mye, men av veldig få?

Her ser vi altså ikke på mer avanserte mønster, som om setningsoppbyggingen er forskjellig.

In [None]:
def sort_by_value(unsorted_dict):
    """Return a new dict with the same items, ordered by descending value.

    Items with equal values keep their original relative order.

    :param unsorted_dict: Dict

    :return: Sorted dict, descending
    """
    ordered_items = list(unsorted_dict.items())
    ordered_items.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ordered_items)

In [None]:
def sentencize(lemmatizer=None, **datasets):
    """Read every document of each dataset and split it into sentences.

    Documents are read from `../data` with newlines removed, and every
    sentence is stored as a list of whitespace-separated words.

    :param lemmatizer: Pipeline whose result exposes `.sents`. The spaCy
        pipeline `nb_core_news_sm` is loaded when None.
    :param **datasets: pd.DataFrame metadata-style objects.

    :return: Dict where each dataset has a list of sentences, each sentence
        being a list of words.
    """
    if lemmatizer is None:
        lemmatizer = spacy.load("nb_core_news_sm")

    # The progress bar counts documents across all datasets.
    total_docs = sum(len(dataset) for dataset in datasets.values())

    ret = {}
    with tqdm.tqdm(total=total_docs) as pbar:
        for split, dataset in datasets.items():
            sentences = []
            ret[split] = sentences
            # With `ret=[]` every yielded tuple holds only the document text.
            for (doc,) in get_documents(metadata_df=dataset, path="../data", ret=[]):
                pbar.update(1)
                parsed = process_documents(doc, lemmatizer=lemmatizer, remove_newlines=True)
                sentences.extend(sentence.text.split() for sentence in parsed.sents)
    return ret
    

In [None]:
def group_words(length, **sentence_sets):
    """Counts every run of `length` consecutive words, per sentence set.

    :param length: Int. Number of words in each group.
    :param **sentence_sets: [[Str]] per keyword, where each inner list is one
        sentence split into words.

    :return: Dict with the same keys as `sentence_sets`. Each value is a dict
        mapping a space-joined word group to its number of occurrences,
        sorted by descending count.
    """
    word_groups = {}
    # One progress tick per sentence set.
    with tqdm.tqdm(total=len(sentence_sets)) as pbar:
        for split, sentence_set in sentence_sets.items():
            groups = []
            for sentence in sentence_set:
                # `+ 1` so the window that ends on the last word is included.
                for start in range(len(sentence) - length + 1):
                    groups.append(" ".join(sentence[start:start + length]))

            # groupby only merges adjacent equal items, hence the sort.
            groups.sort()
            counts = {key: sum(1 for _ in group) for key, group in groupby(groups)}
            word_groups[split] = sort_by_value(counts)
            pbar.update(1)
    return word_groups

In [None]:
def generate_gendered_dataset(dataset):
    """Partition a dataset by the value of its `gender` column.

    :param dataset: pd.DataFrame metadata-style object with a `gender` column.

    :return: Dict with keys `m`, `k` and `u`, each holding the rows of
        `dataset` carrying that gender label.
    """
    by_gender = {}
    for gender in ("m", "k", "u"):
        by_gender[gender] = dataset[dataset.gender == gender]
    return by_gender

In [None]:
def sentence_complexity(lemmatizer=None, **sentence_sets):
    """Finds the average word count per sentence, and char count per word.

    :param lemmatizer: Unused. Kept so existing calls that pass it keep working;
        no pipeline is loaded because the sentences are already split into words.
    :param **sentence_sets: [[Str]] per keyword, where each inner list is one
        sentence split into words.

    :return: Dict with same keys as sentence_sets, and 2-tuples with avg
        word/sent and char/word as values. A set without sentences (or without
        words) gets 0.0 for the affected average instead of raising.
    """
    complexity = {}
    for split, sentence_set in sentence_sets.items():
        words = sum(len(sentence) for sentence in sentence_set)
        chars = sum(len(word) for sentence in sentence_set for word in sentence)

        # Guard both divisions so an empty set does not raise ZeroDivisionError.
        words_per_sent = words / len(sentence_set) if len(sentence_set) else 0.0
        chars_per_word = chars / words if words else 0.0

        complexity[split] = (words_per_sent, chars_per_word)

    return complexity

In [None]:
def word_ratios(ratio_threshold, absolute_threshold, **word_groups_dicts):
    """Finds the word groups one set uses markedly more often than another.

    Every ordered pair of distinct sets is compared. A word group is kept for
    the pair (first, second) when it occurs in both, its count in `second` is
    at least `absolute_threshold`, and count_first / count_second is at least
    `ratio_threshold`.

    TODO: Normalise with regard to amount of words written by each group, and in each tag

    :param ratio_threshold: Float
    :param absolute_threshold: Int
    :param **word_groups_dicts: Per keyword, a dictionary with all unique
        groups, string : int number of occurences.

    :return: Dictionary keyed by the two set names concatenated (first + second),
        each value a dict of word group : ratio, sorted by descending ratio.
    """
    names = list(word_groups_dicts)

    ratios = {}
    # The second set of the pair varies slowest, which fixes the key order.
    for second in names:
        for first in names:
            if first == second:
                continue

            first_counts = word_groups_dicts[first]
            second_counts = word_groups_dicts[second]

            kept = {}
            for word_group in first_counts:
                if word_group not in second_counts:
                    continue
                if second_counts[word_group] >= absolute_threshold:
                    ratio = first_counts[word_group] / second_counts[word_group]
                    if ratio >= ratio_threshold:
                        kept[word_group] = ratio

            ratios[first + second] = sort_by_value(kept)

    return ratios

In [None]:
gender_dataset = generate_gendered_dataset(datasets["train"])

# The split sentences are cached as a pickle. Without a cache file they are
# always computed; with one, the user decides whether to recompute.
overwrite = (
    input("Vil du overskrive dataen om setningssammensetning? [Y/n] ").lower()
    if os.path.exists("../data/sentence_sets.pkl")
    else "y"
)

if overwrite not in ("y", "yes"):
    with open("../data/sentence_sets.pkl", "rb") as handle:
        sentence_set = pickle.load(handle)
else:
    sentence_set = sentencize(lemmatizer=lemmatizer, **gender_dataset)

    with open("../data/sentence_sets.pkl", "wb") as handle:
        pickle.dump(sentence_set, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Index n holds the word groups of length n; index 0 stays an unused placeholder.
word_groups = [0] * 5
for i in range(1, 5):
    # `\r` with end="" rewrites the same output line for each length.
    print(f"\rWorking on finding word groups of length {i}", flush=True, end="")
    word_groups[i] = group_words(i, **sentence_set)
print("Found word groups for all lengths up to 4")

In [None]:
# Average words per sentence and characters per word for each sentence set.
complexity = sentence_complexity(lemmatizer=lemmatizer, **sentence_set)

In [None]:
print(complexity) # Not significant

In [None]:
# For every word-group length, list the ten groups with the highest usage
# ratio in each direction (women vs. men and men vs. women).
for i, _word_groups in enumerate(word_groups[1:]):
    ratios = word_ratios(1.5, 10, **_word_groups)
    print(f"Ordgrupper med lengde {i+1}")
    for split_combos, desc in (("km", "Kvinner mer enn menn"), ("mk", "Menn mer enn kvinner")):
        print(f"   {desc}:")
        # The dicts are sorted by descending ratio, so the first ten are the top ten.
        top_ten = list(ratios[split_combos].items())[:10]
        for word, ratio in top_ten:
            print(f"         {word}: {ratio:.1f}")

In [None]:
def eec_generator(templates, people, feelings):
    """Generates corpus with all combinations of templates, people and feelings.

    :param templates: [Str], format strings with `{person}` and `{feeling}` fields.
    :param people: [(Str: id, Str: value)]
    :param feelings: [(Str: id, Str: value)]

    :return: 2-tuple of corpus and ids. `corpus[n]` is the filled-in sentence
        and `ids[n]` the matching `(template, person id, feeling id)`.
    """
    corpus = []
    ids = []
    for template in templates:
        for person in people:
            for feeling in feelings:
                corpus.append(template.format(person=person[1], feeling=feeling[1]))
                # The third element is the feeling id, not the person id again.
                ids.append((template, person[0], feeling[0]))

    return corpus, ids

In [None]:
# Inputs for the generated test sentences: every template is filled in with
# every (gender id, person phrase) and every (feeling id, feeling word).
templates = [
    "{person} er {feeling}",
    "Situasjonen får {person} til å bli {feeling}",
    "Jeg fikk {person} til å bli {feeling}",
]
people = [
    ("m", "sønnen min"),
    ("k", "datteren min"),
    ("m", "han"),
    ("k", "henne"),
]
feelings = [
    ("anger", "sint"),
    ("joy", "glad"),
]

eec_corpus = eec_generator(templates, people, feelings)

In [None]:
# Print each generated sentence, its cleaned form and the predicted label.
for prompt, identifier in zip(*eec_corpus):
    label, cleaned = predict(prompt, model, lemmatizer=lemmatizer)
    print(f"{prompt}/{cleaned}: {label}")

In [None]:
# Checks that the model gives answers other than 3 and 4, and works at least a little.
# Read through a context manager so the file handle is closed again.
with open("../data/test/000307.txt", "r") as doc_file:
    doc = doc_file.read()
clean = clean_document(doc, stop_words, lemmatizer=lemmatizer, remove_newlines=True)
print(f"Fasit er 4, modellen gjetter {predict(doc, model, lemmatizer=lemmatizer)}")

To analyseretninger herfra:

1) Se om modellen gir bedre, dårligere, eller mer varierende anmeldelser hvis noe handler tydelig om menn enn om kvinner
2) Se om modellen gir bedre, dårligere, eller mer varierende anmeldelser hvis ordene som brukes er mer kvinnelige eller mannlige - TODO: Sjekk om modellen rangerer kvinnelig forfatterte tekster annerledes

In [None]:
# Compare the prediction for an original text with the prediction for the
# modified copy of it (`000028k.txt`).
with open("../data/gender_eval/000028.txt", "r") as doc_file:
    doc_org = doc_file.read()
# The name has to be `doc_k`, which is what the print below reads.
with open("../data/gender_eval/000028k.txt", "r") as doc_file:
    doc_k = doc_file.read()
print(f"For orginal tekst gjetter modellen {predict(doc_org, model, lemmatizer=lemmatizer)}, mens for teksten modifisert med kvinnelige ord gjetter modellen {predict(doc_k, model, lemmatizer=lemmatizer)}")