Assignment 3 - Diachronic analysis of movie dialogues
===

*Due: January 1 2023*

In this assignment you will analyze how the expression of Hope in movie dialogues changed with time.

## Imports

In [103]:
import os
from nltk.corpus import wordnet as wn
import pandas as pd
import ipywidgets as widgets
import itertools
import nltk
import regex as re
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.word2vec import Word2Vec


lemma = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
layout = widgets.Layout(width="auto")
lang = "eng"
data_path = "data/dialogs_preprocessed2/"

## Functions

### Functions for text preprocessing

In [94]:
def clean_url(input):
    output = re.sub(r"http\S+", "", input)
    return output


def fix_contraction(input):
    output = contractions.fix(input)
    return output


def clean_non_alphanumeric(input):
    output = re.sub(r"[^a-zA-Z0-9]", " ", input)
    return output


def clean_tokenization(input):
    output = nltk.word_tokenize(input)
    return output


def clean_stopwords(input):
    output = [item for item in input if item not in stop_words]
    return output


def numbers_to_words(input):
    output = []
    for item in input:
        if item.isnumeric() == True:
            output += [p.number_to_words(item)]
        else:
            output += [item]
    return output


def clean_lowercase(input):
    output = str(input).lower()
    return output


def clean_lemmatization(input):
    output = [lemma.lemmatize(word=w, pos="v") for w in input]
    return output


def clean_length(input):
    output = [word for word in input if len(word) > 2]
    return output


def convert_to_string(input):
    output = " ".join(input)
    return output


def preprocessing(text, remove_stopwords=True):
    """
    Preprocessing pipeline.
    """
    text = clean_url(text)
    text = fix_contraction(text)
    text = clean_non_alphanumeric(text)
    text = clean_lowercase(text)
    text = clean_tokenization(text)
    text = numbers_to_words(text)
    if remove_stopwords:
        text = clean_stopwords(text)
    text = clean_lemmatization(text)
    text = clean_length(text)
    text = convert_to_string(text)
    return text

### Functions for wordnet

In [80]:
def generate_word_list(seed_word, language):
    """
    Takes in a single seed word and returns
    a word list, the length and a list of synsets
    """

    ## we create an empty list to store the final word list
    list_of_lemmas = []
    list_of_meanings = []

    ## a function to add a word to a list
    add_to_list = lambda list1, item1: list1.append(item1)

    ## a function to return the hyponyms of a synset
    hypos = lambda s: s.hyponyms()

    ## wn.synset obtains the list of synonyms and meanings for that word, in different syntactic categories
    meanings = wn.synsets(seed_word, pos=wn.NOUN + wn.VERB + wn.ADJ)

    ## loop over set of meanings in synset
    for meaning in meanings:

        ## add synset, definition, and a list of all associated lemmas into the list_of_meanings
        list_of_meanings += [
            [
                meaning,
                meaning.definition(),
                [lemma.name() for lemma in meaning.lemmas(language)],
            ]
        ]

        ## append all synonyms (lemmas()) of that meaning to the list_of_lemmas
        [
            add_to_list(list_of_lemmas, lemma.name())
            for lemma in meaning.lemmas(language)
        ]

        ## loop over the list of all possible hyponyms
        for hyponym in meaning.closure(hypos):

            ## add synsets, definition, and a list of all associated lemmas into the list_of_meanings
            list_of_meanings += [
                [
                    hyponym,
                    hyponym.definition(),
                    [lemma.name() for lemma in hyponym.lemmas(language)],
                ]
            ]

            ## append all synonyms (lemmas()) of that hyponym to the list_of_lemmas
            [
                add_to_list(list_of_lemmas, lemma.name())
                for lemma in hyponym.lemmas(language)
            ]

    ##eliminate list duplications by applying the set transformation
    set_of_lemmas = [*set(list_of_lemmas)]

    ## sort alphabetically
    set_of_lemmas.sort()

    ##length
    length = len(set_of_lemmas)

    return (set_of_lemmas, length, list_of_meanings)


def prune_list(w, syn_list):
    """
    Takes in a container checkbox widget,
    a list with synsets and returns
    a filtered word list.
    """

    filtered_meanings = list(
        itertools.compress(syn_list, [widget.value for widget in w.children])
    )

    filtered_list = [word for lemmas in filtered_meanings for word in lemmas[2]]

    filtered_list = sorted([*set(filtered_list)])

    return filtered_list


def expand_meanings(seed_words, language):
    """
    Takes in a list of seed words and returns all synsets.
    """

    list_meanings = []

    for sw in seed_words:
        _, _, meanings = generate_word_list(sw, language)
        list_meanings += meanings

    list_meanings.sort()

    # groupby also eliminates duplications
    list(list_meanings for list_meanings, _ in itertools.groupby(list_meanings))

    return list_meanings

### Functions for vec2word

In [154]:
def literary_words_list(root_folder):
    """

    Function to select .txt files and store them as a list of paragraphs,
    each a list of words, to use as input to the function WordVec.

    Parameters
    ----------
    root_folder : a file path where .txt files are located
    e.g. '~/home/user/text_analyses'

    Returns
    -------
    A list of the paragraphs, each paragraph a list of words

    """

    words_list = []

    for path, subdirs, files in os.walk(root_folder):
        for file in files:
            if ".txt" in file and "model" not in file:
                # print(file)
                name = os.path.join(path, file)
                file_text = open(name, encoding="utf-8").read()
                text_list_paragraphs = file_text.split("\n")

                for paragraph in text_list_paragraphs:
                    paragraph = paragraph.replace("\r", "")
                    list_of_sentences = nltk.sent_tokenize(paragraph)

                    for sentence in list_of_sentences:
                        words_list += [preprocessing(sentence)]

    words_list = [w for w in words_list if w != ""]

    return words_list


def get_word2vec_list(word_list, model):
    """
    Function to use word2vec to inquiry about the 10 most similar semantically
    words to each seed word in word_list. Uses a pre-trained model to get the
    most similar words within the text.

    """

    list_of_word2vec_lists = []
    for word in word_list:
        try:

            ##
            list_vects = model.wv.most_similar([word], topn=10)

            new_list = []
            new_list += [word]
            for item in list_vects:
                word1 = item[0]
                new_list += [word1]

            # print(new_list)
            # print('\n')
            list_of_word2vec_lists += [new_list]

        except KeyError:
            continue

    return list_of_word2vec_lists

## 0. Expression of interest: Hope

In [69]:
seed_word = "hope"

## 1. Control measure: Irritation

Find control measure to contrast with "Hope"

In [70]:
ctrl_seed_word = "irritation"

## 2. Initial bags of seeds

Find psychometric tools or equivalent to generate initial bags of seeds, for both Hope and the control condition

In [71]:
seed_list = generate_word_list(seed_word, lang)
print(f"Length of lemma list is {seed_list[1]}")

Length of lemma list is 16


In [72]:
ctrl_seed_list = generate_word_list(control_seed_word, lang)
print(f"Length of lemma list is {ctrl_seed_list[1]}")

Length of lemma list is 29


## 3. Expand bag of seeds and get synonyms and hyponyms

Use WordNet tools to expand your bag of seeds and get synonyms and hyponyms, and start excluding words with unrelated meanings using the filters available.

### Hope

In [73]:
selection_widget = widgets.VBox(
    [
        widgets.Checkbox(
            value=True,
            description=str(item),
            disabled=False,
            indent=False,
            layout=layout,
        )
        for item in seed_list[2]
    ]
)

selection_widget

VBox(children=(Checkbox(value=True, description="[Synset('hope.n.01'), 'a specific instance of feeling hopeful…

In [74]:
seed_words = prune_list(selection_widget, seed_list[2])

print(seed_words)

['desire', 'encouragement', 'go_for', 'great_white_hope', 'hope', 'hopefulness', 'optimism', 'promise', 'rainbow', 'sanguineness', 'sanguinity', 'trust', 'white_hope']


#### Extend

In [75]:
synsets = expand_meanings(seed_words, lang)
len(synsets)

300

In [99]:
synsets[:3]

[[Synset('abetment.n.01'),
  'the verbal act of urging on',
  ['abetment', 'abettal', 'instigation']],
 [Synset('accept.v.03'),
  'give an affirmative reply to; respond favorably to',
  ['accept', 'consent', 'go_for']],
 [Synset('accept.v.07'),
  'tolerate or accommodate oneself to',
  ['accept', 'live_with', 'swallow']]]

### Irritation

In [57]:
ctrl_selection_widget = widgets.VBox(
    [
        widgets.Checkbox(
            value=True,
            description=str(item),
            disabled=False,
            indent=False,
            layout=layout,
        )
        for item in ctrl_seed_list[2]
    ]
)

ctrl_selection_widget

VBox(children=(Checkbox(value=True, description="[Synset('irritation.n.01'), 'the psychological state of being…

In [76]:
ctrl_seed_words = prune_list(ctrl_selection_widget, ctrl_seed_list[2])

print(ctrl_seed_words)

['aggravation', 'aggro', 'annoyance', 'annoying', 'botheration', 'bummer', 'discomfort', 'exacerbation', 'exasperation', 'excitation', 'huff', 'impatience', 'innervation', 'irritation', 'last_straw', 'miff', 'pinprick', 'pique', 'provocation', 'red_flag', 'restlessness', 'seeing_red', 'snit', 'soreness', 'taunt', 'taunting', 'temper', 'twit', 'vexation']


#### Extend

In [77]:
ctrl_synsets = expand_meanings(ctrl_seed_words, lang)
len(ctrl_synsets)

208

In [98]:
ctrl_synsets[:3]

[[Synset('abatable_nuisance.n.01'),
  'a nuisance that can remedied (suppressed or extinguished or rendered harmless)',
  ['abatable_nuisance']],
 [Synset('abatable_nuisance.n.01'),
  'a nuisance that can remedied (suppressed or extinguished or rendered harmless)',
  ['abatable_nuisance']],
 [Synset('aggravation.n.01'),
  'an exasperated feeling of annoyance',
  ['aggravation', 'exasperation']]]

## 4. Train semantic vector space model

Train a semantic vector space model using word2vec

In [153]:
word2vec_input = literary_words_list(data_path)

<div class="alert alert-warning">Is this following step necessary? Otherwise it wouldn't work. If yes, why not including it into the function literary_words_list?</div>

In [156]:
# each sentence to list -> necessary?
word2vec_input_list = [s.split() for s in word2vec_input]

In [157]:
word2vec_output = Word2Vec(word2vec_input_list, min_count=1)

## Save vector space
word2vec_output.save("w2v_model.txt")
model = Word2Vec.load("w2v_model.txt")

<div class="alert alert-warning">Should the output look like this? A list of lists?<div>

In [166]:
check_vector_space = get_word2vec_list(filtered_list, model)
check_vector_space[:2]

[['desire',
  'belief',
  'guilt',
  'wisdom',
  'destiny',
  'painful',
  'nature',
  'ability',
  'passion',
  'mortal',
  'fate'],
 ['encouragement',
  'placate',
  'constable',
  'knowi',
  'caution',
  'vagina',
  'lucian',
  'rant',
  'resolute',
  'bumstead',
  'astound']]

## 5. Semantic clouds

Use your vector space model to find out the semantic clouds of each word in your bag of seeds, and select only the words with semantically meaningful clouds

In [1]:
#
#
#

## 6. Final bag of rods and word frequencies

Take the final Bag of Words, prune it further if necessary, and use the second script to calculate word frequencies (don't forget to upload the text files)

In [1]:
#
#
#

## 7. Relevant ratio, visualization of time series

Calculate your psychological relevant ratio, build a dataframe and plot a time series using Seaborn

In [1]:
#
#
#