In [1]:
import numpy as np
import os
from tqdm import tqdm

In [14]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

nltk.download('wordnet')
from nltk.corpus import wordnet as wn

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /home/dyco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dyco/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/dyco/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


We don't want stopwords as they would probably just be noise:

In [3]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/dyco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import gensim
from gensim import corpora

In [5]:
DATA_PATH = 'data'

Load summaries:

In [6]:
summaries = []
with open(os.path.join(DATA_PATH, 'plot_summaries.txt')) as ps:
    ps_text = ps.readlines()
    for line in ps_text[0:1000]:
        summaries.append(line.split('\t'))

Create dictionaries to extract nouns, verbs and descriptions (adverbs and adjectives):

In [7]:
is_noun = lambda pos: pos.startswith('NN')
nouns = {}

is_verb = lambda pos: pos.startswith('VB')
verbs = {}

is_desc = lambda pos: pos.startswith('RB') or pos.startswith('JJ')
descriptions = {}

Function to select only certain types of words (verbs, nouns or descriptions) from a list of tokens:

In [8]:
def get_pos_words(tokenized_text, pos_selection_function):
    return [word.lower()
            for (word, pos) in nltk.pos_tag(tokenized_text)
            if pos_selection_function(pos) and len(word) > 3]

Extract lemmas from words:

In [9]:
def get_lemmas(words):
    lemmas = []
    for word in words:
        lemma = wn.morphy(word)
        if lemma is None and word not in en_stop:
            lemmas.append(word)
        elif lemma is not None and lemma not in en_stop:
            lemmas.append(lemma)
    return lemmas

Call previously defined functions to prepate lists of verbs, nouns and descriptions:

In [10]:
def prepare_text(summary, nouns, verbs, descs):
    tokenized = nltk.word_tokenize(summary[1])
    
    nouns[summary[0]] =  get_pos_words(tokenized, is_noun)
    nouns[summary[0]] = get_lemmas(nouns[summary[0]])

    verbs[summary[0]] =  get_pos_words(tokenized, is_verb)
    verbs[summary[0]] = get_lemmas(verbs[summary[0]])

    descs[summary[0]] =  get_pos_words(tokenized, is_desc)
    descs[summary[0]] = get_lemmas(descs[summary[0]])

Process the data:

In [11]:
for summary in tqdm(summaries):
    prepare_text(summary, nouns, verbs, descriptions)

100%|██████████| 1000/1000 [00:24<00:00, 40.67it/s]


In [29]:
def extract_tf_idf_features(dict_of_docs):
    raw_texts = [" ".join(words) for words in list(dict_of_docs.values())]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(raw_texts)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    mean_tfidf_scores = tfidf_matrix.mean(axis=0).tolist()[0]

    # Zip feature names with mean TF-IDF scores
    term_tfidf_pairs = zip(feature_names, mean_tfidf_scores)

    # Sort terms by mean TF-IDF score in descending order
    sorted_terms = sorted(term_tfidf_pairs, key=lambda x: x[1], reverse=True)

    # Set a threshold (e.g., keep top 80% of terms)
    threshold = 0.2
    selected_terms = [term for term, tfidf_score in sorted_terms if tfidf_score >= threshold]
    print(selected_terms)

    return selected_terms


for x in extract_tf_idf(nouns):
    print(x)



TypeError: 'NoneType' object is not iterable

How's it going?

In [12]:
i=3
print(len(nouns[summaries[i][0]]))
print(verbs[summaries[i][0]])
print(descriptions[summaries[i][0]])

222
['come', 'intend', 'convince', 'switch', 'employ', 'doe', 'demand', 'provide', 'make', 'decide', 'return', 'come', 'try', 'make', 'visit', 'past', 'falls', 'doe', 'leaf', 'give', 'notice', 'thinking', 'begin', 'collecting', 'fail', 'recognize', 'remember', 'land', 'convict', 'collecting', 'sentence', 'learn', 'go', 'arrive', 'bail', 'set', 'restart', 'need', 'represent', 'receive', 'remember', 'deny', 'organize', 'charm', 'convert', 'abandon', 'makeshift', 'complete', 'receive', 'collect', 'plan', 'keep', 'netting', 'decide', 'leave', 'inform', 'seeing', 'decide', 'muscle', 'reasoning', 'kidnap', 'move', 'learn', 'return', 'find', 'desert', 'go', 'clue', 'reveal', 'move', 'confront', 'manage', 'slip', 'track', 'voice', 'stewing', 'surprise', 'meet', 'escape', 'decide', 'recover', 'sneak', 'find', 'move', 'using', 'heighten', 'enter', 'make', 'avoid', 'sent', 'ensue', 'allow', 'escape', 'return', 'meet', 'appear', 'arrive', 'demand', 'leave', 'turn', 'hitting', 'revealing', 'occupy'

In [129]:
dictionary = corpora.Dictionary(list(verbs.values()))
corpus = [dictionary.doc2bow(text) for text in list(verbs.values())]

In [133]:
NUM_TOPICS = 20
ldamodel = gensim.models.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.017*"come" + 0.015*"begin" + 0.014*"become" + 0.012*"make" + 0.011*"learn" + 0.010*"realize" + 0.009*"know" + 0.009*"help" + 0.008*"try" + 0.007*"turn"')
(1, '0.026*"find" + 0.021*"take" + 0.018*"tell" + 0.018*"go" + 0.018*"try" + 0.014*"make" + 0.012*"want" + 0.012*"doe" + 0.011*"ask" + 0.009*"return"')
(2, '0.024*"take" + 0.023*"come" + 0.017*"tell" + 0.014*"want" + 0.011*"become" + 0.011*"marry" + 0.010*"give" + 0.010*"show" + 0.010*"ask" + 0.009*"make"')
(3, '0.027*"marry" + 0.021*"come" + 0.019*"find" + 0.017*"go" + 0.016*"falls" + 0.015*"meet" + 0.012*"take" + 0.011*"want" + 0.009*"offer" + 0.008*"get"')
(4, '0.013*"come" + 0.013*"play" + 0.011*"take" + 0.011*"turn" + 0.010*"attempt" + 0.010*"give" + 0.010*"lightning" + 0.009*"begin" + 0.008*"cause" + 0.008*"doe"')
(5, '0.010*"take" + 0.009*"kill" + 0.008*"decide" + 0.007*"include" + 0.007*"left" + 0.007*"fail" + 0.007*"launch" + 0.007*"find" + 0.007*"sending" + 0.007*"attack"')
(6, '0.026*"kill" + 0.020*"find" + 0.017*"ta

In [131]:
print(verbs[summaries[i][0]])
new_doc_bow = dictionary.doc2bow(verbs[summaries[i][0]])
print(ldamodel.get_document_topics(new_doc_bow))

['come', 'intend', 'convince', 'switch', 'employ', 'doe', 'demand', 'provide', 'make', 'decide', 'return', 'come', 'try', 'make', 'visit', 'past', 'falls', 'doe', 'leaf', 'give', 'notice', 'thinking', 'begin', 'collecting', 'fail', 'recognize', 'remember', 'land', 'convict', 'collecting', 'sentence', 'learn', 'go', 'arrive', 'bail', 'set', 'restart', 'need', 'represent', 'receive', 'remember', 'deny', 'organize', 'charm', 'convert', 'abandon', 'makeshift', 'complete', 'receive', 'collect', 'plan', 'keep', 'netting', 'decide', 'leave', 'inform', 'seeing', 'decide', 'muscle', 'reasoning', 'kidnap', 'move', 'learn', 'return', 'find', 'desert', 'go', 'clue', 'reveal', 'move', 'confront', 'manage', 'slip', 'track', 'voice', 'stewing', 'surprise', 'meet', 'escape', 'decide', 'recover', 'sneak', 'find', 'move', 'using', 'heighten', 'enter', 'make', 'avoid', 'sent', 'ensue', 'allow', 'escape', 'return', 'meet', 'appear', 'arrive', 'demand', 'leave', 'turn', 'hitting', 'revealing', 'occupy', 'h

# Some experiments

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_model_path = 'models/glove.6B.50d.txt'
word2vec_output_path = 'models/glove.6B.50d.model'

# Convert GloVe to Word2Vec format
glove2word2vec(glove_model_path, word2vec_output_path)


  glove2word2vec(glove_model_path, word2vec_output_path)


(400000, 50)

In [None]:
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
import numpy as np

# Assume 'word_list' is your list of words
word_list = ['shoot', 'stab', 'slice', 'run', 'jump', 'swim', 'kill', 'play']

# Load pre-trained Word2Vec model (example, you may use a more extensive model)
model = KeyedVectors.load_word2vec_format('models/glove.6B.50d.model', binary=False)

# Get word vectors
word_vectors = [model[word] for word in word_list]

# Apply k-means clustering
num_clusters = 2  # Adjust based on your needs
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(word_vectors)

# Get cluster assignments for each word
cluster_assignments = kmeans.labels_

# Print clusters
for word, cluster in zip(word_list, cluster_assignments):
    print(f"{word}: Cluster {cluster}")

shoot: Cluster 1
stab: Cluster 1
slice: Cluster 1
run: Cluster 0
jump: Cluster 0
swim: Cluster 0
kill: Cluster 1
play: Cluster 0


  super()._check_params_vs_input(X, default_n_init=10)
