In [1]:
# Prepare the MPST corpus to convert the plot synopses into sequences

In [2]:
import os
import json
import torch
import operator
import numpy as np

In [19]:
DATA_PATH = '../../data/MPST/'
PROCESSED_DATA_PATH = '../../processed_data/new/'

In [4]:
os.listdir(DATA_PATH)

['test_ids.txt',
 'train_ids.txt',
 'tag_assignment_data',
 'LICENSE',
 'README.md',
 'id_title_tags.tsv',
 'final_plots_wiki_imdb_combined',
 'partition.json']

In [5]:
train_ids = open(DATA_PATH + 'train_ids.txt').read().split('\n')[:-1]
test_ids = open(DATA_PATH + 'test_ids.txt').read().split('\n')[:-1]

print('Train %d + Test %d = Total %d' % (len(train_ids), len(test_ids), (len(train_ids) + len(test_ids))))

Train 11862 + Test 2966 = Total 14828


In [6]:
all_imdb_list = train_ids + test_ids
len(all_imdb_list)

14828

# Process Tags

In [20]:
tag_list = open( DATA_PATH + 'tag_assignment_data/tag_list.txt').read().split('\n')[:-1]
print(len(tag_list))

71


In [21]:
index_to_tag = {i: tag for i, tag in enumerate(tag_list)}
tag_to_index = {tag: i for i, tag in enumerate(tag_list)}

In [22]:
json.dump(index_to_tag, open(PROCESSED_DATA_PATH + 'index_to_tag.json', 'w'))
json.dump(tag_to_index, open(PROCESSED_DATA_PATH + 'tag_to_index.json', 'w'))

In [23]:
movie_to_tags = json.load( open(DATA_PATH + 'tag_assignment_data/movie_to_label_name.json') )
print(movie_to_tags[train_ids[0]])

['cult', 'humor', 'murder', 'suspenseful']


In [26]:
imdb_to_binary_tag_dict = {}

# Train
for imdb_id in all_imdb_list:
    tags = movie_to_tags[imdb_id]
    binary_vector = [0] * len(tag_list)
    for t in tags:
        binary_vector[tag_to_index[t]] = 1
    imdb_to_binary_tag_dict[imdb_id] = binary_vector


In [27]:
json.dump(imdb_to_binary_tag_dict, open(PROCESSED_DATA_PATH + 'vectors/labels_binary_dict.json', 'w'))

# Process Plot Synopses
## Add stopword removal or other preprocessing later

In [28]:
all_texts_cleaned = {}

In [29]:
words = {}

for imdb_id in train_ids:
    plot_words = open(DATA_PATH + '/final_plots_wiki_imdb_combined/cleaned/{}.txt'.format(imdb_id)).read().lower().split()
    all_texts_cleaned[imdb_id] = plot_words
    for w in plot_words:
        if w not in words:
            words[w] = 0
        words[w] += 1

In [30]:
words_sorted_by_frequency = sorted(words.items(), key = operator.itemgetter(1), reverse=True)

In [31]:
len(words_sorted_by_frequency)

101221

In [32]:
print(len([v for k, v in words_sorted_by_frequency if v>10]))
#print([(k, v) for k, v in words_sorted_by_frequency if v<=100])

31517


In [164]:
VOCAB_5K_FREQ = words_sorted_by_frequency[:5000]
VOCAB_5K = [w for w, f in VOCAB_5K_FREQ]
word2idx = {w: i+1 for i, w in enumerate(VOCAB_5K)}
idx2word = {i+1: w for i, w in enumerate(VOCAB_5K)}
json.dump(VOCAB_5K, open(PROCESSED_DATA_PATH + 'vocab_5k_no_process.json', 'w'))
json.dump(word2idx, open(PROCESSED_DATA_PATH + 'word2idx_no_process.json', 'w'))
json.dump(idx2word, open(PROCESSED_DATA_PATH + 'idx2word_no_process.json', 'w'))

In [53]:
VOCAB_NEW_FREQ = words_sorted_by_frequency[:31517]
VOCAB_31k = [w for w, f in VOCAB_NEW_FREQ]
word2idx = {w: i+1 for i, w in enumerate(VOCAB_31k)}
idx2word = {i+1: w for i, w in enumerate(VOCAB_31k)}
json.dump(VOCAB_31k, open(PROCESSED_DATA_PATH + 'vocab_31k_no_process.json', 'w'))
json.dump(word2idx, open(PROCESSED_DATA_PATH + 'word2idx_no_process.json', 'w'))
json.dump(idx2word, open(PROCESSED_DATA_PATH + 'idx2word_no_process.json', 'w'))

## Convert Text to Sequence

In [54]:
# Load clean texts into dictionary
clean_text_dict = {}
for imdb_id in all_imdb_list:
    clean_text_dict[imdb_id] = open(DATA_PATH + '/final_plots_wiki_imdb_combined/cleaned/{}.txt'.format(imdb_id)).read().lower().split()

In [55]:
def text_to_sequence(word2idx, text):
    sequence = [word2idx[w] for w in plot_words if w in word2idx]
    return sequence

In [56]:
def get_sequence_len_stat(sequences):
    lens = [len(s) for s in sequences]
    print('Total %d, Min %d, Max %d, Mean %d, Median %d' %(len(lens), min(lens), max(lens), np.mean(lens), np.median(lens)))

In [63]:
all_sequences = {}
long_count = 0
for imdb_id, plot_words in all_texts_cleaned.items():
    sequence = text_to_sequence(word2idx, plot_words)
    all_sequences[imdb_id] = sequence
    if len(sequence) > 3000:
        long_count += 1
    
get_sequence_len_stat(list(all_sequences.values()))

json.dump(all_sequences, open( PROCESSED_DATA_PATH + 'all_sequence_dict.json', 'w') )
print('Dumped')

Total 11862, Min 47, Max 11668, Mean 876, Median 644
Dumped


In [64]:
long_count

361

# Pad sequences

The lists of sequences dumped have the same order of the ids we had in the train and test id files

In [47]:
def pad_one_sequence(seq,  max_len=1500, direction='left', value=0):
    seq_len = len(seq)
    pad_len = abs(max_len - seq_len)
    padded_sequence = [0] * max_len
    
    if seq_len < max_len:
        padding = [value] * (pad_len)
        if direction == 'left':
            padded_seq = padding + seq
        elif direction == 'right':
            padded_seq = seq + padding
        else:
            raise ValueError("Check the value of arguement 'direction'")
                
    elif seq_len > max_len:
        # Truncate
        if direction == 'left':
            padded_seq = seq[-max_len : ]
        elif direction == 'right':
            padded_seq = seq[: max_len]
        else:
            raise ValueError("Check the value of arguement 'direction'")
    
    return padded_sequence

In [65]:
padded_seq_dict = {}
max_len = 3000
direction = 'left'
value = 0

for imdb_id, sequence in all_sequences.items():
    padded_seq_dict[imdb_id] = pad_one_sequence(sequence, max_len, direction, value)

print(len(padded_seq_dict), len(padded_seq_dict[imdb_id]))
json.dump(padded_seq_dict, open(PROCESSED_DATA_PATH + 'vectors/padded_word_sequences_3000.json', 'w'))

11862 3000


# Compute and Dump Emotion Flow Data

In [129]:
import joblib

# NRC emotion lexicons as dictionary format. Key: Emotion type, Value: List of words
emotion_lexicons = joblib.load(PROCESSED_DATA_PATH + 'emotion_lexicons_dict.pkl')
emotion_types = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                     'negative', 'positive', 'sadness', 'surprise', 'trust']

# Get shortest and longest words length in the lexicon list
minl= 1000
maxl = -1
imdb_ids = []
segment_scores_list = []
plot_lengths = []

for k, v in emotion_lexicons.items():
    for word in v:
        if len(word) < minl:
            minl = len(word)
        if len(word) > maxl:
            maxl = len(word)

print(minl, maxl)

2 17


In [133]:
def get_chunks_of_emotional_scores(content, N=20):
    # TODO: STEMMING BASED
    segment_scores =[[0 for j in range(len(emotion_types))] for i in range(N)]

    # content = content.lower().split()
    segment_length = round(len(content) / N)

    for i in range(N):
        start_index = i * segment_length
        end_index   = (i+1) * segment_length

        for token in content[start_index:end_index]:
            if len(token)>= minl and len(token) <= maxl:
                for emo_idx in range(len(emotion_types)):
                    if token in  emotion_lexicons[emotion_types[emo_idx]]:
                        segment_scores[i][emo_idx] += 1

        total = sum(segment_scores[i])
        if total > 0:
            for j in range(len(segment_scores[i])):
                segment_scores[i][j] /= total
                segment_scores[i][j] *= 100

    return segment_scores

In [144]:
emotion_score_vectors_dict = {}

for imdb_id, clean_content in clean_text_dict.items():
    emotion_score_vectors_dict[imdb_id] = get_chunks_of_emotional_scores(clean_content, 20)

print(len(emotion_score_vectors_dict), len(emotion_score_vectors_dict[imdb_id]) )

14828 20


In [147]:
json.dump(emotion_score_vectors_dict, open(PROCESSED_DATA_PATH + 'vectors/emotion_score_dict_20_chunks.json', 'w'))

# Compute and Dump Class Weights

In [85]:
def list_tag_weights():
    tag_to_n_movies = {}
    
    for imdb_id in train_ids:
        tags = movie_to_tags[imdb_id]
        for tag in tags:
            tag_to_n_movies[tag] = tag_to_n_movies.get(tag, 0) + 1
            
    weights = [0] * len(tag_list)
    
    for tag, count in tag_to_n_movies.items():
        # weights[tag_to_index[tag]] = (len(train_ids) - count) / len(train_ids)
        # weights[tag_to_index[tag]] = len(train_ids)/ (count * len(train_ids))
        weights[tag_to_index[tag]] = len(train_ids)/ (71 * count)
    
    print(weights)
    json.dump(weights, open(PROCESSED_DATA_PATH + 'class_weights_sk.json', 'w'))

In [188]:
list_tag_weights()

[0.0002161227577263886, 0.0002844950213371266, 0.00042716787697565144, 0.00043122035360068997, 0.00047709923664122136, 0.0005099439061703213, 0.0006657789613848203, 0.0006706908115358819, 0.0011655011655011655, 0.00145985401459854, 0.0015384615384615385, 0.0015552099533437014, 0.001694915254237288, 0.0016666666666666668, 0.001869158878504673, 0.001984126984126984, 0.002, 0.0020920502092050207, 0.0022172949002217295, 0.002398081534772182, 0.0024390243902439024, 0.0024875621890547263, 0.0025380710659898475, 0.0027472527472527475, 0.002898550724637681, 0.002793296089385475, 0.00303951367781155, 0.003125, 0.0031545741324921135, 0.003355704697986577, 0.00411522633744856, 0.004464285714285714, 0.004807692307692308, 0.004672897196261682, 0.004901960784313725, 0.005494505494505495, 0.005208333333333333, 0.005714285714285714, 0.005847953216374269, 0.0058823529411764705, 0.006134969325153374, 0.006211180124223602, 0.005988023952095809, 0.00641025641025641, 0.006622516556291391, 0.0078125, 0.0074

In [86]:
list_tag_weights()

[0.03610772045282284, 0.04753070342395769, 0.07136711769979123, 0.07204416668185049, 0.07970917105687561, 0.0851965438731317, 0.11123197239361603, 0.11205259727378354, 0.19472077218556091, 0.24389842705870257, 0.2570314192849404, 0.2598295840360984, 0.28317020768679874, 0.2784507042253521, 0.3122811636172173, 0.33148893360160964, 0.3341408450704225, 0.34951971241676, 0.37044439586521344, 0.4006484952882764, 0.4074888354517348, 0.4155980660079882, 0.4240366054193179, 0.45898467729453646, 0.4842620943049602, 0.46667715791958453, 0.5078128344535296, 0.5220950704225352, 0.527036033234105, 0.560639001796011, 0.6875326030255607, 0.7458501006036218, 0.8032231852654388, 0.7807029090430433, 0.8189726594863297, 0.9179693545890729, 0.8701584507042254, 0.9546881287726359, 0.9770200148257969, 0.9827671913835957, 1.0249719173939342, 1.0377044877963433, 1.0004216918276123, 1.0709642470205851, 1.106426639306035, 1.305237676056338, 1.2375586854460094, 1.3473421172194457, 1.33656338028169, 1.41585103843

# Get Pretrained Embeddings

In [68]:
import gensim
def get_pretrained_embeddings(vocabulary_path, pre_trained_vector_path, dump_path, vector_dim=300, type='fasttext'):
    # Load vocabulary
    words = json.load(open(vocabulary_path, 'r'))
    vocab_size = len(words) + 1
    print('vocabulary size', vocab_size)

    # Load word embeddings and create embedding matrix
    print('Loading Embeddings')

    if type == 'fasttext':
        model= gensim.models.wrappers.FastText.load_fasttext_format(pre_trained_vector_path)
    elif type == 'word2vec':
        model = gensim.models.KeyedVectors.load_word2vec_format(pre_trained_vector_path, binary=True)

    print('Loaded')
    print(model['try'])

    embedding_matrix = np.zeros((vocab_size, vector_dim))
    counter = 0

    for i, word in words.items():
        embedding_vector = np.zeros(vector_dim)
        if word in model:
            print(i, counter)
            embedding_matrix[int(i)] = model[word]
            counter += 1
        else:
            embedding_matrix[int(i)] = embedding_vector

    json.dump(embedding_matrix.tolist(), open(dump_path, 'w'))

    print(len(embedding_matrix))
    print(embedding_matrix.shape)
    print('Coverage', (counter/vocab_size))

  from ._conv import register_converters as _register_converters
Using Theano backend.


In [72]:
emb_vocab_path = PROCESSED_DATA_PATH + '/idx2word_no_process.json'
emb_vocab_path = '/home/sk/SK/Works/NarrativeAnalysis/experiments/classification/features/sequence/vocabulary_index_to_word.json'
# Replace this path
pre_trained_vector_path = '/home/sk/SK/Works/resources/fasttext/wiki.en/wiki.en'
dump_path = PROCESSED_DATA_PATH + 'fasttext_vectors_5k_paper_model.json'
dump_path = '/home/sk/SK/Works/NarrativeAnalysis/experiments/classification/features/sequence/fasttext_vectors_5k_paper_model.json'

get_pretrained_embeddings(emb_vocab_path, pre_trained_vector_path, dump_path)

vocabulary size 13994
Loading Embeddings
Loaded
[-0.06831407  0.30915412  0.12180331 -0.22247837 -0.47748762 -0.12033118
 -0.19960941 -0.10533361 -0.2503069   0.19360551  0.18619625  0.1456981
 -0.281463   -0.00504994  0.4646648   0.22578108  0.19691055  0.23636696
  0.03521357  0.06550668 -0.18728039  0.12686458  0.0613632  -0.21359481
 -0.52767116 -0.30925015 -0.22214665 -0.14821781 -0.44144124  0.29986697
 -0.01606236  0.11623905 -0.28910562  0.2665176  -0.04483953 -0.2929797
  0.11571283  0.09822313  0.1814719  -0.22369501  0.00893341 -0.16172571
 -0.03157985  0.15298676  0.01418889  0.43996707 -0.01828363  0.02063813
 -0.36280861 -0.254769    0.02717102 -0.15333425  0.03007952 -0.1372713
 -0.24108402  0.29698482 -0.00568777 -0.11870692 -0.0259715   0.1788373
 -0.03861186 -0.22213067  0.30692723  0.16983613 -0.08305422 -0.01824462
  0.00250198  0.38897112 -0.22285722  0.05966283  0.03053282  0.08914272
 -0.09346877 -0.34166712  0.05875046  0.07086216  0.33161137  0.38900018
  0.102

In [74]:
ft_model= gensim.models.wrappers.FastText.load_fasttext_format(pre_trained_vector_path)

In [80]:
v = ft_model['vutum']
max(v), min(v), sum(v)

(0.895751081449403, -1.17576014385982, 1.4546058026012736)

In [81]:
v = ft_model['apple']
max(v), min(v), sum(v)

(0.98430365, -0.6563826, 7.806553833535872)

In [83]:
del ft_model

In [84]:
ft_model['ty']

NameError: name 'ft_model' is not defined