In [303]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from numpy.random import multinomial

import re

In [304]:
import matplotlib.pyplot as plt
%matplotlib inline

In [305]:
n = 3
startchar = "<"
endchar = ">"
print(f'Startchar: "{startchar}"')
print(f'Endchar: "{endchar}"')

Startchar: "<"
Endchar: ">"


In [306]:
def load_dataset(file: str, column_name: str):
    dataset = pd.read_csv(file)
    dataset = dataset[column_name].to_numpy()
    return dataset

def prep_data(rawdata: np.ndarray, n=4): 
    for i, sample in tqdm(enumerate(rawdata)):
        sample = ((startchar + " ")*n) + sample + ((" " + endchar)*n)
        rawdata[i] = sample

    print(rawdata[0])
    
    return rawdata

In [307]:
data = load_dataset("datasets/wiki_movie_plots_deduped.csv", "Plot")
data = prep_data(data, n=n)

34886it [00:00, 1322485.94it/s]

< < < A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1] > > >





In [308]:
def get_unique_ngrams(prepped_data: np.ndarray, n=4):
    unique_ngrams = set()
    
    for sample in tqdm(prepped_data):
        words = sample.split()
        for i in range(len(words) - (n-1)):
            ngram = " ".join(words[i:i+n])
            unique_ngrams.add(ngram)
    print(f"Number of unique {n}-grams:", len(unique_ngrams))
    return unique_ngrams

In [309]:
unique_ngrams = get_unique_ngrams(data, n=n)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34886/34886 [00:04<00:00, 7305.77it/s]


Number of unique 3-grams: 8420645


In [310]:
def count_ngram_appearances(prepped_data: 
                            np.ndarray, 
                            unique_ngrams: set, 
                            n=4,
                            smoothing_factor=1):
    
    ngram_counts = defaultdict(dict)
    total_unique_ngram_count = len(unique_ngrams)
    total_word_count = len(unique_words)
    
    for sample in tqdm(prepped_data):
        words = sample.split(" ")
        wordcount = len(words)
        
        for i in range(wordcount - n):
            ngram = " ".join(words[i:i+n])
            next_word = words[i+n]
            
            if next_word in ngram_counts[ngram]:
                ngram_counts[ngram][next_word] += 1
            else:
                ngram_counts[ngram][next_word] = smoothing_factor
                      
    return ngram_counts


In [311]:
ngram_counts = count_ngram_appearances(data, unique_ngrams, n=n)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34886/34886 [00:09<00:00, 3661.46it/s]


In [312]:
def calculate_ngram_probabilities(ngram_counts: defaultdict(dict)):
    # calculate probabilities
    # key = ngram
    # value = dict { nextword: probability }
    #print(sum(list(ngram_probabilities.items())[0][1].values()))
    ngram_probabilities = ngram_counts

    for ngram, next_words in tqdm(ngram_probabilities.items()):
        total_count = sum(next_words.values())
        for next_word in next_words:
            ngram_probabilities[ngram][next_word] /= total_count
       
    return ngram_probabilities              

In [313]:
ngram_probs = calculate_ngram_probabilities(ngram_counts)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8414363/8414363 [00:04<00:00, 1724974.73it/s]


In [317]:
def generate(ngram_probabilities: defaultdict(dict)):

    out = []
    for i in range(n): out.append("<")
    lastn = ' '.join(out[-n])

    while True:
        lastn = ' '.join(out[-n:])
        #print(lastn)
        probs = ngram_probs[lastn]
        idx = multinomial(1, [*probs.values()]).argmax()
        word = [*probs.keys()][idx]
        if word == endchar: break
        out.append([*probs.keys()][idx])
        
    print(' '.join(out[n:]))

In [322]:
generate(ngram_probs)

Because the film has been shown in the opening shot, followed by their first kiss. Later, Danielle catches Rodmilla and Marguerite are sent to intercept the fireship and promising pardons to all who doubted the couple as Marco's parents. The hope was for them to turn Pettigrew over to the skeleton, and with a carpet sweeper. He bumps into an attractive backpacker, who introduces herself as Lissa. She is soon dead.


In [324]:
for i in data:
    if "Because the film has" in i:
        print(i)

< < < Because the film has been lost, the following summary is reconstructed from a description in a contemporary film magazine.
Cleopatra (Bara), the Siren of Egypt, by a clever ruse reaches Caesar (Leiber) and he falls victim to her charms. They plan to rule the world together, but then Caesar falls. Cleopatra's life is desired by the church, as the wanton woman's rule has become intolerable. Pharon (Roscoe), a high priest, is given a sacred dagger to take her life. He gives her his love instead and, when she is in need of some money, leads her to the tomb of his ancestors, where she tears the treasure from the breast of the mummy. With this wealth she goes to Rome to meet Antony (Hall). He leaves the affairs of state and travels to Alexandria with her, where they revel. Antony is recalled to Rome and married to Octavia (Blinn), but his soul cries out for Cleopatra. He sends her a message to arm her ships and meet him at Actium, where they battle the opposing forces. They are overpow