In [498]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from numpy.random import multinomial

import re

In [499]:
import matplotlib.pyplot as plt
%matplotlib inline

In [500]:
n = 2
startchar = "<"
endchar = ">"
print(f'Startchar: "{startchar}"')
print(f'Endchar: "{endchar}"')

Startchar: "<"
Endchar: ">"


In [501]:
def load_dataset(file: str, column_name: str):
    dataset = pd.read_csv(file)
    dataset = dataset[column_name].to_numpy()
    return dataset

def prep_data(rawdata: np.ndarray, n=4): 
    for i, sample in tqdm(enumerate(rawdata)):
        sample = ((startchar + " ")*n) + sample + ((" " + endchar)*n)
        rawdata[i] = sample

    print(rawdata[0])
    
    return rawdata

In [502]:
data = load_dataset(open("datasets/Emusk_2021_tweets.csv", errors='ignore'), "Text").astype(str)

In [503]:
data = prep_data(data, n=n)

2993it [00:00, 376813.80it/s]

< < @PPathole Dojo isn’t needed, but will make self-driving better. It isn’t enough to be safer than human drivers, Autopilot ultimately needs to be more than 10 times safer than human drivers. > >





In [504]:
def get_unique_ngrams(prepped_data: np.ndarray, n=4):
    unique_ngrams = set()
    
    for sample in tqdm(prepped_data):
        words = sample.split()
        for i in range(len(words) - (n-1)):
            ngram = " ".join(words[i:i+n])
            unique_ngrams.add(ngram)
    print(f"Number of unique {n}-grams:", len(unique_ngrams))
    return unique_ngrams

In [505]:
unique_ngrams = get_unique_ngrams(data, n=n)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2993/2993 [00:00<00:00, 163713.51it/s]

Number of unique 2-grams: 29596





In [506]:
def count_ngram_appearances(prepped_data: 
                            np.ndarray, 
                            unique_ngrams: set, 
                            n=4,
                            smoothing_factor=1):
    
    ngram_counts = defaultdict(dict)
    total_unique_ngram_count = len(unique_ngrams)
    total_word_count = len(unique_words)
    
    for sample in tqdm(prepped_data):
        words = sample.split(" ")
        wordcount = len(words)
        
        for i in range(wordcount - n):
            ngram = " ".join(words[i:i+n])
            next_word = words[i+n]
            
            if next_word in ngram_counts[ngram]:
                ngram_counts[ngram][next_word] += 1
            else:
                ngram_counts[ngram][next_word] = smoothing_factor # smoothing_factor may not be working correctly
                      
    return ngram_counts

In [507]:
ngram_counts = count_ngram_appearances(data, unique_ngrams, n=n, smoothing_factor=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2993/2993 [00:00<00:00, 117805.15it/s]


In [508]:
def calculate_ngram_probabilities(ngram_counts: defaultdict(dict)):
    # calculate probabilities
    # key = ngram
    # value = dict { nextword: probability }
    #print(sum(list(ngram_probabilities.items())[0][1].values()))
    ngram_probabilities = ngram_counts

    for ngram, next_words in tqdm(ngram_probabilities.items()):
        total_count = sum(next_words.values())
        for next_word in next_words:
            ngram_probabilities[ngram][next_word] /= total_count
       
    return ngram_probabilities              

In [509]:
ngram_probs = calculate_ngram_probabilities(ngram_counts)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29529/29529 [00:00<00:00, 2019890.13it/s]


In [510]:
def generate(ngram_probabilities: defaultdict(dict)):

    out = []
    for i in range(n): out.append("<")
    lastn = ' '.join(out[-n])

    while True:
        lastn = ' '.join(out[-n:])
        #print(lastn)
        probs = ngram_probs[lastn]
        idx = multinomial(1, [*probs.values()]).argmax()
        word = [*probs.keys()][idx]
        if word == endchar: break
        out.append([*probs.keys()][idx])
        
    print(' '.join(out[n:]))

In [529]:
gen = generate(ngram_probs)

SpaceX launching satellite Doge-1 to the moon at all costs


In [530]:
for d in data:
    if "SpaceX launching satellite" in d: print(d)

< < SpaceX launching satellite Doge-1 to the moon next year

– Mission paid for in Doge
– 1st crypto in space
– 1st meme in space

To the mooooonnn!!

https://t.co/xXfjGZVeUW > >
