In [1]:
import string
import re
import pandas as pd

import nltk
import pyLDAvis as pyLDAvis
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages this notebook relies on, in the same order as before.
for resource in ('wordnet', 'stopwords', 'punkt', 'omw-1.4'):
    nltk.download(resource)

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

from gensim.models import LdaMulticore, CoherenceModel, LdaModel

import gensim
from gensim import corpora

from tqdm import tqdm

import matplotlib.pyplot as plt

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Put pyLDAvis into notebook mode so that prepared visualisations can be shown
# as cell output later on.
pyLDAvis.enable_notebook()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Shared text-normalisation resources read by `preprocessor`.
lemmatizer = WordNetLemmatizer()

# `stopwords` is rebound here from the NLTK corpus reader to a plain set of
# English stop words. The reader is re-imported under a private alias so the
# cell stays re-runnable: without it, a second execution would call `.words`
# on the set created by the first run and raise AttributeError.
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words('english'))

In [3]:
# Patterns are compiled once instead of being rebuilt from an f-string on every call.
_MENTION_OR_HASHTAG_RE = re.compile(r"[@#][A-Za-z0-9]+")
# `re.escape` matters here: interpolated raw, the `\]` sequence inside
# string.punctuation is parsed as an escaped bracket, so a literal backslash was
# never part of the character class and survived the clean-up.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def preprocessor(row: str) -> str:
    """Normalise one tweet into a space-separated string of lemmas.

    Steps, in order:
      1. remove @mentions and #hashtags (marker followed by ASCII letters/digits),
      2. remove every character in ``string.punctuation``,
      3. lower-case the text,
      4. tokenise with ``word_tokenize`` and lemmatise each token,
      5. drop lemmas that are in the module-level ``stopwords`` set.

    Args:
        row: Raw tweet text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    row = _MENTION_OR_HASHTAG_RE.sub("", row)
    row = _PUNCTUATION_RE.sub("", row)
    row = row.lower()
    # Lemmatise each token exactly once; the filter is applied to the lemma,
    # not to the surface form.
    lemmas = (lemmatizer.lemmatize(word) for word in word_tokenize(row))
    return " ".join(lemma for lemma in lemmas if lemma not in stopwords)

In [4]:
# Read only the tweet text. `usecols` produces a fresh single-column frame, so the
# assignment below no longer writes into a column selection taken from a wider
# DataFrame (the pattern that can trigger pandas' SettingWithCopyWarning).
df = pd.read_csv('data/covid19_tweets.csv', usecols=['text'])

# Replace every raw tweet with its cleaned, lemmatised form.
df['text'] = df['text'].apply(preprocessor)

df

Unnamed: 0,text
0,smelled scent hand sanitizers today someone pa...
1,hey wouldnt made sense player pay respect a… h...
2,trump never claimed wa hoax claim effort to… h...
3,one gift ha give appreciation simple thing alw...
4,25 july medium bulletin novel … httpstcomn0eec...
...,...
179103,thanks nominating challenge nominate … httpstc...
179104,2020 year insanity lol httpstcoy48np0yzgn
179105,powerful painting juan lucena tribute grandpar...
179106,1200 student test positive major university ab...


In [5]:
# Split each cleaned tweet into its tokens. `str.split()` without a separator returns
# an empty list for an empty string, whereas `split(' ')` returned [''] and thereby
# fed an empty-string token into the dictionary for tweets with no surviving words.
docs = df['text'].str.split().tolist()

In [6]:
# Build the gensim vocabulary from the tokenised tweets, then turn every tweet
# into its bag-of-words representation against that vocabulary.
dictionary = corpora.Dictionary(documents=docs)
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Fit one LDA model per topic count (1..19) and record its c_v coherence score,
# in order, so the scores can be plotted against the number of topics.
coherence_values = []

for t in tqdm(range(1, 20)):
    model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=t,
        random_state=200,
        workers=4,
    )
    coherence_model = CoherenceModel(
        model=model,
        texts=docs,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_values += [coherence_model.get_coherence()]

 42%|████▏     | 8/19 [06:49<09:39, 52.70s/it]

In [None]:
# Plot coherence against the number of topics. The x values are derived from the
# number of scores actually collected (the sweep starts at one topic), so the plot
# still renders when the sweep was stopped early; a hard-coded range(1, 20) fails
# with a length mismatch in that case. Avoiding `_` as a loop variable also keeps
# IPython's "last output" name intact.
topic_counts = list(range(1, len(coherence_values) + 1))

fig, ax = plt.subplots()
ax.plot(topic_counts, coherence_values)
ax.set_xticks(topic_counts)
ax.set_xlabel('Number of topics')
ax.set_ylabel('Coherence (c_v)')
ax.set_title('LDA coherence by number of topics')
ax.grid()
plt.show()

In [None]:
# Final two-topic model used for the visualisation. It is seeded with the same
# `random_state` as the coherence sweep so that the fitted topics do not change
# arbitrarily between executions of this cell.
model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=2, random_state=200, workers=4)

In [None]:
# Prepare the pyLDAvis data structure for the fitted topic model.
vis = gensimvis.prepare(topic_model=model, corpus=corpus, dictionary=dictionary)

In [None]:
# Display the prepared topic-model visualisation as the cell output.
vis

In [None]:
# Display the preprocessed tweets frame as the cell output.
df

In [None]:
# Draw a fixed-size random subset of the preprocessed tweets. The draw is seeded so
# that the same 5000 rows are selected on every run; unseeded, each execution
# produced a different subset and therefore different downstream results.
smpl = df.sample(5000, random_state=200)
smpl

In [None]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset

In [None]:
# Use the first CUDA device when PyTorch reports one as available, otherwise the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [None]:
# Display which device was selected.
device

In [None]:
# Load the pretrained GPT-2 language model and display how many parameters it has.
base_model = GPT2LMHeadModel.from_pretrained('gpt2')
# `num_parameters` is a method: it has to be called, otherwise the cell only shows
# the bound-method repr instead of the parameter count.
base_model.num_parameters()

In [None]:
base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Marker strings placed around every training example, plus the padding token.
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

# Register the markers with the tokenizer and mirror their ids in the model config.
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)
config = AutoConfig.from_pretrained('gpt2',
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    output_hidden_states=False)
base_model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
# The tokenizer vocabulary may have grown, so the embedding matrix is resized to match.
base_model.resize_token_embeddings(len(base_tokenizer))

# Wrap each tweet in the BOS/EOS markers in a NEW frame. Writing back into `smpl`
# made this cell non-idempotent: every re-run wrapped the already-wrapped text again.
smpl_tagged = smpl.assign(text=bos + ' ' + smpl['text'] + ' ' + eos)
# Seeded 80/20 split so the train and validation sets are the same on every run.
df_train, df_val = train_test_split(smpl_tagged, train_size=0.8, random_state=200)

train_dataset = Dataset.from_pandas(df_train[['text']])
val_dataset = Dataset.from_pandas(df_val[['text']])


def tokenize_function(df, base_tokenizer=base_tokenizer):
    """Tokenise the 'text' entry of `df` with padding enabled.

    Args:
        df: Mapping-like object with a 'text' entry holding the text to encode.
        base_tokenizer: Tokenizer to call; defaults to the module-level
            `base_tokenizer` as it existed when this function was defined.

    Returns:
        Whatever `base_tokenizer` returns for the given text with `padding=True`.
    """
    texts = df['text']
    encoded = base_tokenizer(texts, padding=True)
    return encoded

In [None]:
def _tokenize_split(dataset):
    """Run `tokenize_function` over `dataset` in batches and drop the raw 'text' column."""
    return dataset.map(
        tokenize_function,
        batched=True,
        num_proc=5,
        remove_columns=['text'],
    )


# Both splits go through exactly the same tokenisation settings.
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_val_dataset = _tokenize_split(val_dataset)

In [None]:
from transformers import TrainingArguments, Trainer

# Directory used for checkpoints, logs and (later) the saved model and tokenizer.
model_path = './model_tw'

# Causal language modelling: `mlm=False` disables masked-LM style batching.
data_collator = DataCollatorForLanguageModeling(tokenizer=base_tokenizer, mlm=False)

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=model_path,
    logging_dir=model_path,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=200,
    weight_decay=0.01,
    prediction_loss_only=True,
    save_steps=10000,
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Persist the fine-tuned weights and the extended tokenizer side by side in
# `model_path` (which is also the trainer's configured output directory).
trainer.save_model(model_path)
base_tokenizer.save_pretrained(model_path)

In [None]:
# Evaluate the trainer on the evaluation dataset it was constructed with and
# display the returned metrics.
trainer.evaluate()

In [None]:
# Reload the fine-tuned tokenizer and model from the directory they were saved to.
tweets_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tweets_model = GPT2LMHeadModel.from_pretrained(model_path)

In [None]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples = 5, max_length = 100):
    """Sample several continuations of `input_text` from a language model.

    Args:
        model: Model exposing `.to(device)` and `.generate(...)`.
        tokenizer: Tokenizer exposing `.encode(..., return_tensors='pt')` and
            `.decode(..., skip_special_tokens=True)`.
        input_text: Prompt the generated text starts from.
        device: Device the prompt ids and the model are moved to before generating.
        n_samples: Number of sequences requested from `generate`.
        max_length: Value passed to `generate` as `max_length` (defaults to the
            previously hard-coded 100).

    Returns:
        A list with one decoded string per generated sequence (empty when the
        model yields no sequences).
    """
    text_ids = tokenizer.encode(input_text, return_tensors = 'pt')
    text_ids = text_ids.to(device)
    model = model.to(device)

    generated_text_samples = model.generate(
        text_ids,
        max_length= max_length,
        num_return_sequences= n_samples,
        no_repeat_ngram_size= 2,
        repetition_penalty= 1.5,
        top_p= 0.92,
        temperature= .85,
        do_sample= True,
        top_k= 125,
        early_stopping= True
    )

    # Decode every generated sequence before returning. Previously the `return`
    # was indented inside the loop, so the function exited after decoding only
    # the first sample (and returned None when nothing was generated).
    return [
        tokenizer.decode(sample, skip_special_tokens=True)
        for sample in generated_text_samples
    ]

In [None]:
# Prompt the fine-tuned model and show what it generated.
input_text = 'covid'
generated_tweet = generate_n_text_samples(
    model=tweets_model,
    tokenizer=tweets_tokenizer,
    input_text=input_text,
    device=device,
)

generated_tweet