![AOY Logo](https://raw.githubusercontent.com/BrockDSL/AOYTK/main/AOY_Logo.png) 

# All Our Yesterdays

A [toolkit](https://brockdsl.github.io/AOYTK) to explore web archives.


## Topic Modelling

This notebook will open a derivative CSV file and go through the basic setup and execution of an LDA topic model using Gensim.

In [None]:
print("Setting up AOYTK please wait...")
!wget "https://raw.githubusercontent.com/BrockDSL/AOYTK/tim_branch/aoytk.py"

#LDA Specific
!pip -q install pyLDAvis
!pip -q install spacy
!python -m spacy download en_core_web_md
import aoytk
import spacy
import nltk
import gensim
import pandas as pd
import pyLDAvis
import warnings
import pickle
import os
import pyLDAvis.gensim_models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from google.colab import drive
from IPython.display import clear_output

# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Mount Google Drive at /content/drive/ (Colab only).
drive.mount("/content/drive/")
# Wipe the install/download/mount output printed so far by this cell.
clear_output()
print("Ready to proceed.")


In [None]:
#These two lines are for dev
# (they re-execute the already-imported aoytk module, so a freshly
#  downloaded or edited aoytk.py takes effect without restarting the runtime)
import importlib
importlib.reload(aoytk)

#Set up pieces for AOYTK object
# NOTE(review): load_data() presumably fills atk.data and
# set_LDA_model_topics() presumably sets atk.number_LDA_Topics, since both
# attributes are read by later cells -- confirm against aoytk.py.
atk = aoytk.Analyzer()
atk.load_data()
atk.set_LDA_model_topics()

In [None]:
#Run this cell if you want to see the whole derivative
# (a bare expression on the last line of a cell is shown as the cell output)
atk.data


## Prepare Text


In [None]:
#What parts of speech to keep?
# Tokens whose spaCy coarse part-of-speech tag (token.pos_) is in this list
# survive lemmatization; comment a tag out to drop it, uncomment to keep it.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python ("CONJ" "CCONJ" becomes "CONJCCONJ"),
# which would make both tags impossible to match.
ALLOWED_POSTAGS = [
    "ADJ",
    "ADP",
    "ADV",
    "AUX",
    "CONJ",
    "CCONJ",
    "DET",
    "INTJ",
    "NOUN",
#    "PART",
    "PRON",
#    "PROPN",
    "SCONJ",
    "VERB",
]

#Project-specific stopwords go in this list (one string per entry)
EXTRA_STOPWORDS = [
]


#Stop words: NLTK's English list with the extras above added at the end
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(EXTRA_STOPWORDS)

#spaCy English model, loaded with the parser and ner components disabled
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

def sent_to_words(sentences):
    """Lazily tokenize every item of *sentences*.

    Each item is converted with ``str()`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``. One token list is yielded
    per input item, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

def remove_stopwords(texts):
    """Return one stopword-free token list per document in *texts*.

    Every document is passed through ``str()`` and tokenized again with
    ``simple_preprocess`` before filtering; for a document that is already
    a list of tokens this tokenizes the list's string representation.
    Words found in the module-level ``stop_words`` are dropped.
    """
    cleaned_docs = []
    for doc in texts:
        candidate_words = simple_preprocess(str(doc))
        kept_words = [w for w in candidate_words if w not in stop_words]
        cleaned_docs.append(kept_words)
    return cleaned_docs

def lemmatization(texts, allowed_postags):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: container of part-of-speech tags; a token is kept
            only when its ``pos_`` is in this container.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per document.
    """
    def lemmas_for(tokens):
        # The module-level spaCy pipeline works on text, so the tokens are
        # joined back into a single space-separated string first.
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_for(tokens) for tokens in texts]

#Keep rows that have content and whose language column is 'en',
#then pull the content column out as a plain Python list
data = atk.data.dropna(subset=['content'])
data = data.loc[data.language == 'en']
data = data.content.values.tolist()

#Tokenize -> remove stopwords -> lemmatize with the allowed parts of speech
data_words = list(sent_to_words(data))
data_words_nostops = remove_stopwords(data_words)
data_lemmatized = lemmatization(
    data_words_nostops,
    allowed_postags=ALLOWED_POSTAGS,
)

#Dictionary built from the lemmatized documents, and the bag-of-words corpus
texts = data_lemmatized
id2word = corpora.Dictionary(data_lemmatized)
corpus = list(map(id2word.doc2bow, texts))

print("\nDone Prepping text!")


## Build Model

In [None]:
#Model parameters
#(the "default" notes are the values the original author recorded as the
# library defaults)

# change if you want to set to arbitrary value
NUM_TOPICS = atk.number_LDA_Topics

ALPHA = 'auto'              # default 'symmetric'
CHUNKSIZE = 100             # default 2000
COHERENCE_METHOD = 'c_v'    # not referenced by the model call below
ITERATIONS = 200            # default 50
PASSES = 10                 # default 1
PER_WORD_TOPICS = False     # default False
RANDOM_STATE = 100
UPDATE_EVERY = 1            # default 1
TOPICS_TO_SHOW = 15         # not referenced by the model call below

#Generate Model and get basic dynamics of it
print("Building model... This may take serveral minutes")
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_STATE,
    update_every=UPDATE_EVERY,
    chunksize=CHUNKSIZE,
    passes=PASSES,
    alpha=ALPHA,
    iterations=ITERATIONS,
    per_word_topics=PER_WORD_TOPICS,
)

#Switch pyLDAvis to notebook output before preparing the visualization
pyLDAvis.enable_notebook()
#Projection parameter
# (passed as mds= below; the alternatives noted here are)
# tsne
# mmds
# pcoa
vis = pyLDAvis.gensim_models.prepare(lda_model,corpus,id2word,mds="mmds")
#Bare expression so the notebook displays the prepared visualization
vis


## Save Model

In [None]:
#model files will be prefixed with this
model_name = "niagara_sample"

#Everything is written to a "models" folder under aoytk.path, so that is the
#folder that has to exist (not a "models" folder in the current working
#directory). exist_ok=True makes re-running the cell harmless while still
#surfacing real failures such as a permission error.
models_dir = aoytk.path + "models/"
os.makedirs(models_dir, exist_ok=True)

#One pickle per artifact: lemmatized texts, bag-of-words corpus, dictionary
artifacts = (
    ("_text.pkl", texts),
    ("_corpus.pkl", corpus),
    ("_id2word.pkl", id2word),
)
for suffix, artifact in artifacts:
    #"with" closes (and flushes) each file even if pickling fails part-way
    with open(models_dir + model_name + suffix, "wb") as out_file:
        pickle.dump(artifact, out_file)

print("Models successfully save to: "+aoytk.path+"models/"+model_name+"_*.*")