In [None]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

import sys
# Make the parent directory importable from this notebook.
sys.path.append("../")


In [None]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Lazy reader: streams the captions CSV in chunks and yields one text per row
def read_argument_captions(path="data/captions_arguments.csv", chunksize=10):
    """Lazily yield one text document per row of the captions CSV.

    Each row's ``content`` and ``argument_labels`` cells are split on newlines
    and paired up positionally. A fragment is kept when its paired label is a
    non-empty string; kept fragments are joined, each followed by one space.

    Args:
        path: CSV file to read. Must have ``content`` and ``argument_labels``
            columns.
        chunksize: Number of CSV rows loaded into memory at a time.

    Yields:
        str: The concatenated kept fragments of one row. Rows whose content or
        labels cell is missing yield ``""`` so that output positions still
        line up with CSV rows.
    """
    for batch in pd.read_csv(path, chunksize=chunksize):
        for content, argument_labels in zip(batch["content"], batch["argument_labels"]):
            # A missing cell is read as NaN; str(NaN) is "nan", which would
            # otherwise leak a bogus "nan" token into the corpus.
            if pd.isna(content) or pd.isna(argument_labels):
                yield ""
                continue
            fragments = str(content).split("\n")
            labels = str(argument_labels).split("\n")
            # NOTE(review): any non-empty label (even "0" or "False") counts
            # as "keep" here -- confirm against how the labels are encoded.
            yield "".join(
                fragment + " "
                for fragment, label in zip(fragments, labels)
                if label
            )


In [None]:
# Training a bag-of-words vectorizer (CountVectorizer produces raw term counts, not tf-idf weights)
vectorizer = CountVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(read_argument_captions())
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# Restore a previously saved vectorizer and document-term matrix instead of
# retraining. The files are the ones written by the "Saving progress" cell.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open("models/vectorizer.pkl", "rb") as count_file:
    vectorizer = pickle.load(count_file)
with open("models/vectorizer_matrix.pkl", "rb") as matrix_file:
    matrix = pickle.load(matrix_file)
# Derive the vocabulary from the loaded vectorizer so that feature_names is
# defined (and matches this vectorizer) even when the training cell was skipped.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# Checkpoint the fitted vectorizer and its document-term matrix to disk
for target_path, payload in (
    ("models/vectorizer.pkl", vectorizer),
    ("models/vectorizer_matrix.pkl", matrix),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [None]:
# Training the LDA model
# `n_components` is the current name of the topic-count argument; the old
# `n_topics` keyword was deprecated and later removed from scikit-learn.
# A fixed random_state makes the fitted topics reproducible across runs.
lda_model = LatentDirichletAllocation(
    n_components=50,
    max_iter=500,
    verbose=3,
    n_jobs=-1,
    learning_method="online",
    random_state=0,
)
lda_model.fit(matrix)

In [None]:
# Checkpoint the fitted LDA model to disk
lda_path = "models/lda.50.pkl"
with open(lda_path, "wb") as sink:
    pickle.dump(lda_model, sink)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: Sequence mapping a term index to its display string.
        n_top_words: How many terms to list per topic.

    Output per topic is a ``Topic #<index>:`` header, the terms joined by
    `` | `` in descending weight order, then two blank lines; one extra blank
    line is printed after the last topic.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_number)
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked_term_ids = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked_term_ids]
        print(" | ".join(top_terms))
        print("\n")
    print()

In [None]:
# List the 50 highest-weighted vocabulary terms for each topic of the LDA model
print_top_words(lda_model, feature_names, 50)