# LDA Model Generation (Building Blocks of Main Function)

In [1]:
def load_data(file):
    """Opens the file at the given path as UTF-8 text, parses its contents
    as JSON, and returns the decoded Python object (not the file handle,
    which is closed before returning)."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def write_data(file, data):
    """Serializes data as JSON (indented by 4 spaces) into the file at the
    given path, opened as UTF-8 text in write mode so any existing content
    is replaced. Returns nothing."""
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)


In [2]:
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Taking in a list of article texts, lemmatizes the words and returns
    a new text list of all lemmatized text.

    Only tokens whose coarse part-of-speech tag (token.pos_) is in
    allowed_postags are kept; each kept token is replaced by its lemma and
    the lemmas of one text are joined with single spaces. The output list
    has one string per input text, in the same order.

    The default tag set is a tuple (not a list) so it cannot be mutated
    between calls, and it uses "ADV" for adverbs: the previous "AV" entry
    never matched any token, so adverbs were silently dropped.
    """
    texts = list(texts)
    if not texts:
        # Nothing to process: skip loading the spaCy model entirely.
        return []

    # The parser and NER components are disabled when loading the model.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # nlp.pipe processes the texts as a stream and yields one Doc per text,
    # in order.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]


In [3]:
def preprocess_article(input_text, stopwords_path="stopwords.txt"):
    """
    Taking in an article text, splits the text into word tokens, drops tokens
    made up only of digits, and removes all stopwords listed in a stopwords
    file. Returns the remaining words joined by single spaces.

    input_text: the article text to clean.
    stopwords_path: path of the stopwords file, one stopword per line
        (defaults to 'stopwords.txt' in the current working directory).

    Stopword matching is case-sensitive: a token is removed only if it equals
    a line of the stopwords file exactly (after stripping that line's
    leading/trailing whitespace). Punctuation is discarded by the word
    tokenization itself.

    Note that the stopwords file is read on every call.
    """
    # Use regular expression to split the text into words
    words = re.findall(r'\b\w+\b', input_text)

    # Load stopwords from the file into a set; read as UTF-8 explicitly so
    # the result does not depend on the platform's default encoding.
    with open(stopwords_path, encoding="utf-8") as stopwords_file:
        stoplist = {line.strip() for line in stopwords_file}

    return " ".join(
        word for word in words
        if not word.isdigit() and word not in stoplist
    )


In [4]:
def gen_words(texts):
    """
    Tokenizes every text in the given list and returns a list holding one
    token list per input text, in the same order.

    Each text is run through gensim's simple_preprocess with deacc=True
    (the flag gensim uses to strip accent marks from tokens).
    """
    tokenized = []
    for document in texts:
        tokenized.append(gensim.utils.simple_preprocess(document, deacc=True))
    return tokenized


In [5]:
def compute_coherence_values(dictionary, corpus, tokenizedData, limit, start=2, step=3):
    """
    Trains one LDA model for every topic count in range(start, limit, step)
    and scores each of them with the 'c_v' coherence measure.

    dictionary: the word dictionary, used as id2word for the models and as
        the dictionary for the coherence computation.
    corpus: the article corpus the models are trained on.
    tokenizedData: the tokenized texts the coherence is evaluated against.
    limit: upper bound of the topic counts (exclusive, as in range()).
    start: first topic count to try.
    step: increment between consecutive topic counts.

    Returns a pair (model_list, coherence_values): the trained models and
    their coherence scores, index-aligned with each other. Both lists are
    empty when the range of topic counts is empty.
    """
    trained_models = []
    scores = []

    for topic_count in range(start, limit, step):
        # Train with a fixed number of passes over the corpus.
        lda = gensim.models.ldamodel.LdaModel(
            corpus,
            num_topics=topic_count,
            id2word=dictionary,
            passes=10,
        )
        trained_models.append(lda)

        # Score the freshly trained model before moving to the next one.
        scorer = CoherenceModel(
            model=lda,
            dictionary=dictionary,
            texts=tokenizedData,
            coherence='c_v',
        )
        scores.append(scorer.get_coherence())

    return trained_models, scores


# LDA Model Generation (Main Function)

In [6]:
def create_lda_model(topic_limit, topic_start, topic_step, dataframe=None):
    """
    Takes in the settings topic_limit (max, exclusive), topic_start (min), and
    topic_step (increment), and optionally the dataframe holding the articles.
    This walks through the entire process of creating a list of LDA models and
    returns the model with the highest coherence, as well as generating a graph
    showing which number of topics had what coherence score.

    dataframe must provide a 'Text' column with one article text per row.
    When it is omitted (None), the module-level ``df`` is used, which keeps
    existing three-argument calls working unchanged.

    First we gather all of the article texts from the dataframe, lemmatize them
    and remove all stopwords. We tokenize each text and use the tokenized text
    to generate a word dictionary. We make the corpus for our articles, and
    then we generate our LDA models.

    It creates a list of LDA models and their corresponding coherence values
    based on the settings we input. It generates a visual of all models and
    their coherence scores based on the number of topics they had. Finally,
    returns the most coherent LDA model and the text corpus as a pair
    (lda_model, corpus).

    Raises ValueError if the settings describe an empty range of topic counts
    (or a zero step), since no model could be trained or selected.
    """
    # Fail fast on an empty topic range: otherwise all of the preprocessing
    # would run only for max() to fail on an empty list at the very end.
    topic_counts = range(topic_start, topic_limit, topic_step)
    if not topic_counts:
        raise ValueError(
            "No topic counts to evaluate for start={}, limit={}, step={}".format(
                topic_start, topic_limit, topic_step
            )
        )

    if dataframe is None:
        # Backward-compatible default: the notebook-level dataframe.
        dataframe = df

    # Get all article texts
    raw_data = dataframe['Text'].tolist()

    # Lemmatize the texts
    lemmatized_data = lemmatization(raw_data)

    # Removing Stop Words
    filtered_data = [preprocess_article(x) for x in lemmatized_data]

    # Tokenize the text
    tokenized_data = gen_words(filtered_data)

    # Create text dictionary
    id2word = corpora.Dictionary(tokenized_data)
    # NOTE(review): gensim documents no_below as an absolute number of
    # documents (an int), while no_above is a fraction. no_below=0.1 therefore
    # removes nothing at the low end -- confirm whether a fraction was meant.
    id2word.filter_extremes(no_below=0.1, no_above=0.9)

    # Create corpus
    corpus = [id2word.doc2bow(text) for text in tokenized_data]

    # Topic modeling using input values
    model_list, coherence_values = compute_coherence_values(dictionary=id2word,
                                                            corpus=corpus,
                                                            tokenizedData=tokenized_data,
                                                            limit=topic_limit,
                                                            start=topic_start,
                                                            step=topic_step)

    # Coherence score visualization. The label is attached to the line itself:
    # passing the bare string ("coherence_values") to legend() made matplotlib
    # iterate over its characters and label the line "c".
    plt.plot(topic_counts, coherence_values, label="coherence_values")
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

    # Find the model with max coherence (first one wins on ties)
    max_coherence_index = coherence_values.index(max(coherence_values))
    lda_model = model_list[max_coherence_index]

    return lda_model, corpus # This is our LDA model object that we will work with
