In [None]:
import sqlite3

import gensim.corpora.dictionary
import gensim.models.callbacks
import gensim.models
import gensim.models.ldamodel
import gensim.parsing.preprocessing
import pandas
import textblob

%matplotlib inline

In [None]:
# Open the SQLite database file and load every row and column of the
# `predictions` table into a DataFrame.
conn = sqlite3.connect('./articles.db')

data_frame = pandas.read_sql(
    sql='''
        SELECT
            *
        FROM
            predictions
    ''',
    con=conn,
)

In [None]:
# Preview the first five rows (five is the default for head()).
data_frame.head()

# Topics

In [None]:
def fix_and_tokenize_text(text):
    """Tokenize raw text with gensim's ``preprocess_string``.

    Args:
        text: The raw text to preprocess. Anything that is not ``str`` or
            ``bytes`` (for example ``None`` or a float ``NaN`` standing in
            for a missing database value) is treated as an empty document.

    Returns:
        list: Whatever ``gensim.parsing.preprocessing.preprocess_string``
        returns for ``text``; an empty list when ``text`` is not textual.
    """
    # A missing value would make preprocess_string raise and abort the
    # whole ``Series.apply``; map it to an empty document instead.
    if not isinstance(text, (str, bytes)):
        return []
    return gensim.parsing.preprocessing.preprocess_string(text)

### Description LDA

In [None]:
# Tokenize every description, build a gensim Dictionary from the tokens,
# then encode each tokenized description with doc2bow.
descriptions = [fix_and_tokenize_text(raw) for raw in data_frame['description']]
description_dictionary = gensim.corpora.dictionary.Dictionary(documents=descriptions)
description_corpus = list(map(description_dictionary.doc2bow, descriptions))

In [None]:
# Fit a 10-topic LDA model on the description corpus. LDA training is
# stochastic, so the seed is pinned: without it every Restart & Run All
# yields different topics and different maxTopic values downstream.
lda_model_description = gensim.models.ldamodel.LdaModel(
    description_corpus,
    id2word=description_dictionary,
    num_topics=10,
    random_state=0,
)

In [None]:
# In-sample score: the model is evaluated on the same corpus it was trained on.
lda_model_description.log_perplexity(description_corpus)

### Title LDA

In [None]:
# Tokenize every title and build a gensim Dictionary from the title tokens.
titles = [fix_and_tokenize_text(raw) for raw in data_frame['title']]
title_dictionary = gensim.corpora.dictionary.Dictionary(documents=titles)
# NOTE(review): the titles are encoded with description_dictionary, not the
# title_dictionary built on the previous line (which no visible cell uses).
# Confirm which vocabulary is intended before changing it, because every
# consumer of title_corpus depends on these ids.
title_corpus = list(map(description_dictionary.doc2bow, titles))

In [None]:
# Fit a 10-topic LDA model on the title corpus. The seed is pinned because
# LDA training is stochastic and the perplexity reported below would
# otherwise change on every run.
lda_model_title = gensim.models.ldamodel.LdaModel(
    title_corpus,
    num_topics=10,
    random_state=0,
)

In [None]:
# In-sample score: the model is evaluated on the same corpus it was trained on.
lda_model_title.log_perplexity(title_corpus)

### Title Nouns LDA

In [None]:
def get_nouns(text):
    """Return the noun phrases of ``text`` joined into a single string.

    Args:
        text: The text to analyse. Anything that is not ``str`` or
            ``bytes`` (for example ``None`` for a missing title) is
            treated as empty text.

    Returns:
        str: The ``noun_phrases`` TextBlob extracts from ``text``, joined
        with single spaces; ``''`` when ``text`` is not textual.
    """
    # TextBlob rejects non-string input with a TypeError, which would
    # abort the whole ``Series.apply`` on a single missing title.
    if not isinstance(text, (str, bytes)):
        return ''
    blob = textblob.TextBlob(text)
    return ' '.join(blob.noun_phrases)

# Add a column holding, for each title, its noun phrases as one space-joined string.
data_frame['titleNouns'] = data_frame['title'].apply(get_nouns)

In [None]:
# Tokenize the noun-phrase strings, build a Dictionary from them, and
# encode those same noun documents with it. The corpus must be built from
# `nouns`: encoding `descriptions` with the noun vocabulary would train the
# "title nouns" model on description text filtered to noun tokens.
nouns = data_frame['titleNouns'].apply(fix_and_tokenize_text).tolist()
nouns_dictionary = gensim.corpora.dictionary.Dictionary(nouns)
nouns_corpus = [nouns_dictionary.doc2bow(text) for text in nouns]

In [None]:
# Fit a 20-topic LDA model on the title-noun corpus.
# - id2word lets the model report topics as words rather than raw ids,
#   matching how lda_model_description is constructed.
# - random_state pins the seed, since LDA training is stochastic.
lda_model_noun = gensim.models.ldamodel.LdaModel(
    nouns_corpus,
    id2word=nouns_dictionary,
    num_topics=20,
    random_state=0,
)

In [None]:
# In-sample score: the model is evaluated on the same corpus it was trained on.
lda_model_noun.log_perplexity(nouns_corpus)

### Title TF-IDF

In [None]:
# Fit a TF-IDF model over the title corpus.
tfidf_model_title = gensim.models.TfidfModel(corpus=title_corpus)

### Figure Topics

In [None]:
# Attach each article's description bag-of-words so the description LDA
# model can be applied row by row. This must be description_corpus: the
# column is the *description* BoW and is fed to lda_model_description, so
# storing the title corpus here would assign topics from title text.
data_frame['descriptionBow'] = description_corpus

In [None]:
# Run every row's bag-of-words through the description LDA model and keep
# the result; the next cell treats each entry as (topic, probability) pairs.
data_frame['topicProbs'] = data_frame['descriptionBow'].apply(lambda x: lda_model_description[x])

In [None]:
# For each article keep the pair whose second element (the probability) is
# largest; an article with no pairs at all gets the sentinel (-1, 0).
max_topics = data_frame['topicProbs'].apply(
    lambda probs: max(probs, key=lambda prob: prob[1], default=(-1, 0))
)

In [None]:
# First element of each winning pair: the topic id (-1 when there was no topic).
data_frame['maxTopic'] = max_topics.apply(lambda x: x[0])

In [None]:
# Second element of each winning pair: its probability (0 when there was no topic).
data_frame['maxTopicProb'] = max_topics.apply(lambda x: x[1])

# Sentiment

In [None]:
def get_avg_sentiment(text):
    """Return the mean sentence-level sentiment polarity of ``text``.

    Args:
        text: The text to score. Anything that is not ``str`` or ``bytes``
            (for example ``None`` for a missing description) is treated as
            empty text.

    Returns:
        The average of ``sentence.sentiment.polarity`` over the sentences
        TextBlob finds in ``text``, or ``0`` when there are no sentences or
        ``text`` is not textual.
    """
    # TextBlob rejects non-string input with a TypeError, which would
    # abort the whole ``Series.apply`` on a single missing value.
    if not isinstance(text, (str, bytes)):
        return 0
    sentences = textblob.TextBlob(text).sentences
    # Guard the division below against text with no sentences.
    if not sentences:
        return 0
    return sum(sentence.sentiment.polarity for sentence in sentences) / len(sentences)

### Description

In [None]:
# Average sentence polarity of each description (0 when it has no sentences).
data_frame['descSentiment'] = data_frame['description'].apply(get_avg_sentiment)

In [None]:
# Histogram of the description sentiment scores.
data_frame['descSentiment'].hist()

In [None]:
# Spread (standard deviation) of the description sentiment scores.
data_frame['descSentiment'].std()

### Title

In [None]:
# Average sentence polarity of each title (0 when it has no sentences).
data_frame['titleSentiment'] = data_frame['title'].apply(get_avg_sentiment)

In [None]:
# Histogram of the title sentiment scores.
data_frame['titleSentiment'].hist()

In [None]:
# Spread (standard deviation) of the title sentiment scores.
data_frame['titleSentiment'].std()

# Save Out

In [None]:
# List the columns now available before choosing which ones to export.
data_frame.keys()

In [None]:
# Start from an empty frame; the export columns are copied in by the next cell.
data_frame_out = pandas.DataFrame()

In [None]:
# Copy only the columns that get persisted, in this order.
# NOTE(review): descSentiment is computed above but not exported here —
# confirm that is intentional.
for column_name in (
    'title',
    'description',
    'actualSource',
    'maxTopic',
    'maxTopicProb',
    'titleSentiment',
):
    data_frame_out[column_name] = data_frame[column_name]

In [None]:
# Persist the derived columns to the `topics_and_sentiment` table.
# if_exists='replace' keeps this cell re-runnable: with the default
# ('fail') a second Restart & Run All raises because the table written by
# the previous run already exists. The table is fully regenerated by this
# notebook, so overwriting it loses nothing.
data_frame_out.to_sql('topics_and_sentiment', conn, if_exists='replace')

In [None]:
# Commit the open transaction on the SQLite connection after the table write.
conn.commit()

# See Words

In [None]:
# Print the 40 highest-weighted terms of description topic 0.
# The ids returned by lda_model_description index description_dictionary
# (the model's id2word); looking them up in nouns_dictionary would print
# unrelated words or raise KeyError for ids that vocabulary lacks.
for (word_num, prob) in lda_model_description.get_topic_terms(0, topn=40):
    print(description_dictionary[word_num], prob)