In [None]:
import sqlite3

In [None]:
#@title Setup (click the "run" button to the left) {display-mode: "form"}

## Setup ##

# imports

# built-in Python libraries
# -------------------------
import collections
import re
import string
import warnings
warnings.filterwarnings('ignore')

# 3rd party libraries
# -------------------

# Natural Language Toolkit (https://www.nltk.org/)
import nltk

# download punctuation related NLTK functions
# (needed for sent_tokenize())
nltk.download('punkt')
# download NLTK part-of-speech tagger
# (needed for pos_tag())
nltk.download('averaged_perceptron_tagger')
# download wordnet
# (needed for lemmatization)
nltk.download('wordnet')
# download stopword lists
# (needed for stopword removal)
nltk.download('stopwords')
# dictionary of English words
nltk.download('words')

# numpy: matrix library for Python
import numpy as np

# Gensim for topic modeling
import gensim
# for loading data
import sklearn.datasets
# for LDA visualization
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

# for uploading data files
from google.colab import files

# downloading values lexicon
!wget https://raw.githubusercontent.com/steve-wilson/values_lexicon/master/lexicon_1_0/values_lexicon.txt
!wget https://raw.githubusercontent.com/steve-wilson/values_lexicon/master/sample_data/subreddits/christian_500.txt
!wget https://raw.githubusercontent.com/steve-wilson/values_lexicon/master/sample_data/subreddits/business_500.txt
!wget https://raw.githubusercontent.com/steve-wilson/values_lexicon/master/sample_data/subreddits/college_500.txt

def text_to_lemma_frequencies(text, remove_stop_words=True):
    """Count the lowercase content lemmas that occur in ``text``.

    The text is split into sentences, each sentence is tokenized and
    part-of-speech tagged, and every token is lemmatized (using the tag
    mapped through ``lookup_pos``) and lowercased. Punctuation tokens, the
    quote tokens ``''`` and `` `` ``, and anything that is not made up
    purely of word characters are dropped.

    Parameters
    ----------
    text : str
        The raw document text.
    remove_stop_words : bool, optional
        When truthy (the default), NLTK's English stop words are dropped
        as well.

    Returns
    -------
    nltk.probability.FreqDist
        Frequency distribution over the remaining lemmas.
    """
    # tag sentence by sentence, pooling the (token, tag) pairs
    tagged_tokens = []
    for sent in nltk.sent_tokenize(text):
        tagged_tokens.extend(nltk.pos_tag(nltk.word_tokenize(sent)))

    # lemmatize with the coarse POS hint, then normalize case
    wn_lemmatizer = nltk.stem.WordNetLemmatizer()
    normalized = [wn_lemmatizer.lemmatize(token, lookup_pos(tag)).lower()
                  for token, tag in tagged_tokens]

    # assemble everything that must be filtered out
    if remove_stop_words:
        blocked = set(nltk.corpus.stopwords.words('english'))
    else:
        blocked = set()
    blocked = blocked | set(string.punctuation) | {"''", "``"}

    # keep only lemmas that are not blocked and consist of word characters
    word_only = re.compile(r"^\w+$")
    kept = [lemma for lemma in normalized
            if lemma not in blocked and word_only.match(lemma)]

    return nltk.probability.FreqDist(kept)
    
def docs2matrix(document_list):
    """Build a document-term count matrix from raw text documents.

    Each document is reduced to lemma frequencies with
    ``text_to_lemma_frequencies`` (English stop words and punctuation
    removed), and a few extra corpus-specific stop words are dropped on
    top of that.

    Parameters
    ----------
    document_list : iterable of str
        The raw documents, one string per document.

    Returns
    -------
    corpus_matrix : numpy.ndarray
        Array of shape ``(number of documents, vocabulary size)`` where
        entry ``[row, column]`` is the count of the lemma with index
        ``column`` in document ``row``.
    vocab2index : dict
        Maps each lemma to its column index, in order of first appearance.
    """
    # extra stop words to drop in addition to NLTK's English list
    extra_stop_words = {'from', 'subject', 're', 'edu', 'use'}

    vocab2index = {}
    lfs = []
    for doc in document_list:
        # remove_stop_words is a boolean flag, not a token collection, so
        # the extra stop words have to be filtered out here explicitly.
        frequencies = text_to_lemma_frequencies(doc, remove_stop_words=True)
        lf = {token: count for token, count in frequencies.items()
              if token not in extra_stop_words}

        # assign the next free column to every lemma not seen before
        for token in lf:
            if token not in vocab2index:
                vocab2index[token] = len(vocab2index)

        lfs.append(lf)

    # create the zeros matrix: one row per document, one column per lemma
    corpus_matrix = np.zeros((len(lfs), len(vocab2index)))

    for row, lf in enumerate(lfs):
        for token, frequency in lf.items():
            corpus_matrix[row, vocab2index[token]] = frequency

    return corpus_matrix, vocab2index

    
# Lemmatization -- redefining this here to make
# code block more self-contained
def lookup_pos(pos):
    """Reduce a part-of-speech tag to the coarse class used for lemmatizing.

    Only the first character of ``pos`` is inspected (case-insensitively):
    tags starting with ``n`` or ``v`` (e.g. ``'NN'``, ``'VBD'``) map to
    ``'n'`` / ``'v'``; every other tag falls back to ``'n'``.
    """
    initial = pos[0].lower()
    return initial if initial in 'nv' else 'n'


            
# tell the user the setup cell has finished (blank line first for spacing)
for message in (
    "",
    "Done with setup!",
    "If you'd like, you can click the (X) button to the left to clear this output.",
):
    print(message)

In [None]:
# Load every row of the Reviews table from the SQLite database.
con = sqlite3.connect("/content/Liars7_frac20200730(1)(1).sqlite")
try:
    cur = con.cursor()
    cur.execute('SELECT * FROM Reviews')
    rows = cur.fetchall()
finally:
    # the rows are in memory now, so release the database file
    con.close()

# one document per row
# NOTE(review): column index 6 is presumably the review text -- confirm
# against the Reviews table schema.
liars_file = [row[6] for row in rows]

# using the function we wrote before, but modified to also return the vocab2index
corpus_matrix, word2id = docs2matrix(liars_file)
# reverse this dictionary
id2word = {v:k for k,v in word2id.items()}

# Dense2Corpus treats columns as documents by default; our matrix has one
# document per row, so pass documents_columns=False
corpus = gensim.matutils.Dense2Corpus(corpus_matrix, documents_columns=False)
print("Loaded",len(corpus),"documents into a Gensim corpus.")

Loaded 793 documents into a Gensim corpus.


In [None]:
warnings.filterwarnings('ignore')

# run LDA on our corpus, using our dictionary (k=3)
# LDA training is stochastic: fix the seed so that re-running the cell
# reproduces the same topics
lda = gensim.models.LdaModel(corpus, id2word=id2word, num_topics=3,
                             random_state=42)
lda.print_topics()

[(0,
  '0.050*"app" + 0.036*"neck" + 0.031*"exercise" + 0.028*"stretch" + 0.023*"would" + 0.016*"help" + 0.013*"think" + 0.013*"like" + 0.013*"day" + 0.013*"use"'),
 (1,
  '0.060*"app" + 0.048*"neck" + 0.029*"stretch" + 0.021*"exercise" + 0.019*"work" + 0.017*"day" + 0.015*"use" + 0.015*"help" + 0.013*"feel" + 0.011*"like"'),
 (2,
  '0.039*"app" + 0.035*"neck" + 0.034*"stretch" + 0.022*"use" + 0.016*"help" + 0.014*"would" + 0.013*"day" + 0.012*"feel" + 0.010*"way" + 0.009*"work"')]

In [None]:
# noise cancellation: drop the most frequent words and the very rare ones
total_counts = np.sum(corpus_matrix, axis=0)
# (column index, corpus-wide count) pairs, most frequent first
sorted_words = sorted(enumerate(total_counts),
                      key=lambda x: x[1], reverse=True)
# show only the head of the ranking; printing the whole vocabulary
# floods the cell output
print(sorted_words[:20])
N = 50  # how many of the most frequent words to drop
M = 2   # drop words whose total count is below this
top_N_ids = [item[0] for item in sorted_words[:N]]
appears_less_than_M_times = [item[0] for item in sorted_words if item[1] < M]

print("Top words to remove:", ' '.join([id2word[idx] for idx in top_N_ids]))

# De-duplicate the indexes: with a small vocabulary a word can be both in
# the top N and below M, and a repeated index would make the vocabulary
# lose an extra word and fall out of step with the matrix columns.
remove_indexes = sorted(set(top_N_ids + appears_less_than_M_times))
corpus_matrix_filtered = np.delete(corpus_matrix,remove_indexes,1)

# surviving words, in their original column order
removed_ids = set(remove_indexes)
vocab_dense = [id2word[idx] for idx in range(len(id2word))
               if idx not in removed_ids]

# rebuild both lookup tables against the new, compacted column numbering
id2word_filtered = dict(enumerate(vocab_dense))
word2id_filtered = {word: i for i, word in id2word_filtered.items()}

corpus_filtered = gensim.matutils.Dense2Corpus(corpus_matrix_filtered, documents_columns=False)

print("Original matrix shape:",corpus_matrix.shape)
print("New matrix shape:",corpus_matrix_filtered.shape)
# trying the LDA again on the filtered corpus (k=5); the seed is fixed so
# the topics are reproducible
lda = gensim.models.LdaModel(corpus_filtered, id2word=id2word_filtered,
                             num_topics=5, random_state=42)
lda.print_topics()
#didn't help much


[(0, 1515.0), (63, 1189.0), (62, 846.0), (46, 635.0), (74, 447.0), (81, 442.0), (32, 425.0), (161, 415.0), (128, 377.0), (151, 333.0), (41, 320.0), (113, 281.0), (153, 266.0), (61, 253.0), (48, 246.0), (135, 232.0), (29, 224.0), (95, 223.0), (155, 195.0), (157, 190.0), (8, 185.0), (72, 184.0), (180, 180.0), (256, 169.0), (47, 167.0), (37, 166.0), (171, 157.0), (125, 144.0), (332, 140.0), (346, 139.0), (116, 138.0), (94, 134.0), (137, 132.0), (149, 131.0), (273, 131.0), (126, 127.0), (25, 122.0), (322, 120.0), (364, 119.0), (377, 119.0), (102, 117.0), (188, 113.0), (134, 109.0), (217, 105.0), (304, 104.0), (15, 103.0), (286, 102.0), (96, 100.0), (88, 94.0), (147, 93.0), (198, 93.0), (71, 91.0), (82, 88.0), (152, 87.0), (145, 86.0), (260, 86.0), (170, 85.0), (202, 85.0), (764, 83.0), (14, 81.0), (163, 81.0), (187, 79.0), (50, 78.0), (146, 77.0), (185, 76.0), (123, 75.0), (86, 74.0), (641, 71.0), (578, 70.0), (398, 68.0), (84, 66.0), (319, 66.0), (331, 66.0), (103, 65.0), (190, 63.0), (42

[(0,
  '0.011*"tension" + 0.009*"overall" + 0.009*"one" + 0.008*"definitely" + 0.008*"see" + 0.008*"reminder" + 0.007*"every" + 0.007*"stretching" + 0.007*"well" + 0.006*"different"'),
 (1,
  '0.020*"people" + 0.012*"font" + 0.011*"math" + 0.010*"overall" + 0.008*"want" + 0.007*"something" + 0.007*"pretty" + 0.007*"anyone" + 0.006*"stretching" + 0.006*"basic"'),
 (2,
  '0.013*"thing" + 0.012*"different" + 0.012*"focus" + 0.012*"math" + 0.011*"throughout" + 0.008*"nice" + 0.008*"anyone" + 0.008*"reminder" + 0.008*"even" + 0.007*"computer"'),
 (3,
  '0.011*"go" + 0.010*"show" + 0.010*"remind" + 0.008*"relieve" + 0.008*"workday" + 0.008*"start" + 0.008*"bit" + 0.007*"throughout" + 0.007*"computer" + 0.007*"head"'),
 (4,
  '0.011*"definitely" + 0.009*"tension" + 0.008*"know" + 0.008*"tell" + 0.008*"even" + 0.008*"muscle" + 0.008*"go" + 0.008*"highly" + 0.008*"head" + 0.007*"see"')]

In [None]:
# Load every row of the Reviews table from the SQLite database.
con = sqlite3.connect("/content/Liars7_frac20200730(1)(1).sqlite")
try:
    cur = con.cursor()
    cur.execute('SELECT * FROM Reviews')
    rows = cur.fetchall()
finally:
    # the rows are in memory now, so release the database file
    con.close()

# this time every sentence becomes its own document
# NOTE(review): column index 6 is presumably the review text -- confirm
# against the Reviews table schema.
liars_file = []
for row in rows:
    liars_file.extend(nltk.sent_tokenize(row[6]))

# using the function we wrote before, but modified to also return the vocab2index
corpus_matrix, word2id = docs2matrix(liars_file)
# reverse this dictionary
id2word = {v:k for k,v in word2id.items()}

# Dense2Corpus treats columns as documents by default; our matrix has one
# document per row, so pass documents_columns=False
corpus = gensim.matutils.Dense2Corpus(corpus_matrix, documents_columns=False)
print("Loaded",len(corpus),"documents into a Gensim corpus.")

warnings.filterwarnings('ignore')

# run LDA on our corpus, using our dictionary (k=3)
# LDA training is stochastic: fix the seed so that re-running the cell
# reproduces the same topics
lda = gensim.models.LdaModel(corpus, id2word=id2word, num_topics=3,
                             random_state=42)
lda.print_topics()

Loaded 4213 documents into a Gensim corpus.


[(0,
  '0.068*"app" + 0.053*"neck" + 0.029*"day" + 0.027*"stretch" + 0.027*"help" + 0.026*"work" + 0.021*"recommend" + 0.019*"exercise" + 0.019*"would" + 0.016*"desk"'),
 (1,
  '0.049*"app" + 0.030*"feel" + 0.028*"exercise" + 0.026*"really" + 0.025*"use" + 0.024*"easy" + 0.015*"work" + 0.014*"stretch" + 0.012*"simple" + 0.012*"take"'),
 (2,
  '0.063*"neck" + 0.054*"app" + 0.037*"use" + 0.033*"stretch" + 0.026*"exercise" + 0.017*"get" + 0.017*"help" + 0.014*"really" + 0.014*"try" + 0.013*"give"')]