- Sentiment analysis around the top bigram/trigram related to tools referenced on Twitter. 
	- Topics within positive vs. negative, 2020 vs. 2021
	- Topic models for different tools, 2020 vs. 2021

- Removing search words
	- Remove bottom N% - justify based on past research
	- Check where the search terms land in the relative term rankings.

## Packages and Modules

In [None]:
# For data manipulation
import pandas as pd
import numpy as np
import re

# For data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pprint import pprint

# For topic modeling
import spacy
from spacy.lang.en import English
parser = English()
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import pickle

## Functions

In [None]:
def tokenize(text):
    """Split ``text`` into normalised tokens for topic modelling.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens are replaced by the placeholder
    ``'URL'``, tokens starting with ``@`` are replaced by
    ``'TWITTER_HANDLE'``, and every other token is kept lower-cased.

    Returns:
        A list of token strings in their original order.
    """
    def _normalise(tok):
        # The URL check takes precedence over the handle check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'TWITTER_HANDLE'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

def get_lemma(word):
    """Return the WordNet ``morphy`` form of ``word``.

    Falls back to ``word`` unchanged when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

def get_lemma2(word):
    """Lemmatize ``word`` using a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def prepare_text_for_lda(text):
    """Turn raw ``text`` into the token list used for LDA.

    The text is tokenised with ``tokenize``; tokens of four characters or
    fewer (which includes the ``'URL'`` placeholder) and tokens found in
    ``en_stop`` are discarded, and each remaining token is mapped through
    ``get_lemma``.

    Returns:
        A list of lemmatised token strings.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

def model_check(df_corpus, df_dict, num_topics, texts=None):
    """Fit an LDA model on a corpus and score it.

    Args:
        df_corpus: Bag-of-words corpus, i.e. a list of ``doc2bow`` results.
        df_dict: The gensim ``Dictionary`` that ``df_corpus`` was built from.
        num_topics: Number of topics to fit.
        texts: Optional tokenised documents (list of token lists) that
            correspond to ``df_corpus``. When omitted, token lists are
            rebuilt from ``df_corpus`` and ``df_dict``.

    Returns:
        A ``(complexity, coherence)`` tuple: the value returned by
        ``log_perplexity`` on ``df_corpus`` and the ``c_v`` coherence score.
    """
    # Build model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=df_corpus,
                                                id2word=df_dict,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=1000,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Score the model on the corpus it was given, not on a notebook global.
    complexity = lda_model.log_perplexity(df_corpus)

    if texts is None:
        # 'c_v' coherence needs tokenised texts. Rebuild them from the
        # bag-of-words counts; word order is lost, so this is an
        # approximation of the original documents.
        texts = [
            [df_dict[token_id] for token_id, count in doc for _ in range(int(count))]
            for doc in df_corpus
        ]

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=texts,
                                         dictionary=df_dict,
                                         coherence='c_v')
    coherence = coherence_model_lda.get_coherence()
    return complexity, coherence

def topic_check(df, text_col):
    """Score LDA models with 1 to 19 topics on one column of ``df``.

    Args:
        df: DataFrame holding the documents.
        text_col: Name of the column with the documents. Entries that are
            strings are cleaned with ``prepare_text_for_lda``; any other
            entry is assumed to be an already tokenised sequence of strings.

    Returns:
        A DataFrame with the columns ``topics``, ``complexity`` and
        ``coherence``, one row per topic count.
    """
    # Build the token lists from the passed frame. Iterating over the column
    # avoids relying on the index being 0..n-1.
    df_text = [
        prepare_text_for_lda(entry) if isinstance(entry, str) else list(entry)
        for entry in df[text_col]
    ]

    # The dictionary and corpus are specific to this subset of documents.
    df_dict = corpora.Dictionary(df_text)
    df_corpus = [df_dict.doc2bow(text) for text in df_text]

    # Fit and score one model per topic count
    topics = []
    complexity = []
    coherence = []
    for n_topics in range(1, 20):
        cmplx, coh = model_check(df_corpus=df_corpus,
                                 df_dict=df_dict,
                                 num_topics=n_topics)
        topics.append(n_topics)
        complexity.append(cmplx)
        coherence.append(coh)

    # Save as a df
    return pd.DataFrame({'topics': topics,
                         'complexity': complexity,
                         'coherence': coherence})

## Read in data

In [None]:
# Load the source CSV; keep the raw frame untouched and work on a copy
DATA_PATH = '../data/03_primary/all_data.csv'
raw_data = pd.read_csv(DATA_PATH)
data = raw_data.copy()
data.head()

## Text Cleaning

In [None]:
# Tokenise and clean every entry of the 'content' column for topic modelling
import random

text_data = []
for content in data['content']:
    tokens = prepare_text_for_lda(content)
    text_data.append(tokens)
    # Print roughly 5% of the results as a spot check
    if random.random() > 0.95:
        print(tokens)

In [None]:
# Assemble the working frame: cleaned tokens plus the sentiment score and year
df = pd.DataFrame(
    dict(
        tweet=text_data,
        sentiment=data['vader_com'],
        year=data['year'],
    )
)
df.head()

## Wordclouds

In [None]:
# Coerce both filter columns to numeric dtypes in a single call
df = df.astype({'sentiment': float, 'year': int})

In [None]:
# Boolean masks for the year and sentiment splits
is_2020 = df['year'] == 2020
is_2021 = df['year'] == 2021
is_positive = df['sentiment'] >= 0.5
is_negative = df['sentiment'] <= -0.5

# One frame per (sentiment, year) combination, each with a fresh 0..n-1 index
pos_20 = df[is_2020 & is_positive].reset_index(drop=True)
pos_21 = df[is_2021 & is_positive].reset_index(drop=True)
neg_20 = df[is_2020 & is_negative].reset_index(drop=True)
neg_21 = df[is_2021 & is_negative].reset_index(drop=True)

In [None]:
# Import wordcloud library
from wordcloud import WordCloud

# Each 'tweet' entry is a list of tokens, so flatten the tokens of every
# tweet into one comma-separated string per subset (str.join needs strings).
l_str_p20 = ','.join(token for tweet in pos_20['tweet'] for token in tweet)
l_str_p21 = ','.join(token for tweet in pos_21['tweet'] for token in tweet)
l_str_n20 = ','.join(token for tweet in neg_20['tweet'] for token in tweet)
l_str_n21 = ','.join(token for tweet in neg_21['tweet'] for token in tweet)

# Create and show one wordcloud per subset. Inside a loop the image is not
# displayed automatically, so it is drawn explicitly with matplotlib.
for label, item in [('Positive 2020', l_str_p20),
                    ('Positive 2021', l_str_p21),
                    ('Negative 2020', l_str_n20),
                    ('Negative 2021', l_str_n21)]:
    if not item:
        # Nothing to draw for an empty subset
        print(f'{label}: no tokens, skipping wordcloud')
        continue
    wordcloud = WordCloud(background_color='white', max_words=2000,
                          contour_width=3, contour_color='steelblue')
    wordcloud.generate(item)
    plt.figure()
    plt.imshow(wordcloud.to_image(), interpolation='bilinear')
    plt.axis('off')
    plt.title(label)
    plt.show()

In [None]:
# Install the wordcloud package; needed once if `from wordcloud import WordCloud` fails
!pip install wordcloud

## LDA with Gensim

In [None]:
# Build the token dictionary and the bag-of-words corpus for all documents
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both; the context manager makes sure the pickle file is closed
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('../data/04_intermediate/dictionary.gensim')

In [None]:
# Entire corpus: score LDA models with 2 to 19 topics
topics = []
complexity = []
coherence = []
for i in range(2, 20):
    # model_check takes the corpus and its dictionary by these keyword names
    cmplx, coh = model_check(df_corpus=corpus,
                             df_dict=dictionary,
                             num_topics=i)
    topics.append(i)
    complexity.append(cmplx)
    coherence.append(coh)

In [None]:
# Collect the per-topic-count scores into one table
score_columns = ('topics', 'complexity', 'coherence')
score_values = (topics, complexity, coherence)
corpus_topics = pd.DataFrame(dict(zip(score_columns, score_values)))
corpus_topics

In [None]:
# Positive Tweets from 2020
