# Analyzing Dynamic Front '18 Campaign 

Historical data that was crawled over the months of February to March 2018 by IST Pulse was utilized in this notebook to generate a topic model and draw themes in the english language. We narrowed our focus on English data for the purposes of initial analysis and pointers for conversation.

This notebook desires to discover conversations around mentions of US Forces and like media syndicates. The `nats_data_query.py` program was utilized as a wrapper to quickly query the data. The `es_data_processor.py` program was used to extract the fields from the JSON formatted data that are most necessary for linguistic and time series analyses. The `tweet_processor.py` program was utilized to preprocess the text data in preparation for the topic modeling task. The latest version separates hashtags into terms (best guess).

The Python package `gensim` was used to perform the Latent Dirichlet Allocation algorithm. A single core LDA model was used, in order to allow for guaranteed reproducibility. This is much slower than utilizing gensim's multi-core option but is only worthwhile if reproducibility is necessary.

This analysis was re-processed to provide the ability to save/load models and data associated with each part of the process.

## Query Data from Elasticsearch (es)

In [36]:
from tf_data_query import TweetGatherer

In [37]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [38]:
tf = TweetGatherer()

### User enters query of interest in `q_s`

In [None]:
#query_string = 'meta.rule_matcher.results.rule_tag: UNIT-SM AND (doc.quoted_status.user.screen_name: (1stAirCavBDE  OR 2dCavalryRegt  OR 7thATC  OR DaggerBDE  OR Eucom  OR hqarrc OR NATO OR USArmy OR USArmyEurope )  OR doc.in_reply_to_screen_name: (1stAirCavBDE  OR 2dCavalryRegt  OR 7thATC  OR DaggerBDE  OR Eucom  OR hqarrc OR NATO OR USArmy OR USArmyEurope ))' + lang


        

In [39]:
import json
from elasticsearch import Elasticsearch, helpers

In [40]:
urlstring='http://tellfinder-elasticsearch.uncharted.software:9200/'
index = 'mx_ht_documents_2_sift_6.1'

In [41]:
es=Elasticsearch(urlstring, verify_certs=False, timeout=500)

In [48]:
query = {
  "size" : 0,
"query" : {
"bool" : {
"filter" : [ {
"term" : {
"feature.keywords.classifier" : "young"
}
}]
}
},
"aggs" : {
"locations" : {
"terms": {
"field" : "locality.label",
"size" : 10
}
}
}
}

In [46]:
query = {"size" : 0,
"query" : {
"bool" : {
"filter" : [ {
"term" : {
"feature.keywords.classifier" : "young"
}
}]
}
},
"aggs" : {
"locations" : {
"terms": {
"field" : "locality.label",
"size" : 10
}
}
}
}

In [51]:
num = es.count(index,body = query)



RequestError: TransportError(400, 'parsing_exception', 'request does not support [size]')

In [22]:
query = '"size": 0,"aggs" : {"sources": {"terms": {"field":"feature.site_name","size": 0}}}'


### Print number of tweets in the English language, from February 16th - March 19th 

In [23]:
number_of_docs = es.count(index='mx_ht_documents_2_sift_6.1', body=query)['count']

NameError: name 'es' is not defined

In [17]:
#print(nats.get_n_items(begin='2018-02-16', end='2018-03-19', lang=None))
print(tf.get_n_items('2018-02-15', '2018-03-19'))



RequestError: TransportError(400, 'parsing_exception', 'request does not support [size]')

In [None]:
#Estimated time of processing ~ 5 mins  

In [None]:
en_unit_data = nats.get_data(begin='2018-02-15', end='2018-03-19',query_str=q_s, lang='en') 


In [None]:
en_unit_data[0]["_source"]

## Extract Necessary Fields

In [None]:
from es_data_processor import ESDataProcessor

In [None]:
esdp = ESDataProcessor(en_unit_data)

In [None]:
df = esdp.format_df()

In [None]:
df.head()

## Clean Text Data

In [None]:
from tweet_processor import TweetProcessor

In [None]:
tp = TweetProcessor()

In [None]:
texts = list(df.text)
cleaned_texts = []
for t in texts:
    cleaned_text = tp.clean_text(t)
    cleaned_texts.append(cleaned_text)

In [None]:
cleaned_texts[0]

In [None]:
sparse = tp.make_sparse(texts=cleaned_texts)

In [None]:
vecs = [tp.stem_text(word_list=text) for text in sparse]

In [None]:
strings = [tp.re_string(text_list=text).strip() for text in vecs]

In [None]:
strings[0]

In [None]:
#append the preprocessed text as a column to the dataframe to keep track of original tweets
df['final_string'] = strings

In [None]:
df.head(n=5)

## Topic Modeling Analysis

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
!pip install numpy==1.14.0

In [None]:
df.to_csv('~/repos/nats/082016_espull.csv')

In [None]:
import pandas as pd
df=pd.read_csv('082016_espull.csv')

In [None]:
print(len(corpus))
print(len(dictionary))

In [None]:
from gensim import corpora

dictionary = corpora.Dictionary(vecs)

In [None]:
corpus = [dictionary.doc2bow(item) for item in vecs]

In [None]:
#Save + pickle
dictionary.save('~/repos/nats/032018.dict')
corpora.MmCorpus.serialize('~/repos/nats/032018.mm', corpus)

In [None]:
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to display num_topics - LDA graph using c_v coherence
    
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : topic limit
    
    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    c_v = []
    lm_list = []
    for num_topics in range(1, limit):
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
        
    # Show graph
    x = range(1, limit)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v"), loc='best')
    plt.show()
    
    return lm_list, c_v

In [None]:
!pip install matplotlib

In [None]:
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [None]:
## Runs for about 20 mins

In [None]:
%timeit lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=vecs, limit=10)

In [None]:
import numpy
v1 = numpy.asarray([0., 2.], dtype='f')
v2 = numpy.asarray([0., 1.], dtype='f')
print(numpy.dot(v1, v2))

In [None]:
"""
!pip show numpy

Display 

"""

In [None]:
import gensim.models.ldamodel 
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)
model = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=150, iterations=50, alpha='asymmetric')

In [None]:
model.save('032018lda.model')

In [None]:
for i in range(0, model.num_topics):
    print(str(i),':',model.print_topic(i))

## Get Top Topic for Each Tweet

In future it would probably be best to have it return the list of topics with their respective adherences for each tweet; for now it is just the topic most adherent to each tweet.

In [None]:
#assign topics to tweets
doc_top_scores = []
for i in range(len(cleaned_texts)):
    doc_top_scores.append(model.get_document_topics(bow=dictionary.doc2bow(cleaned_texts[i])))

In [None]:
topic_scores = {}

for i in range(len(doc_top_scores)):
    topic_scores[i] = {}
    topics = [topic[0] for topic in doc_top_scores[i]]
    scores = [topic[1] for topic in doc_top_scores[i]]
    for topic_n in range(500):
        
        if topic_n in topics:
            topic_scores[i][topic_n] = scores[topics.index(topic_n)]

In [None]:
import pandas as pd

top_Score_df = pd.DataFrame.from_dict(topic_scores)
top_Score_df = top_Score_df.fillna(0)
top_Score_df = top_Score_df.transpose()

In [None]:
top_Score_df['text'] = list(df.text)
top_Score_df['processed_text'] = list(strings)

In [None]:
import numpy as np

In [None]:
maxes = [] 
for row in range(top_Score_df.shape[0]):
    topic_adherence = list(top_Score_df.iloc[row,:top_Score_df.shape[1] - 2])
    max_score = topic_adherence.index(np.max(topic_adherence))
    maxes.append(max_score)

In [None]:
top_Score_df['max_topic'] = maxes
df['max_topic'] = maxes

In [None]:
top_Score_df.head()

In [None]:
print(df.groupby('max_topic').count().sort_index(by=['text'],ascending=False)['text'].loc[[143]].sum())

## Display Top 20 Tweets Per Topic

In [None]:
#remove duplicates so you get the most out of the top 20 tweets
df_no_dups = df.drop_duplicates(subset='final_string')
print(df_no_dups.shape)

In [None]:
from IPython.display import display
from ipywidgets import widgets
from IPython.display import clear_output

text = widgets.Text()
display(text)

def handle_submit(sender):
    clear_output()
    print('Showing top 20 tweets in Topic',text.value)
    try:
        for t in df_no_dups.loc[df_no_dups.max_topic == int(text.value)].sample(frac=1)['text'][:70]:
            print(t)
            print()
    except KeyError:
        print('Invalid Topic Number (try anything from 0 to 199).')
    
text.on_submit(handle_submit)

In [None]:
df_no_dups[df_no_dups.text.str.contains('maternal')][['text', 'max_topic']]

In [None]:
dictionary = corpora.Dictionary.load('~/repos/nats/032018.dict')
corpus = corpora.MmCorpus('~/repos/nats/032018.mm')
lda = LdaModel.load('032018lda.model')
#print dictionary
#print corpus
#print lda

In [None]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [None]:
pyLDAvis.gensim.prepare(lda, corpus, dictionary,mds='mmds')

In [None]:
#### moving manually to data folder
import re

save_text = list(kenya_geo_df.text)
save_text = [re.sub('\\n|\n|,|\s|\t', ' ', str(save_text[i])) for i in range(len(save_text))]
kenya_geo_df.text = save_text


kenya_geo_df.to_csv('~/repos/validate/data/model_persist/month01/August 2016.csv')
top_Score_df.to_csv('~/repos/validate/data/model_persist/month01/august2016_extended.csv')

In [None]:
import pandas as pd

kenya_geo_df = pd.read_csv('~/repos/validate/data/model_persist/month01/August 2016.csv', encoding='iso-8859-1')

In [None]:
kenya_geo_df[kenya_geo_df.text.str.contains('health')][['text', 'max_topic']]

In [None]:
[text for text in kenya_geo_df['text'] if 'health' in text.lower() ]

In [None]:
[i for i in kenya_geo_df.loc[i,'text'] if 'health' in i.lower()]

In [None]:
# remove duplicates so you get the most out of the top 20 tweets
# kenya_tweet_df_no_dups = top_Score_df.drop_duplicates(subset='processed_text')
lda_save_path = "./saved-lda-model"
ldaModel.save(lda_save_path) 

#moving manually to data folder
kenya_geo_df.to_csv('kenya_data_full_all.csv', encoding='utf-8')  

In [None]:
oup = open("topic_summary.txt", "wb")
for x in topics_final:
    oup.write("%s\n" % (x))
oup.close()

sc.stop()

In [None]:
#Free up some memory 
clear()