# Data Exploration

## Load imports.

In [8]:
from gensim.models import KeyedVectors
import gensim.downloader as api

# Make common scripts visible
import sys
sys.path.append('../common/')

from tweet_parser import load_data, create_topic_hashtags_dict, cleanup_tweets
from sentence_utils import remove_stop_words_and_lemmatize
from function_executor import apply_fn_to_list_items_in_dict

## Load the data.

In [9]:
# Locations of the input files, relative to the notebook's working directory.
tweets_dir = 'data/'
hashtag_lists_dir = 'hashtag_list/'

# Raw tweets, as returned by load_data.
# NOTE(review): the meaning of the first argument (20) is not visible here - confirm in tweet_parser.
tweets_keyed_by_topic = load_data(20, tweets_dir)

# The hashtags that were used to find the tweets (i.e. the topic indicators).
topic_hashtags_dict = create_topic_hashtags_dict(hashtag_lists_dir)

# Cleaned version of the tweets, built from the raw tweets and the topic hashtags.
tweets_keyed_by_topic_cleaned = cleanup_tweets(
    tweets_keyed_by_topic,
    topic_hashtags_dict,
)

## View subset of the tweets for each topic

View the uncleaned and cleaned versions. Cleaning means:
* Remove the hashtags that were used to find the tweets, as these are topic tags.
* Remove any URLs present in the tweet, as these add no value unless they are resolved and summarised.
* Remove any sequence of hashtags at the end of the tweet, as these are effectively topic tags.
* For the remaining hashtags, remove the hash and split into words when camel case or separated by underscores.

In [10]:
num_tweets_to_print = 20

def print_tweets(tweets_keyed_by_topic):
    for topic, tweets in tweets_keyed_by_topic.items():
        print('-------------------- {} --------------------'.format(topic))
        print('')
        for tweet in tweets[:num_tweets_to_print]:
            print('---')
            print(tweet)
            print('---')
        print('') 

# Show the uncleaned tweets first, then the cleaned ones, so they can be compared.
for tweets_by_topic in (tweets_keyed_by_topic, tweets_keyed_by_topic_cleaned):
    print_tweets(tweets_by_topic)

-------------------- brexit --------------------

---
Brexit may break Britain’s Tory party https://t.co/MZYrnYb8gA
---
---
Turkey want a religious constitution?!An even better reason to stay in #EU #Brexit #EUref #VoteLeave #LeaveEU  https://t.co/Ieu2J0hfdS
---
---
@Stronger_ln @jeremeycorbyn Don't want Brexit but patronising,insulting campaign ad like that is nuf 2 change my mind if their likes run EU.
---
---
Very odd discussion with an academic who was worried #Brexit would lead to the introduction of student fees. Hello? https://t.co/rQQ0LxWnfL
---
---
"#BREXIT COULD TRIGGER THE COLLAPSE OF FRAGILE #EU" REALLY I SAY GOOD &amp; THAT IS NOT MY RESPONSIBILITY, ALL I WANT IS 4 THE UK TO #voteleave
---
---
Is losing the City of London really a price worth paying for Brexit? | via Telegraph Business https://t.co/VFxpV3srKe
---
---
#Brexit could put UK's #energy links to Europe at risk https://t.co/6hU7yQik7H
---
---
BBC News - EU referendum: Donald Trump backs Brexit https://t.co/09TsNp

## Test Removal of Punctuation

Check that the method created to convert a tweet string into a string with the punctuation removed works correctly. This matters because Keras will tokenise on spaces only.

In [14]:
# Keyword options handed to apply_fn_to_list_items_in_dict alongside the function.
# NOTE(review): presumably these are forwarded to remove_stop_words_and_lemmatize - confirm in function_executor.
text_processing_options = {'lowercase': False, 'lemmatize': False}

# NOTE(review): this cell rebinds the name it reads from, so re-running it feeds
# already-processed tweets back in. The recorded output below is lowercased even
# though lowercase is False, which suggests it came from a different run.
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    remove_stop_words_and_lemmatize,
    **text_processing_options
)

print_tweets(tweets_keyed_by_topic_cleaned)

-------------------- brexit --------------------

---
may break britain tory party
---
---
turkey want religious constitution even better reason stay
---
---
stronger_ln jeremeycorbyn want patronising insulting campaign ad like nuf 2 change mind like run eu
---
---
odd discussion academic worried would lead introduction student fee hello
---
---
could trigger collapse fragile eu really say good amp responsibility want 4 uk
---
---
losing city london really price worth paying via telegraph business
---
---
could put uk energy link europe risk
---
---
bbc news eu referendum donald trump back sure support leave camp want
---
---
.anand menon writes pse could mean uk public agency
---
---
donald j. trump oi sausage finger keep tiny grubby mitt british affair focus ballbag instead
---
---
japan economy tanking uk listen shinzo abe via telegraph business
---
---
donald trump back say uk better without eu – happened u news guardian
---
---
donald trump opines british politics back
---
---
tel

## Case Sensitivity of Pre-Trained Word Vectors

To decide whether to lowercase all words, we need to examine the pre-trained word embeddings and determine whether they were trained on a lowercased corpus.

In [5]:
# Test word2vec: load the pre-trained GoogleNews vectors from the local binary file.
word2vec_path = 'embeddings/GoogleNews-vectors-negative300.bin.gz'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

KeyboardInterrupt: 

In [None]:
# Mixed-case, underscore-joined phrases. These lookups are expected to succeed;
# a missing key raises KeyError and stops the cell.
word2vec_model['Donald_Trump']
word2vec_model['European_Union']
word2vec_model['Lindsey_Vonn']
word2vec_model['ski_racer']

# American Climatologist, good indicator of global warming tweet
try:
    word2vec_model['Scott_Denning']
except KeyError:
    # Only a missing vocabulary entry is expected here. Anything else
    # (e.g. the model not being loaded, or an interrupt) should surface.
    print('While more prominent people occur in the news, those lesser known do not.')

In [None]:
# Test GloVe: load the model through the gensim downloader API.
glove_model_name = "glove-wiki-gigaword-300"
glove_model = api.load(glove_model_name)

In [None]:
# A single lower-case word: expected to be present (raises KeyError otherwise).
glove_model['olympics']

# A multi-word phrase and a capitalised word: both are probed the same way, and
# only a missing vocabulary entry (KeyError) is treated as the expected outcome,
# so unrelated failures are not hidden behind the message.
for token in ('Donald_Trump', 'Brexit'):
    try:
        glove_model[token]
    except KeyError:
        print('Glove supports only single lower case words')