In [26]:
# Unicode Handling
from __future__ import unicode_literals # unused in this notebook
import codecs

import numpy as np
import gensim
import pandas as pd
import json

# spacy is used for pre-processing and traditional NLP
import spacy
from spacy.en import English

# Word2Vec - find words that are most similar to certain words
from gensim.models.word2vec import Word2Vec

# LDA - 
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus # need to use for LDA

## Building a LDA model

In [17]:
# read in data

data = pd.read_csv("../dataset/stumbleupon.tsv", sep='\t')
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))
data.head()


# do countvectorizer
from sklearn.feature_extraction.text import CountVectorizer   
cv = CountVectorizer(binary=False,   stop_words='english',  min_df=3)

docs = cv.fit_transform(data.body.dropna())
# Build a mapping of numerical ID to word
id2word = dict(enumerate(cv.get_feature_names()))

In [27]:
# First we convert our word-matrix into gensim's format 
corpus = Sparse2Corpus(docs, documents_columns = False)

# Then we fit an LDA model 
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics= 15)

In [21]:
lda_model.show_topics()

[(3,
  u'0.009*"image" + 0.008*"small" + 0.008*"images" + 0.008*"2011" + 0.008*"link" + 0.007*"buzz" + 0.007*"chocolate" + 0.006*"campaign" + 0.006*"track" + 0.006*"jpg"'),
 (0,
  u'0.008*"2010" + 0.006*"2009" + 0.006*"technology" + 0.005*"new" + 0.005*"2008" + 0.004*"future" + 0.004*"2007" + 0.004*"2011" + 0.004*"like" + 0.004*"2006"'),
 (10,
  u'0.007*"food" + 0.007*"make" + 0.007*"recipe" + 0.006*"water" + 0.006*"salad" + 0.006*"like" + 0.005*"cheese" + 0.005*"eggs" + 0.005*"recipes" + 0.005*"bacon"'),
 (9,
  u'0.008*"apple" + 0.007*"like" + 0.006*"recipe" + 0.005*"just" + 0.004*"time" + 0.004*"make" + 0.004*"food" + 0.003*"google" + 0.003*"data" + 0.003*"recipes"'),
 (12,
  u'0.013*"dress" + 0.010*"clothing" + 0.009*"indie" + 0.006*"nav" + 0.006*"background" + 0.005*"00" + 0.005*"url" + 0.005*"dresses" + 0.005*"images" + 0.004*"google"'),
 (4,
  u'0.008*"health" + 0.007*"body" + 0.006*"people" + 0.004*"help" + 0.004*"weight" + 0.004*"time" + 0.004*"brain" + 0.003*"like" + 0.003*"fa

In [25]:
# characterizing what i want to see
# see 5 most important words in each topic

num_topics = 25
n_words_per_topic = 5 
for ti, topic in enumerate(lda_model.show_topics(num_topics = num_topics, num_words = n_words_per_topic)):
    print("Topic: %d" % (ti))
    print (topic)
    print ()

Topic: 0
(0, u'0.008*"2010" + 0.006*"2009" + 0.006*"technology" + 0.005*"new" + 0.005*"2008"')
()
Topic: 1
(1, u'0.016*"http" + 0.016*"com" + 0.012*"href" + 0.011*"www" + 0.006*"la"')
()
Topic: 2
(2, u'0.013*"cup" + 0.009*"minutes" + 0.009*"recipe" + 0.008*"add" + 0.008*"butter"')
()
Topic: 3
(3, u'0.009*"image" + 0.008*"small" + 0.008*"images" + 0.008*"2011" + 0.008*"link"')
()
Topic: 4
(4, u'0.008*"health" + 0.007*"body" + 0.006*"people" + 0.004*"help" + 0.004*"weight"')
()
Topic: 5
(5, u'0.019*"chocolate" + 0.018*"cake" + 0.011*"sugar" + 0.009*"butter" + 0.009*"cream"')
()
Topic: 6
(6, u'0.007*"just" + 0.007*"fashion" + 0.005*"year" + 0.005*"like" + 0.004*"new"')
()
Topic: 7
(7, u'0.005*"cancer" + 0.005*"world" + 0.004*"content" + 0.004*"like" + 0.004*"olympics"')
()
Topic: 8
(8, u'0.028*"com" + 0.021*"online" + 0.020*"www" + 0.018*"http" + 0.015*"guide"')
()
Topic: 9
(9, u'0.008*"apple" + 0.007*"like" + 0.006*"recipe" + 0.005*"just" + 0.004*"time"')
()
Topic: 10
(10, u'0.007*"food"

## Building a Word2Vec Model

In [28]:
from gensim.models.word2vec import Word2Vec

# Setup the body text 
text = data.body.dropna().map(lambda x: x.split()) 

from gensim.models import Word2Vec 
model = Word2Vec(text, size=100, window=5, min_count=5, workers=4)

In [30]:
model.most_similar(positive=['chocolate', 'brownie'])

[(u'candy', 0.8733401894569397),
 (u'frosting', 0.8526742458343506),
 (u'cake', 0.8476889729499817),
 (u'buttercream', 0.8459649085998535),
 (u'ganache', 0.8397178053855896),
 (u'caramel', 0.8356987833976746),
 (u'chips', 0.8320656418800354),
 (u'pudding', 0.8314386606216431),
 (u'marshmallows', 0.8285681009292603),
 (u'fudge', 0.8231680393218994)]

## Tweet Data

In [32]:
# Loading the tweet data
filename = '../dataset/captured-tweets.txt'
tweets = []
for tweet in codecs.open(filename, 'r', encoding="utf-8"):
    tweets.append(tweet)
    
# Setting up spacy
nlp_toolkit = English()

## Exercise 1a

Write a function that can take a take a sentence parsed by `spacy` and identify if it mentions a company named 'Google'. Remember, `spacy` can find entities and codes them as `ORG` if they are a company. Look at the slides for class 13 if you need a hint:

### Bonus (1b)

Parameterize the company name so that the function works for any company.

In [42]:
def mentions_company(parsed):
    parsed = nlp_toolkit(parsed)
    # Return True if the sentence contains an organization and that organization is Google
    #print parsed
    for entity in parsed.ents:
        if 'google' in entity.text.lower():
            return True
    # Otherwise return False
    return False


# 1b
def mentions_company(parsed, search_term='Google'):
    parsed = nlp_toolkit(parsed)
    # Return True if the sentence contains an organization and that organization is Google
    #print parsed
    for entity in parsed.ents:
        if search_term.lower() in entity.text.lower():
            return True
    # Otherwise return False
    return False


relevant_tweets=[]
for tweet in tweets:
    if mentions_company(tweet):
        relevant_tweets.append(tweet)

## Exercise 1c

Write a function that can take a sentence parsed by `spacy` 
and return the verbs of the sentence (preferably lemmatized)

In [50]:
# blank

## Exercise 1d
For each tweet, parse it using spacy and print it out if the tweet has 'release' or 'announce' as a verb. You'll need to use your `mentions_company` and `get_actions` functions.

In [51]:
def get_actions(parsed):
    actions = []
    # Your code here
    parsed = nlp_toolkit(parsed)
    for (i, word) in enumerate(parsed): 
        if word.lemma_ =='announce' or word.lemma_ =="release":
            return True
    return False


for tweet in relevant_tweets:
    if get_actions(tweet):
        print tweet

Google &amp; Ford rumored to announce partnership at CES https://t.co/zOgm1NjHhD https://t.co/Gzx81ujqVC

Google's Project Ara Spiral is expected to be released next year January https://t.co/prycPMuGsG

Google and Ford to announce partnership on self-driving cars at CES - Fudzilla (blog) https://t.co/6woe56G22Q

Google and Ford to announce partnership on self-driving cars at CES - Fudzilla (blog) https://t.co/4hERVJ4zZK

Redesigned Google Glass published on FCC website: release date, Price and features - Tampa Bay Review https://t.co/Vdwr4afx3E www.GlassRoo…



## Exercise 1e
Write a function that identifies countries - HINT: the entity label for countries is GPE (or GeoPolitical Entity)



In [55]:
def mentions_country(parsed, country):
    parsed = nlp_toolkit(parsed)
    for word in parsed:
        if word.text == 'country' and word.ent_type_ == 'GPE':
            return True
    return False

for tweet in tweets:
    if mentions_country(tweet, 'Japan'):
        print tweet

## Exercise 1f

Re-run (d) to find country tweets that discuss 'Iran' announcing or releasing.


In [None]:
for tweet in tweets:
    parsed = nlp_toolkit(tweet)
    pass

## Exercise 2
Build a `word2vec` model of the tweets we have collected using `gensim`.

### Exercise 2a:
First take the collection of tweets and tokenize them using spacy.

* Think about how this should be done. 
* Should you only use upper-case or lower-case? 
* Should you remove punctuations or symbols? 

In [None]:
text_split = [[x.text if x.pos != spacy.parts_of_speech.VERB else x.lemma_ 
                for x in nlp_toolkit(t)] for t in tweets]


### Exercise 2b:
Build a `word2vec` model.
Test the window size as well - this is how many surrounding words need to be used to model a word. What do you think is appropriate for Twitter? 

In [None]:
model = Word2Vec(text_split, size=100, window=4, min_count=5, workers=4)

### Exercise 2c:
Test your word2vec model with a few similarity functions. 
* Find words similar to 'Syria'.
* Find words similar to 'war'.
* Find words similar to "Iran".
* Find words similar to 'Verizon'. 



In [None]:
model.most_similar(positive=['Syria'])

# Exercise 2d

Adjust the choices / parameters in (b) and (c) as necessary.


## Exercise 3

Filter tweets to those that mention 'Iran' or similar entities and 'war' or similar entities.
* Do this using just spacy.
* Do this using word2vec similarity scores.

In [None]:
# Using spacy
for tweet in tweets:
    parsed = nlp_toolkit(tweet)
    pass

In [None]:
# Using word2vec similarity scores
for tweet in tweets[:200]:
    parsed = nlp_toolkit(tweet)
    pass
