# Import Frameworks

In [1]:
import pandas as pd
import numpy as np
from itertools import islice
import json
import matplotlib.pyplot as plt
import re
from urllib.request import urlopen
%matplotlib inline
import time
from bs4 import BeautifulSoup
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim import corpora, models
from pprint import pprint
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from gensim.models import Word2Vec
from nltk.cluster import KMeansClusterer
from sklearn import cluster
from sklearn import metrics
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cliff\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cliff\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Some predefinitions

Best to pre-compile regexes that will be re-used often

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()
re_url = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
re_hashtag = re.compile(r'#[^\s<>"]+')
re_mention = re.compile(r'@[^\s<>"]+')
tokenizer = RegexpTokenizer(r'\w+[\'‘’]?\w?') #modified to correctly extract "wouldn't" etc.
stopWords = set(stopwords.words('english'))

In [3]:
JSONlength = 0
with open('geotagged_tweets_20160812-0912.jsons') as myfile:
    for line in myfile:
        JSONlength += 1
print(JSONlength)

657307


# Scrape US States

### Check length of json contents

In [4]:
url = "https://simple.wikipedia.org/wiki/List_of_U.S._states"
response = urlopen(url)
soup = BeautifulSoup(response, 'html.parser')
table = soup.find("table",{"class":"wikitable"})
rows = table.find_all('tr')

stateslong = []
statesshort = []

for row in rows:
    cells = row.find_all('td')
    if len(cells) > 0:
        statesshort.append(cells[1].text.replace("\n",""))
        stateslong.append(cells[2].text.replace("\n",""))
        
stateslongnocaps = list(map(str.lower, stateslong))

# Load file a fill dataframe

In [5]:
#init dataframe
df = pd.DataFrame(0, index=np.arange(JSONlength), columns=['lang', 'country', 'state', 'city', 'text', 'mentions'], dtype = 'str')

In [6]:
with open('geotagged_tweets_20160812-0912.jsons') as myfile:
    for i, line in enumerate(myfile):
        try:
            line = json.loads(line)
        except Exception as e:
            print (e)
            continue
        
        # get row to fill
        a = df.iloc[i]
        
        # extract tweet language
        a['lang'] = line['lang'] 
        
        # extract country code
        try:
            a['country'] = line['place']['country_code']
        except Exception as e:
            print (e)
            continue
        
        # extract US states if possible
        try:
            if(a['country'] == 'US'):
                #use strip to remove front/back spaces
                a['city'], a['state'] = list(map(str.strip, line['place']['full_name'].split(',')))
                # checking against USA as state
                if a['state'] == "USA":
                    a['state'] = None
                # checking against state as city
                if a['city'].lower() in stateslongnocaps:
                    stateIndex = stateslongnocaps.index(a['city'].lower())
                    a['state'] = statesshort[stateIndex]
                    a['city'] = None
                elif a['city'] in statesshort:
                    a['state'] = a['city']
                    a['city'] = None
            else:
                a['state'], a['city'] = [None, None]
        except Exception as e:
                # we are now losing many cases of different formats
                # I'll write another regex to pick up some more
                # - Floris
                continue
        
        mentions = line['entities']['user_mentions']
        mentionlist = []
        for item in mentions:
            mentionlist.append(item['name'])
            
        a['mentions'] = mentionlist
            
        a['text'] = line['text']

'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not 

# Define tweet text pre-processing functions

In [8]:
def extract_links(text):
    match = re_url.findall(text)
    if match:
        return match
    return ''

def extract_hashtags(text):
    match = re_hashtag.findall(text)
    if match:
        return match
    return ''

def remove_links(text):
    return re_url.sub("",text)

def remove_hashtag_mentions(text):
    text = re_hashtag.sub("",text)
    return re_mention.sub("",text)

def tokenize(text):
    tokens = tokenizer.tokenize(text.lower())
    #remove stop words
    return [token for token in tokens if token not in stopWords]
    
def lemmatize(tokens):
    return [wordnet_lemmatizer.lemmatize(token) for token in tokens]

def get_tokens(text):
    text = remove_links(text)
    text = remove_hashtag_mentions(text)
    tokens = tokenize(text)
    #lemmas = lemmatize(tokens) #skip for now
    return " ".join(tokens)

def raw_tokens(text):
    text = remove_links(text)
    text = remove_hashtag_mentions(text)
    text = tokenize(text)
    tokens = lemmatize(text)
    return [token for token in tokens if token not in stopWords]

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    stemmer = SnowballStemmer("english")
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [9]:
df['links'] = df['text'].apply(lambda text: extract_links(text))

df['hashtags'] = df['text'].apply(lambda text: extract_hashtags(text))

df['relevant_tokens'] = df['text'].apply(lambda text: get_tokens(text))


In [10]:
df[['country','state','text','relevant_tokens']].head(25)

Unnamed: 0,country,state,text,relevant_tokens
0,US,MO,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,
1,US,LA,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,collusion together
2,US,MO,@theblaze @realDonaldTrump https://t.co/n050DB...,
3,AU,,@HillaryClinton he will do in one year all the...,one year things done eight
4,US,MD,#CNN #newday clear #Trump deliberately throwin...,clear deliberately throwing race 2007 knew des...
5,US,CA,"@realDonaldTrump, you wouldn't recognize a lie...",recognize lie came mouth continually
6,GB,,#Trump2016 #TrumpPence16 #MakeAmericaGreatAgai...,
7,US,NJ,"""Kid, you know, suing someone? Thats the most ...",kid know suing someone thats beautiful thing 1...
8,US,TX,@HillaryClinton you ARE the co-founder of ISIS...,co founder isis crooked evil lying witch live
9,AU,,@Geraldanthro @NeilTurner_ @realDonaldTrump wa...,want comparison try maimed vets pre amp post i...


In [12]:
df.head(5)
#df.to_excel('geomapped_tweets.xlsx')

Unnamed: 0,lang,country,state,city,text,mentions,links,hashtags,relevant_tokens
0,und,US,MO,Frontenac,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"[TheBlaze, Donald J. Trump]",[https://t.co/TY9DlZ584c],,
1,en,US,LA,Baton Rouge,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,"[Barack Obama, FBI, AG Loretta Lynch, Donald J...",[https://t.co/5GMNZq40V3],"[#NOJUSTICE, #TrumpPence]",collusion together
2,und,US,MO,Frontenac,@theblaze @realDonaldTrump https://t.co/n050DB...,"[TheBlaze, Donald J. Trump]",[https://t.co/n050DBSpv0],,
3,en,AU,,,@HillaryClinton he will do in one year all the...,[Hillary Clinton],,,one year things done eight
4,en,US,MD,Baltimore,#CNN #newday clear #Trump deliberately throwin...,[],,"[#CNN, #newday, #Trump, #ISIS]",clear deliberately throwing race 2007 knew des...


In [13]:
#create a dictionary
tokens = df['text'].apply(lambda text: raw_tokens(text))
dictionary = corpora.Dictionary(tokens)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 collusion
1 together
2 done
3 eight
4 one
5 thing
6 year
7 2007
8 clear
9 deliberately
10 destabilization


In [12]:
#remove extemes
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [13]:
#bag of words approach
#bow_corpus = [dictionary.doc2bow(token) for token in tokens]
#bow_doc_4310 = bow_corpus[4310]
#for i in range(len(bow_doc_4310)):
#    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
#                                               dictionary[bow_doc_4310[i][0]], 
#bow_doc_4310[i][1]))
#from gensim import corpora, models    
#lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
#for idx, topic in lda_model.print_topics(-1):
#    print('Topic: {} \nWords: {}'.format(idx, topic))

##TF_IDF approach
bow_corpus = [dictionary.doc2bow(token) for token in tokens]
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    pprint(doc)
    break
import gensim
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

[]
Topic: 0 Word: 0.011*"lol" + 0.009*"go" + 0.007*"trump" + 0.007*"yep" + 0.007*"right" + 0.006*"never" + 0.005*"amp" + 0.005*"u" + 0.005*"hillary" + 0.004*"wall"
Topic: 1 Word: 0.011*"trump" + 0.009*"like" + 0.009*"lie" + 0.008*"hillary" + 0.007*"know" + 0.007*"vote" + 0.007*"u" + 0.006*"one" + 0.006*"look" + 0.006*"think"
Topic: 2 Word: 0.021*"deplorable" + 0.011*"l" + 0.011*"tax" + 0.009*"check" + 0.009*"tweet" + 0.007*"return" + 0.006*"release" + 0.006*"trump" + 0.005*"wow" + 0.005*"u"
Topic: 3 Word: 0.081*"0" + 0.020*"2016" + 0.019*"f" + 0.018*"fine" + 0.017*"today" + 0.017*"pressure" + 0.017*"30" + 0.016*"wind" + 0.016*"temp" + 0.016*"ky"
Topic: 4 Word: 0.009*"liar" + 0.008*"truth" + 0.006*"trump" + 0.006*"amp" + 0.005*"question" + 0.005*"u" + 0.005*"lie" + 0.005*"hillary" + 0.004*"medium" + 0.004*"like"
Topic: 5 Word: 0.022*"de" + 0.015*"la" + 0.014*"que" + 0.012*"love" + 0.012*"true" + 0.010*"el" + 0.010*"en" + 0.008*"un" + 0.007*"con" + 0.006*"se"
Topic: 6 Word: 0.010*"amp" +

In [11]:
#Evaluating model performance
for index, score in sorted(lda_model_tfidf[bow_corpus[310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))
    
    
#testing model on unseen data
new_tweet = 'The art of war is not something to be celebrated. #realshit'
bow_vector = dictionary.doc2bow(raw_tokens(new_tweet))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

NameError: name 'lda_model_tfidf' is not defined

In [None]:
stemmer = SnowballStemmer("english")
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in df['text']:
    
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
    
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

print (vocab_frame.head())


In [None]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=100000,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(text) #fit the vectorizer to synopses
print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()

from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
print

from sklearn.cluster import KMeans
num_clusters = 10
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

dframe = { 'country': country, 'text': text, 'cluster': clusters,'tokens': allwords_stemmed }
frame = pd.DataFrame(dframe, index = [clusters] , columns = ['country', 'text' 'cluster', 'tokens'])

In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :]5: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d tokens:" % i, end='')
    for title in frame.ix[i]['tokens'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

# Sentiment analysis 

In [20]:
# Download movie rewiews from nltk (tagged with pos/neg sentiment) 
from nltk.corpus import movie_reviews as mr
from collections import defaultdict
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\cliff\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\cliff\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\cliff\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\cliff\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\cliff\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     C:\Users\cliff\AppData\Roaming\nl

KeyboardInterrupt: 

In [None]:
def word_feats(words):
    return dict([(word, True) for word in words])

In [8]:
# create list of movie reviews by sentiment
documents = defaultdict(list)
for i in mr.fileids():
    documents[i.split('/')[0]].append(i)

print(documents['pos'][:10]) # first ten pos reviews.
print(documents['neg'][:10]) # neg

['pos/cv000_29590.txt', 'pos/cv001_18431.txt', 'pos/cv002_15918.txt', 'pos/cv003_11664.txt', 'pos/cv004_11636.txt', 'pos/cv005_29443.txt', 'pos/cv006_15448.txt', 'pos/cv007_4968.txt', 'pos/cv008_29435.txt', 'pos/cv009_29592.txt']
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [9]:
# extract all words from movie reviews with given sentiment
import string
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

In [12]:
from pprint import pprint
from nltk.classify import NaiveBayesClassifier

In [16]:
# Input: list of training docs, output: all words in those docs.
def get_words_in_training(docs):
    all_words = []
    for (words, sentiment) in docs:
        all_words.extend(words)
    return all_words

# Input: list of words, output: features
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_training(documents))

In [18]:
# Input: list of tweets, output: feature list for that tweet
def extract_features(tweets):
    for tweet in tweets:
        document_words = tweet.split(' ')
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in tweet)
        return features

# get training set
training_set = nltk.classify.apply_features(extract_features, documents)

In [20]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

KeyboardInterrupt: 

TO DO: clean up features list, i.e. throw away useless features and words that don't show up in the tweets.