# Case 1: Predicting elections using Twitter

### GOALS

1. Analyse the social media posts (i.e., tweets) related to the upcoming elections in the USA. Example questions that you should aim to answer include, but are not limited to:

- What is the sentiment towards the candidates? This will involve extracting sentiment from the textual content of the tweets, using one of the toolkits mentioned in the lectures, such as SentiStrength.

- Where are they more popular? The tweets can be associated with a particular geolocation, which can be further used to study the geographic distribution of political preferences.

- Which topics are they and their supporters talking about? In the second week of the use case we will talk in more detail about the topical analysis. Until then, you could start by e.g., analysing the hashtags associated with each of the candidates.

- How big is the divide between the supporters? Here you could, for example, analyse whether the users normally follow and interact with their preferred candidate and like-minded supporters or they are interacting with the opponent's social network as well.

2. Investigate the existence of a relation between voting preferences and various statistics, related to e.g. demographics, education, income, health care and religion.


In [None]:
# in order to execute in kike's computer... 
# if missing any submodule from nltk, run in python: nltk.download()

import math
from shapely.geometry import Point, mapping, shape
from mpl_toolkits.axes_grid1 import make_axes_locatable
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from shapely.geometry import Point, mapping, shape
from mpl_toolkits.axes_grid1 import make_axes_locatable
import random
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import urllib2
from bs4 import BeautifulSoup
from matplotlib.colors import rgb2hex
from descartes import PolygonPatch
from shapely.geometry import Polygon, MultiPolygon, shape
import shapely.affinity # <- test this for scaling
import matplotlib.colors as colors
import matplotlib.cm as cmx
import csv
from  matplotlib.pyplot import colorbar
import json
from pymongo import MongoClient
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob
# prepare for Python version 3x features and functions
from __future__ import division, print_function
# import packages for text processing and multivariate analysis
import re  # regular expressions
import nltk  # draw on the Python natural language toolkit
import pandas as pd  # DataFrame structure and operations
import numpy as np  # arrays and numerical processing
import scipy
import matplotlib.pyplot as plt  # 2D plotting
# terms-by-documents matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans  # cluster analysis by partitioning
from sklearn.decomposition import PCA  # principal component analysis

#### Load all tweets to MongoDB

In [None]:
# Steps to use MongoDB:
# 1. pip install pymongo
# 2. install MongoDB for windows
# 3. create folder C:\data\db
# 4. C:\Program Files\MongoDB\Server\3.2\bin>mongoimport.exe /collection:election_tweets 
        # D:\Master\FDS\works\work2\data\geotagged_tweets_20160812-0912.jsons
# 5. C:\Program Files\MongoDB\Server\3.2\bin>mongod.exe

# Connect to MongoDB using MongoClient's default connection settings.
client = MongoClient()
#client.server_info()  # test purpose
# Use the database named "test" -- presumably where the mongoimport command of
# step 4 stored the election_tweets collection (no database is named there);
# TODO confirm.
db = client.test

In [None]:
# check that we actually have a connection to the MongoDB
# NOTE(review): the bare expression below is neither printed nor assigned, so
# on its own it does not appear to verify anything -- confirm and consider
# removing it. The loop underneath is what actually reads from the server.
db.election_tweets[1]
# Smoke test: print the hashtag texts of the first 10 stored tweets.
for a in db.get_collection('election_tweets').find({}).limit(10):
    for b in a["entities"]["hashtags"]:
        print(b["text"])

#### Functions for preprocessing

In [None]:
# open list of 1000 common english words. Source Facebook uploaded by David Langerveld
# The whole file is read into one string; it is split into individual words
# later, when the stop-word list is assembled.
DIR = "./words.txt"
with open(DIR) as f:
    common_words = f.read()

In [None]:
# define list of codes to be dropped from documents
# carriage-returns, line-feeds, tabs
codelist = ['\r', '\n', '\t']

# contractions and other word strings to drop from further analysis, adding
# to the usual English stopwords to be dropped from the document collection
more_stop_words = ['cant','didnt','doesnt','dont','goes','isnt','hes',\
    'shes','thats','theres','theyre','wont','youll','youre','youve',\
    're','tv','g','us','en','ve','vg','didn','pg','gp','our','we',
    'll','film','video','name','years','days','one','two','three', 'amp',\
    'four','five','six','seven','eight','nine','ten','eleven','twelve']
# Final stop list used by text_parse: NLTK's English stopwords, the extra
# words above, and every whitespace-separated word read from words.txt.
# If the NLTK stopwords corpus is missing, fetch it via nltk.download().
stoplist = nltk.corpus.stopwords.words('english') + more_stop_words + common_words.split()

# maybe using BeautifulSoup
# text parsing function for creating text documents 
# there is more we could do for data preparation 
# stemming... looking for contractions... possessives... 
# but we will work with what we have in this parsing function
# if we want to do stemming at a later time, we can use
#     porter = nltk.PorterStemmer()  
# in a construction like this
#     words_stemmed =  [porter.stem(word) for word in initial_words]  
def text_parse(string, stop_words=None):
    """Normalise raw text into a lower-case, space-separated string of words.

    Every character that is not an ASCII letter (digits, punctuation,
    carriage returns, line feeds, tabs, ...) is treated as a separator.
    The remaining words are lower-cased, and single-letter words and stop
    words are dropped.

    Words are filtered token by token rather than with one
    ``' word '`` regex substitution per stop word. The substitution
    approach consumes the space shared by two neighbouring matches, so it
    leaves every second stop word in a run (``"the the cat"``) and any stop
    word at the very start or end of the text in place. It also interprets
    each stop word as a regular expression.

    Parameters
    ----------
    string : str
        The raw text to clean.
    stop_words : iterable of str, optional
        Words to remove (compared against the lower-cased tokens). Defaults
        to the module-level ``stoplist``.

    Returns
    -------
    str
        The kept words joined by single spaces, without leading or trailing
        whitespace. Empty if nothing is left.
    """
    if stop_words is None:
        stop_words = stoplist
    # A set gives O(1) membership tests instead of one regex pass per stop word.
    excluded = frozenset(str(word) for word in stop_words)
    # Replacing non-letters by spaces also takes care of \r, \n and \t.
    tokens = re.sub('[^a-zA-Z]', ' ', string).lower().split()
    kept = [token for token in tokens
            if len(token) > 1 and token not in excluded]
    return ' '.join(kept)

def RemoveHTMLTags(data):
    """Return *data* with every ``<...>`` tag-like span deleted."""
    return re.sub(r'<[^<]*?>', '', data)

def RemoveLinks(data):
    """Return *data* with every http:// or https:// URL deleted."""
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub("", data)

def extractLinks(data):
    """Return the list of http:// and https:// URLs found in *data*, in order."""
    return re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        data)

def removeMentions(data):
    """Return *data* without @mentions.

    A mention is only removed when the ``@`` is at the start of the text or
    follows a character other than a letter, digit, ``-`` or ``.`` (so
    e-mail addresses are left alone).
    """
    mention_pattern = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')
    return mention_pattern.sub("", data)

def removeHashtags(data):
    """Return *data* without #hashtags.

    A hashtag is only removed when the ``#`` is at the start of the text or
    follows a character other than a letter, digit, ``-`` or ``.``.
    """
    hashtag_pattern = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))#([A-Za-z0-9_]+)')
    return hashtag_pattern.sub("", data)

def extractWebText(link):
    """Download the page at *link* and return its cleaned, visible text.

    The page is fetched with urllib2 and parsed with BeautifulSoup. The
    contents of ``<script>`` and ``<style>`` elements are discarded, and the
    remaining text is passed through RemoveLinks, removeMentions,
    RemoveHTMLTags and text_parse.

    Parameters
    ----------
    link : str
        URL of the page to fetch.

    Returns
    -------
    str
        The cleaned page text as produced by text_parse.

    Raises
    ------
    Whatever ``urllib2.urlopen`` raises for an unreachable or invalid URL;
    nothing is caught here.
    """
    from contextlib import closing

    # closing() guarantees the connection is released even if read() fails;
    # the urllib2 response object is not a context manager by itself.
    with closing(urllib2.urlopen(link)) as response:
        html = response.read()
    # Collapse runs of blank lines into a single line break before parsing.
    html = re.sub(r'\n\s*\n', '\r\n', html)
    soup = BeautifulSoup(html, 'html.parser')

    # Script and style bodies are code, not readable text.
    for elem in soup.find_all(['script', 'style']):
        elem.extract()

    return text_parse(RemoveHTMLTags(removeMentions(RemoveLinks(soup.get_text()))))

def stemText(clean_text):
    """Return *clean_text* with every whitespace-separated word stemmed.

    Uses the English Snowball stemmer with ``ignore_stopwords=True``; the
    stems are joined back together with single spaces.
    """
    snowball = SnowballStemmer("english", ignore_stopwords=True)
    stems = []
    for token in clean_text.split():
        stems.append(str(snowball.stem(token)))
    return " ".join(stems)

In [None]:
# Create an index on place.country_code; the queries further down filter on
# this field ("place.country_code": 'US').
db.election_tweets.create_index("place.country_code")

In [None]:
# Segment the US tweets into 3 groups: Hillary, Trump, Neutral.
# For every tweet that has not been processed yet (no "clean" field) this cell
#   * extracts the links found in the text,
#   * stores the cleaned text with its TextBlob polarity and subjectivity,
#   * assigns the group whose hashtags occur most often in the tweet.
# remember that _id is the mongodb id and id is twitter's id
import time

t0 = time.time()

hillary = []
trump = []
neutral = []
tweets_count = 0
# defining hashtags that define groups
hillary_tags = ['nevertrump', 'imwithher', 'dumptrump','drumpf','trumptrainwreck', 'clintonfoundation', 'voteblue', 'hillary4us', 'notjustanywomen', 'trumpsodumb', 'hillnotjill']
trump_tags = ['crookedhillary','neverhillary', 'trumppence16','maga', "hillno", 'imnotwithher', 'clintonnewsnetwork', 'trumptrain', 'makeamericagreatagain', 'latinosfortrump', 'lockherup', 'gaysfortrump', 'trumpforpresident']
neutral_tags = ['donaldtrump','hillaryclinton', 'trump', 'trump', 'hillary', 'realdonaldtrump', 'jillnothill', 'hillaryclinton', 'election2016','jillstein', 'gogreen','greenparty','trump2016']

coll = db.election_tweets
# Result of the last write; stays None when the query matches no tweet, so
# the summary at the bottom cannot fail with a NameError.
retval = None

# Using hashtags to split
for a in coll.find(
    {'$and':[
            {"place.country_code":'US'},
            { "clean" : { '$exists':False }}
        ]}).batch_size(100):
    tweets_count += 1
    if (tweets_count % 50000 == 0):
        print("%i tweets processed in %i seconds..." % (tweets_count, time.time() - t0))

    # All changes for this tweet are collected here and written in ONE update.
    changes = {'$set': {}}

    # Extract and add the links to the document
    links = extractLinks(a["text"])
    if links:
        changes['$addToSet'] = {'links': {'$each': links}}

    clean_text = text_parse(RemoveHTMLTags(RemoveLinks(removeMentions(removeHashtags(a["text"])))))
    sentiment_text = TextBlob(clean_text).sentiment
    polarity = sentiment_text.polarity
    subjectivity = sentiment_text.subjectivity

    # Add clean text to the document. Tweets with an empty text are not
    # marked as clean, so they are picked up again on the next run.
    if a["text"]:
        changes['$set'].update({'clean_text': clean_text,
                                'subjectivity': subjectivity,
                                'polarity': polarity,
                                'clean': True})

    # Count how many hashtags of the tweet belong to each group
    h = 0
    t = 0
    n = 0
    for b in a["entities"]["hashtags"]:
        text = b["text"].lower()
        if text in hillary_tags:
            h += 1
        elif text in trump_tags:
            t += 1
        elif text in neutral_tags:
            n += 1

    # A candidate group needs a strict majority of the hashtags; every other
    # case (ties, neutral majority, no known hashtag) counts as neutral.
    if h > t and h > n:
        group = "hillary"
        hillary.append(a["id"])
    elif t > h and t > n:
        group = "trump"
        trump.append(a["id"])
    else:
        group = "neutral"
        neutral.append(a["id"])
    changes['$set']['group'] = group

    # Address the document by _id: it is always indexed and unique, whereas
    # filtering on twitter's id has no index here and, should a tweet have
    # been imported twice, would always hit the same first copy.
    retval = coll.update_one({'_id': a["_id"]}, changes)

# Write operation in MongoDB
print("writing...")
print("done!")
print(retval)
print("From %i tweets, %i are pro-hillary, %i are pro-trump, %i are neutral" % (tweets_count, len(hillary), len(trump), len(neutral)))
t1 = time.time()
total = t1-t0
print("%i total seconds" % (total))

### Sentiment toolkits comparison

In [None]:
# Compare TextBlob and Vader polarity on up to 10000 already-cleaned US tweets
# and dump the cleaned texts to cleaned_tweets.txt (index <TAB> text) so that
# SentiStrength can be run on exactly the same tweets.
cleaned_words = []
num_tweets = 0

# Building the analyzer loads its lexicon, so do it once, not per tweet.
sid = SentimentIntensityAnalyzer()

# Only tweets processed by the segmentation step carry the "clean_text" and
# "polarity" fields read below, hence "clean" must exist.
cleaned_us_tweets = db.get_collection('election_tweets').find(
    {'$and':[
            {"place.country_code":'US'},
            { "clean" : { '$exists':True }}
        ]}).limit(10000)

# "with" closes (and flushes) the file even if a tweet raises.
with open("cleaned_tweets.txt", 'w') as target:
    for i, a in enumerate(cleaned_us_tweets):
        # Vader gets the text with mentions and hashtags still in place.
        cleaned_tweet = text_parse(RemoveHTMLTags(RemoveLinks(a["text"])))
        cleaned_words.extend(cleaned_tweet.split())

        target.write(str(i))
        target.write('\t')
        target.write(a["clean_text"].strip().replace("\n", "").replace("\r", ""))
        target.write('\n')
        textblob_polarity = a["polarity"]

        ss = sid.polarity_scores(cleaned_tweet)
        vader_polarity = ss["compound"]

        print("%i\tTextBlob polarity score\t%f\tVader polarity score\t%f" % (int(i+1), float(textblob_polarity), float(vader_polarity)))
        num_tweets = i + 1
# in order to compare the results also with SentiStrength, we need to execute this:
# D:\Master\FDS\works\work2\sentistrength\sentistrength>java -jar 
#   SentiStrengthCom.jar sentidata D:/Master/FDS/works/work2/sentistrength/sentistrength/SentStrength_Data_December2015English/ 
#   input D:/Master/FDS/works/work2/cleaned_tweets.txt scale

# Then, we should append both the data from Vader and TextBlob with the one outputted by SentiStrength and do some statistics in
#   excel (tab separated data)

### plotting number of tweets vs timestamp

In [None]:
import matplotlib.pyplot as plt
import datetime

# Histogram of the day-of-month on which the pro-Trump US tweets were posted.
trump_us_tweets = db.get_collection('election_tweets').find(
    {'$and':[
            { "place.country_code":'US' },
            { "group" : 'trump' }
        ]})
days = []
for tweet in trump_us_tweets:
    # timestamp_ms is in milliseconds; fromtimestamp expects seconds
    seconds = float(unicode(tweet["timestamp_ms"])) / 1e3
    date = datetime.datetime.fromtimestamp(seconds)
    days.append(date.day)
plt.figure()
plt.hist(days, bins= 31,color="green")
plt.xlabel('Time')
plt.ylabel('Number of tweets in database')
plt.show()

### Plotting a map with majority of groups of tweets with each color

We first do an inverse lookup of the coordinates to find the name of the state to which each tweet belongs.

In [None]:
import pandas as pd
import numpy as np
coordinates = []
x = 0
# US tweets that already carry both a group and a polarity score
scored_us_tweets = db.get_collection('election_tweets').find(
    {'$and':[
            { "place.country_code":'US' },
            { "group" : {'$exists':'true'}},
            { "polarity" : {'$exists':'true'}}
        ]})
for tweet in scored_us_tweets:
    # first corner of the place's bounding box stands in for the tweet location
    corner = tweet['place']['bounding_box']['coordinates'][0][0]
    if not corner:
        continue
    coordinates.append([corner[0],
                        corner[1],
                        tweet['group'],
                        tweet['polarity'],
                        tweet['subjectivity']])


print("%d coordinates found" % len(coordinates) )
print(coordinates[:22], "...")

In [None]:
# Accumulate, per state, a signed group score plus polarity / subjectivity
# totals from the tweet coordinates collected above.
import os

BLUE = '#5599ff'
RED = '#ff0000'
title = "Balance of groups per state"

MAPS_DIR = '.'
with open(os.path.join(MAPS_DIR, 'states.geojson')) as rf:
    maps = json.load(rf)

# Build each shapely Point once instead of once per (state, tweet) pair.
located_tweets = [(Point(entry[0], entry[1]), entry[2], entry[3], entry[4])
                  for entry in coordinates]

state_score = {}
for feature in maps['features']:
    state_name = feature['properties'][u'STATE_NAME'] # state name
    polygon = shape(feature['geometry'])
    # state score, total polarity, total subjectivity, total tweets;
    # an entry that already exists for this state name is kept as is
    totals = state_score.setdefault(state_name, [0, 0, 0, 0])
    for point_, group, polarity, subjectivity in located_tweets:
        if not polygon.contains(point_):
            continue
        totals[1] += polarity
        totals[2] += subjectivity
        totals[3] += 1
        # Every tweet weighs between 0.5 and 1 depending on how strong its
        # polarity is; pro-Hillary tweets pull the score down, pro-Trump
        # tweets push it up, neutral tweets leave it unchanged.
        weight = 0.5 + 0.5 * math.fabs(polarity)
        if group == "hillary":
            totals[0] += -weight
        elif group == "trump":
            totals[0] += weight

In [None]:
# Compress every state score with a signed log(1 + |score|) and record the
# smallest and largest resulting score (both start at 0).
min_state_score = 0
max_state_score = 0
for score in state_score.values():
    raw_score = score[0]
    if raw_score > 0:
        score[0] = math.log(1 + raw_score)
    elif raw_score < 0:
        score[0] = -math.log(1 - raw_score)
    min_state_score = min(min_state_score, score[0])
    max_state_score = max(max_state_score, score[0])

print("min score is %f and max score is %f" % (min_state_score, max_state_score))

#### barplot

In [None]:
# One bar per state showing its (log-scaled) group score
bar_positions = range(len(state_score))
bar_heights = [entry[0] for entry in state_score.values()]
plt.bar(bar_positions, bar_heights, align='center',color="white")
plt.grid()
plt.xticks(bar_positions, state_score.keys())
locs, labels = plt.xticks()
# state names are long: print them vertically
plt.setp(labels, rotation=90)
plt.show()

In [None]:
# Draw every state polygon of the geojson, filled with the colour that the
# 'seismic' colormap assigns to its score, and add a colorbar.
fig = plt.figure(figsize=(20,10))
ax = fig.gca()

cm_ = plt.get_cmap('seismic') # Name of colormap we want to use
# Careful with normalizing, it will not automatically calculate percentages, it will only paint absolute values
cNorm  = colors.Normalize(vmin=min_state_score, vmax=max_state_score) # Range of values expected in order to be able to normalize any incoming data
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm_) # Create the scalarMap itself

for feature in maps['features']:
    state_name = feature['properties'][u'STATE_NAME'] # state name
    state_fips = feature['properties'][u'STATE_FIPS'] # example of data to be used
    geometry = feature['geometry']
    s = shape(geometry)
    #colorVal = scalarMap.to_rgba(values[state_name])
    colorVal = scalarMap.to_rgba(state_score[state_name][0])

    # Hawaii and Alaska are rescaled and moved so they fit in the figure
    # next to the other states.
    if state_name == 'Hawaii': # make it bigger and move it to the right
        s = shapely.affinity.scale(s,xfact=3, yfact=3)
        s = shapely.affinity.translate(s, xoff=35, yoff=-6)
    elif state_name == 'Alaska': # make it a little bit smaller and move it down
        s = shapely.affinity.scale(s,xfact=0.5, yfact=0.5 )
        s = shapely.affinity.translate(s, xoff =28 , yoff=-35)
    # print(state_name + " - " + total_hospitals[state_name])
    # A plain Polygon is one patch; any other geometry type is drawn part
    # by part through its .geoms.
    if geometry['type'] == 'Polygon':
        ax.add_patch(PolygonPatch(s, fc=colorVal,ec=BLUE, alpha=0.5, zorder=2))
    else:
        for g in s.geoms:
            ax.add_patch(PolygonPatch(g,fc=colorVal, ec=BLUE, alpha=0.5, zorder=2))
    # label the state at the centroid of its (possibly moved) shape
    plt.text(s.centroid.x, s.centroid.y,state_name, fontsize=6)

ax.axis('scaled')
plt.axis('off')
plt.title(title)
# Separate mappable, with the same norm and colormap, that backs the colorbar
cmmapable = cmx.ScalarMappable(cNorm, cm_)
cmmapable.set_array(range(int(min_state_score),int(max_state_score)))
plt.colorbar(cmmapable, fraction=0.01, pad=0.04)
plt.show()

### Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Collect the words of the cleaned tweets, one flat list per group
cleaned_words_split_hillary = []
cleaned_words_split_trump = []
cleaned_words_split_neutral = []
words_by_group = {"hillary": cleaned_words_split_hillary,
                  "trump": cleaned_words_split_trump,
                  "neutral": cleaned_words_split_neutral}

for a in db.get_collection('election_tweets').find(
    {'$and':[
        { "place.country_code":'US' },
        { "clean_text" : { '$exists':'true' }},
        { "group" : { '$exists':'true' }
    }]}):
    # tweets of any other group are ignored
    group_words = words_by_group.get(a["group"]) if a else None
    if group_words is not None:
        group_words.extend(a["clean_text"].split())

print("Hillary 1st 50 words: ")
print(cleaned_words_split_hillary[:50])

print("Trump 1st 50 words: ")
print(cleaned_words_split_trump[:50])

print("Neutral 1st 50 words: ")
print(cleaned_words_split_neutral[:50])

# One independent vectorizer per group, all with the same settings
vectorizer_options = dict(analyzer = "word",
                          tokenizer = None,
                          preprocessor = None,
                          stop_words = None,
                          max_features = 5000)
vectorizer_hillary = CountVectorizer(**vectorizer_options)
vectorizer_trump = CountVectorizer(**vectorizer_options)
vectorizer_neutral = CountVectorizer(**vectorizer_options)
data_features_hillary = vectorizer_hillary.fit_transform(cleaned_words_split_hillary)

In [None]:
# Vectorize the Trump and neutral word lists and convert the results to dense
# arrays. NOTE(review): data_features_hillary is NOT converted with
# .toarray() anywhere (the conversion cell further down is commented out), so
# the three feature matrices are presumably of different types -- confirm
# before treating them alike. Dense arrays of this size may also need a lot
# of memory.
data_features_trump = vectorizer_trump.fit_transform(cleaned_words_split_trump).toarray()
data_features_neutral = vectorizer_neutral.fit_transform(cleaned_words_split_neutral).toarray()

In [None]:
# Show the dimensions of the Hillary feature matrix
print(data_features_hillary.shape)

In [None]:
#data_features_hillary = data_features_hillary.toarray()

In [None]:
# Vocabulary (feature names) learned by each group's vectorizer
vocab_hillary = vectorizer_hillary.get_feature_names()
vocab_trump = vectorizer_trump.get_feature_names()
vocab_neutral = vectorizer_neutral.get_feature_names()
print(vocab_hillary)

In [None]:
import numpy as np


def _ranked_term_counts(vocab, features):
    """Return ``(count, term)`` tuples sorted by descending count.

    *features* may be a dense array or a scipy sparse matrix with one column
    per entry of *vocab*. Summing a sparse matrix over axis 0 yields a
    1 x n_terms matrix, which iterates as a single row, so the column totals
    are flattened to a 1-D array before being paired with the vocabulary.
    Terms with equal counts keep their vocabulary order.
    """
    totals = np.asarray(features.sum(axis=0)).ravel()
    pairs = [(int(count), tag) for tag, count in zip(vocab, totals)]
    pairs.sort(key = lambda tup: -tup[0])
    return pairs


# For each group, print the 50 most frequent vocabulary words together with
# the number of times they appear
vocabulary_count_hillary = _ranked_term_counts(vocab_hillary, data_features_hillary)
print(str(vocabulary_count_hillary[0:50]))

vocabulary_count_trump = _ranked_term_counts(vocab_trump, data_features_trump)
print("\n" )
print(str(vocabulary_count_trump[0:50]))

vocabulary_count_neutral = _ranked_term_counts(vocab_neutral, data_features_neutral)
print("\n" )
print(str(vocabulary_count_neutral[0:50]))


### Word cloud

In [None]:
# Print all words gathered in cleaned_words as one space-separated string
print(" ".join(cleaned_words))

In [None]:
# kike's try
# Draw two word clouds from the words in cleaned_words: one with the default
# look and one on a white background with a smaller maximum font size.

from os import path
from wordcloud import WordCloud

# Read the whole text.
text = " ".join(cleaned_words)
# Generate a word cloud image
wordcloud = WordCloud(height=600, width=800).generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# take relative word frequencies into account, lower max_font_size
#'''
wordcloud = WordCloud(height=800, width=1100,max_font_size=70, relative_scaling=.3,background_color="white").generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
#'''

### Topic analysis

#### term document matrix vectorization + euclidean distance + MDS

In [None]:
# Flat list of cleaned words per group, input of the term analysis below
cleaned_text_split_hillary = []
cleaned_text_split_trump = []
cleaned_text_split_neutral = []
split_text_by_group = {"hillary": cleaned_text_split_hillary,
                       "trump": cleaned_text_split_trump,
                       "neutral": cleaned_text_split_neutral}

for a in db.get_collection('election_tweets').find(
    {'$and':[
        { "place.country_code":'US' },
        { "clean_text" : { '$exists':'true' }},
        { "group" : { '$exists':'true' }
    }]}):
    # tweets of any other group are ignored
    group_terms = split_text_by_group.get(a["group"]) if a else None
    if group_terms is not None:
        group_terms.extend(a["clean_text"].split())

In [None]:
# One binary vectorizer per group, limited to the 100 most frequent terms
tdm_options = dict(max_features = 100, binary = True) # term document matrix
tdm_method_hillary = CountVectorizer(**tdm_options)
tdm_method_trump = CountVectorizer(**tdm_options)
tdm_method_neutral = CountVectorizer(**tdm_options)

# Learn each group's vocabulary ...
examine_tdm_hillary = tdm_method_hillary.fit(cleaned_text_split_hillary)
examine_tdm_trump = tdm_method_trump.fit(cleaned_text_split_trump)
examine_tdm_neutral = tdm_method_neutral.fit(cleaned_text_split_neutral)

# ... and keep the retained terms as the groups' top words
top_words_hillary = examine_tdm_hillary.get_feature_names()
top_words_trump = examine_tdm_trump.get_feature_names()
top_words_neutral = examine_tdm_neutral.get_feature_names()

In [None]:

from sklearn.metrics import euclidean_distances
# linear_kernel is a dot product (a similarity), not a distance, so it must
# not be aliased as cosine_distances; import the real function instead.
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import manhattan_distances as manhattan_distances

from sklearn import manifold  # multidimensional scaling


def _term_mds(vectorizer, documents):
    """Vectorize *documents* and place the vectorizer's terms on a 2-D plane.

    Steps: fit_transform with *vectorizer*, densify, transpose, compute the
    pairwise Manhattan distances between the rows of the transposed matrix,
    and run metric MDS (2 components, random_state 9999) on that precomputed
    distance matrix.

    Returns the tuple ``(sparse matrix, dense matrix, transposed dense
    matrix, distance matrix, fitted MDS object, MDS coordinates)`` so the
    caller can keep every intermediate result.
    """
    sparse_matrix = vectorizer.fit_transform(documents)
    dense_matrix = sparse_matrix.todense()
    transposed = dense_matrix.transpose()
    # alternatives: euclidean_distances(transposed), cosine_distances(transposed)
    distances = manhattan_distances(transposed)
    mds = manifold.MDS(n_components = 2, random_state = 9999,
                       dissimilarity = 'precomputed')
    # fit_transform fits AND returns the coordinates; calling fit() first
    # would only run the same (expensive) optimisation a second time.
    positions = mds.fit_transform(distances)
    return sparse_matrix, dense_matrix, transposed, distances, mds, positions


(sparse_tdm_hillary, tweet_tdm_hillary, tweet_dtm_hillary,
 tweet_distance_matrix_hillary, mds_method_hillary,
 mds_coordinates_hillary) = _term_mds(tdm_method_hillary, cleaned_text_split_hillary)
mds_fit_hillary = mds_method_hillary  # fit() used to return this same object

(sparse_tdm_trump, tweet_tdm_trump, tweet_dtm_trump,
 tweet_distance_matrix_trump, mds_method_trump,
 mds_coordinates_trump) = _term_mds(tdm_method_trump, cleaned_text_split_trump)
mds_fit_trump = mds_method_trump

(sparse_tdm_neutral, tweet_tdm_neutral, tweet_dtm_neutral,
 tweet_distance_matrix_neutral, mds_method_neutral,
 mds_coordinates_neutral) = _term_mds(tdm_method_neutral, cleaned_text_split_neutral)
mds_fit_neutral = mds_method_neutral

Scatter plot of the distances between word frequencies

In [None]:
# Plot the top pro-Hillary terms at their MDS coordinates
plt.figure()
hillary_xs = mds_coordinates_hillary[:,0]
hillary_ys = mds_coordinates_hillary[:,1]
# invisible points: they only set the axes limits for the text labels
plt.scatter(hillary_xs, hillary_ys, facecolors = 'none', edgecolors = 'none')
labels = []
for label, x, y in zip(top_words_hillary, hillary_xs, hillary_ys):
    plt.annotate(label, (x,y), xycoords = 'data',  fontsize = 18)
plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
plt.title('Pro-Hillary tweets. Distance between terms')
plt.show()

In [None]:
# Plot the top pro-Trump terms at their MDS coordinates
plt.figure()
trump_xs = mds_coordinates_trump[:,0]
trump_ys = mds_coordinates_trump[:,1]
# invisible points: they only set the axes limits for the text labels
plt.scatter(trump_xs, trump_ys, facecolors = 'none', edgecolors = 'none')
labels = []
for label, x, y in zip(top_words_trump, trump_xs, trump_ys):
    plt.annotate(label, (x,y), xycoords = 'data')
plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
plt.title('Pro-Trump tweets. Distance between terms')
plt.show()

In [None]:
# Plot the top neutral terms at their MDS coordinates
plt.figure()
neutral_xs = mds_coordinates_neutral[:,0]
neutral_ys = mds_coordinates_neutral[:,1]
# invisible points: they only set the axes limits for the text labels
plt.scatter(neutral_xs, neutral_ys, facecolors = 'none', edgecolors = 'none')
labels = []
for label, x, y in zip(top_words_neutral, neutral_xs, neutral_ys):
    plt.annotate(label, (x,y), xycoords = 'data')
plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
plt.title('Neutral tweets. Distance between terms')
plt.show()

#### K-Means

In [None]:
# Cleaned text of every grouped US tweet, one string per tweet
grouped_us_tweets = db.get_collection('election_tweets').find(
    {'$and':[
        { "place.country_code":'US' },
        { "clean_text" : { '$exists':'true' }},
        { "group" : { '$exists':'true' }
    }]})
cleaned_text_split_all = [doc["clean_text"] for doc in grouped_us_tweets if doc]

In [None]:
# Display the collected tweet texts (notebook cell output)
cleaned_text_split_all

In [None]:
# Binary vectorizer restricted to the 200 most frequent terms of all tweets
tdm_method = CountVectorizer(max_features = 200, binary = True)
examine_tweets_tdm = tdm_method.fit(cleaned_text_split_all)

top_words_all = examine_tweets_tdm.get_feature_names()

sparse_tweets_tdm = tdm_method.fit_transform(cleaned_text_split_all)
# convert sparse matrix into a regular (dense) matrix
# NOTE(review): scikit-learn vectorizers return one ROW per input document,
# so despite its name tweets_tdm is presumably tweets x terms and its
# transpose tweets_dtm terms x tweets -- confirm before relying on the names.
tweets_tdm = sparse_tweets_tdm.todense()
# transpose of the matrix above
tweets_dtm = tweets_tdm.transpose()

In [None]:
from sklearn.cluster import KMeans

# classification of words into groups for further analysis
# use transpose of the terms-by-document matrix and cluster analysis
# try five clusters/groups of words
#
# KMeans clusters the ROWS it is given. To obtain one cluster label and one
# set of centre distances per word (so they line up with top_words_all), the
# transposed matrix tweets_dtm has to be clustered, as the comment above
# says -- clustering tweets_tdm labels every tweet instead.
# np.asarray turns the np.matrix into a plain 2-D array without copying.
term_vectors = np.asarray(tweets_dtm)
clustering_method = KMeans(n_clusters = 5, random_state = 9999)
clustering_solution = clustering_method.fit(term_vectors)
cluster_membership = clustering_method.predict(term_vectors)
word_distance_to_center = clustering_method.transform(term_vectors)


# top words data frame for reporting k-means clustering results

In [None]:
# Debug check: number of entries in the 'cluster' column.
# NOTE: top_words_data is only assigned in a cell further down in this file.
len(top_words_data['cluster'])

In [None]:
# Debug check: number of entries in the 'word' column
len(top_words_data['word'])

In [None]:
# Debug check: number of entries in the 'dist_to_0' column
len(top_words_data['dist_to_0'])

In [None]:
# Debug check: number of entries in the 'dist_to_1' column
len(top_words_data['dist_to_1'])

In [None]:
# Debug check: number of entries in the 'dist_to_2' column
len(top_words_data['dist_to_2'])

In [None]:
# Debug check: number of entries in the 'dist_to_3' column
len(top_words_data['dist_to_3'])

In [None]:
# Debug check: number of entries in the 'dist_to_4' column
len(top_words_data['dist_to_4'])

In [None]:
from pandas import DataFrame, Series
# Wrap every column in a Series first, then build the frame from the columns
series_by_column = {k: Series(v) for k, v in top_words_data.iteritems()}
top_words_data_frame = pd.DataFrame(series_by_column)

In [None]:
# top words data frame for reporting k-means clustering results
top_words_data = {'word': top_words_all, 'cluster': cluster_membership,\
    'dist_to_0': word_distance_to_center[0:,0],\
    'dist_to_1': word_distance_to_center[0:,1],\
    'dist_to_2': word_distance_to_center[0:,2],\
    'dist_to_3': word_distance_to_center[0:,3],\
    'dist_to_4': word_distance_to_center[0:,4]}
distance_name_list = ['dist_to_0','dist_to_1','dist_to_2','dist_to_3','dist_to_4']
# Build the frame right here, from the data just assembled, so this cell does
# not depend on another cell having been run first. Wrapping each column in a
# Series lets pandas pad columns of unequal length with NaN instead of raising.
top_words_data_frame = pd.DataFrame(
    dict((name, pd.Series(values)) for name, values in top_words_data.items()))
for cluster in range(5):
    # Report only the words assigned to this cluster, closest to its centre
    # first. Sorting the complete frame would list words of other clusters
    # as well.
    words_in_cluster =\
        top_words_data_frame[top_words_data_frame['cluster'] == cluster]
    # sort_values replaces the deprecated sort_index(by=...)
    sorted_data_frame =\
        words_in_cluster.sort_values(by = distance_name_list[cluster],\
        ascending = True)
    print('\n Top Words in Cluster :',cluster,'------------------------------')
    print(sorted_data_frame.head())

We gave up on this approach, as we don't think topic grouping is the best idea in this context.

#### Assorted terms by group, font-size relative to frequency

In [None]:
# end of https://de.dariah.eu/tatom/topic_model_visualization.html

In [None]:
# Cleaned text of the US tweets, one string per tweet, split by group
cleaned_text_hillary = []
cleaned_text_trump = []
cleaned_text_neutral = []

for a in db.get_collection('election_tweets').find(
    {'$and':[
        { "place.country_code":'US' },
        { "clean_text" : { '$exists':'true' }},
        { "group" : { '$exists':'true' }
    }]}):
    # append() for every group: extend() with a string would add the text
    # one CHARACTER at a time instead of adding the tweet as one item.
    if a and a["group"]=="hillary":
        cleaned_text_hillary.append(a["clean_text"])
    elif a and a["group"]=="trump":
        cleaned_text_trump.append(a["clean_text"])
    elif a and a["group"]=="neutral":
        cleaned_text_neutral.append(a["clean_text"])
# take relative word frequencies into account, lower max_font_size
#'''

#### Word cloud with USA flag shape and group-color segmentation

In [None]:
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour *word* by the group that uses it more often.

    Red if the word occurs more often in cleaned_text_split_trump, blue if it
    occurs more often in cleaned_text_split_hillary, white on a tie. All
    other arguments are accepted for the WordCloud recolor callback and are
    ignored.
    """
    trump_uses = cleaned_text_split_trump.count(word)
    hillary_uses = cleaned_text_split_hillary.count(word)
    if trump_uses > hillary_uses:
        return "rgb(255,0,0)"
    if trump_uses < hillary_uses:
        return "rgb(0,0,255)"
    return "rgb(255, 255, 255)"


# Folder holding the mask images (absolute path on the author's machine)
D_DIR = 'D:\Master\FDS\works\work2\wordcloud'
# NOTE(review): usa_mask is loaded but only usa_mask_color is passed to
# WordCloud below -- confirm whether usa_mask is still needed.
usa_mask = np.array(Image.open(path.join(D_DIR, "eeuumask.jpg")))
usa_mask_color = np.array(Image.open(path.join(D_DIR, "eeuumaskcolor.jpg")))

# Word cloud over the pro-Trump and pro-Hillary words together, shaped by
# the mask image
wordcloud = WordCloud(mask=usa_mask_color,
                      height=800, width=1100,max_font_size=200,
                      relative_scaling=.05, background_color="white").generate(" ".join(cleaned_text_split_trump + cleaned_text_split_hillary))


#image_colors = ImageColorGenerator(usa_mask_color)  # trying different approaches with WordCloud

# Recolour every word with color_func before showing the image
plt.imshow(wordcloud.recolor(color_func=color_func, random_state=3))
plt.axis("off")
plt.show()