In [None]:
##########################################################################
# author: patricewangen
# created: 17 March 2020
# last_edited: 17 March 2020
##########################################################################

# TODO
# (1) Homework Solutions (Week 6)

In [3]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Exercise 06
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Recreate the df_joint dataframe with 3 rows containing the 
# collected tweets from Boris, Theresa, and Donald.

# First let's load the love-triangle dataframe, subset it to the
# columns 'tweet_text' and 'user_handle' where no 'tweet_text' entry
# is missing. Then reset the index to prevent confusion later on.
import pandas as pd
import feather

# Read the love-triangle tweets with pandas' own feather reader, so this
# step does not depend on the separate `feather` wrapper package.
tweets_raw = pd.read_feather("DATA/love-triangle.feather")

# Keep only the rows that actually have a 'tweet_text', restrict the frame
# to the two columns we need, and renumber the rows from 0 so that the
# position-based row lookups later on line up with the index.
has_text = tweets_raw['tweet_text'].notna()
df = tweets_raw.loc[has_text, ['user_handle', 'tweet_text']].reset_index(drop=True)
df

Unnamed: 0,user_handle,tweet_text
0,BorisJohnson,Corbyn and his friends in Parliament don’t tru...
1,BorisJohnson,Fantastic to address our party faithful at the...
2,theresa_may,You want this stage of the Brexit process to b...
3,eucopresident,EU27 unanimously agrees on its response to UK’...
4,BorisJohnson,I’m deeply honoured to have secured more than ...
...,...,...
91,BorisJohnson,I’m standing to be Leader of the Conservative ...
92,BorisJohnson,Jeremy Corbyn wants to cancel the referendum a...
93,BorisJohnson,Let’s come together and get Brexit done on Oct...
94,BorisJohnson,Thank you @JSHeappey for the invitation to spe...


In [4]:
# To find out which accounts are present, take the 'user_handle' column
# and ask it for its distinct values via .unique(). The resulting array
# is kept in the new object `handles` for the following steps.
handle_column = df['user_handle']
handles = handle_column.unique()
handles

array(['BorisJohnson', 'theresa_may', 'eucopresident'], dtype=object)

In [5]:
# The previous script carried out the following steps with a for loop.
# As always in python, though, there are several routes to the same
# result, so this time we get there with a list comprehension instead.

# First, set up a dataframe that holds nothing but the handles (one per
# row); further columns are attached to it in the cells below.
df_joint = pd.DataFrame(dict(user_handle=handles))
df_joint

Unnamed: 0,user_handle
0,BorisJohnson
1,theresa_may
2,eucopresident


In [6]:
# For every handle, pick that user's rows in df, glue their tweets
# together with single spaces, and store the resulting strings (one per
# handle, in the order of `handles`) as the new column 'tweet_text'.
df_joint['tweet_text'] = [
    " ".join(df.loc[df['user_handle'] == account, 'tweet_text'])
    for account in handles
]
df_joint

Unnamed: 0,user_handle,tweet_text
0,BorisJohnson,Corbyn and his friends in Parliament don’t tru...
1,theresa_may,You want this stage of the Brexit process to b...
2,eucopresident,EU27 unanimously agrees on its response to UK’...


In [None]:
# (2) Use spacy to pre-process each tweet collection: Get the lemmas
# and keep only nouns, adjectives, adverbs, and verbs. Use the spacy
# documentation to find out which tags to use.

# If we followed the installing instructions for spacy from the last 
# session (06_NLP-3), we can load the big english models with the
# word vectors into a new object nlp, which we can then use to
# process texts
import spacy
# NOTE(review): "en_default" is presumably a model name / shortcut link set
# up by the course's installation instructions — confirm that it resolves
# in the target environment before relying on a fresh-kernel run.
nlp = spacy.load("en_default")

In [14]:
# `explain` is a helper function of the spacy module itself, not an
# attribute of the loaded pipeline object (`nlp.explain` raises an
# AttributeError). Look up the meaning of the part-of-speech tags that
# the pre-processing filter below keeps.
{tag: spacy.explain(tag) for tag in ["NOUN", "ADJ", "ADV", "VERB"]}

{'NOUN': 'noun', 'ADJ': 'adjective', 'ADV': 'adverb', 'VERB': 'verb'}

In [20]:
# Part-of-speech tags to keep: nouns, adjectives, adverbs, and verbs.
wanted_pos = {"NOUN", "ADJ", "ADV", "VERB"}

processed_texts = []
for text in df_joint['tweet_text']:
    # Passing a string through nlp() returns a spacy-specific object
    # that carries information on every single token (and more).
    parsed = nlp(text)

    # .lemma_ on a token gives its lemmatized form and .pos_ gives its
    # part-of-speech tag. Combining both in a list comprehension keeps
    # the lemma of every token whose tag is one of the wanted ones.
    lemmas = [tok.lemma_ for tok in parsed if tok.pos_ in wanted_pos]

    # This still lets the '#' sign through as a token of its own, as
    # well as URLs. With what we learned in the earlier pre-processing
    # sessions we drop every token that is exactly "#" and every token
    # that contains the sub-string "https://".
    lemmas = [lemma for lemma in lemmas
              if lemma != "#" and "https://" not in lemma]

    # Glue the remaining tokens back together into one string,
    # separated by whitespace.
    processed_texts.append(" ".join(lemmas))

# Store the processed strings, one per row, in the column 'processed'.
df_joint['processed'] = processed_texts
df_joint

Unnamed: 0,user_handle,tweet_text,processed
0,BorisJohnson,Corbyn and his friends in Parliament don’t tru...,friend trust make decision let put people more...
1,theresa_may,You want this stage of the Brexit process to b...,want stage process over do agree side let expl...
2,eucopresident,EU27 unanimously agrees on its response to UK’...,unanimously agree response request will now me...


In [23]:
# (3) Turn the pre-processed documents into a document term matrix
# with the CountVectorizer and use this to compare the cosine 
# similarity between Boris, Theresa, and Donald's Brexit tweets.

from sklearn.feature_extraction.text import CountVectorizer

# Same idea as before: a strict whitespace tokenizer that we hand to the
# vectorizer so that it leaves our own pre-processing untouched.
def split_tok(text):
    """Return the whitespace-separated pieces of `text` as a list."""
    pieces = text.split()
    return pieces

# Count vectorizer that tokenizes with split_tok; fitting it on the
# pre-processed documents and transforming them in one go yields the
# document-term matrix.
vec = CountVectorizer(tokenizer=split_tok)
processed_docs = df_joint['processed']
dtm = vec.fit_transform(processed_docs)
dtm

<3x472 sparse matrix of type '<class 'numpy.int64'>'
	with 594 stored elements in Compressed Sparse Row format>

In [25]:
# In line with our prior beliefs, this approach to pre-processing
# puts Boris and Theresa closer to each other (0.45) than Boris
# and Donald (0.16), but also recognizes that Theresa is slightly
# closer to Donald (0.22) than Boris is (0.16).
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows (users) of the
# document-term matrix.
user_similarity = cosine_similarity(dtm)
user_similarity

array([[1.        , 0.44648143, 0.16338609],
       [0.44648143, 1.        , 0.22005942],
       [0.16338609, 0.22005942, 1.        ]])

In [29]:
# (4) Take the five tweets from the slides and turn them into word
# vectors with spacy (list of lists). Here are the texts...
# The five example tweets. Further down, the tweets at positions 0 and 1
# are treated as 'leave' tweets and those at positions 2, 3, and 4 as
# 'remain' tweets, so the order of this list matters.
tweets = ["""#Brexit means that we gain back sovereignty and that the UK will free itself from the detached Brussels elite. Boris will make Britain great again!!!""",
         """Brexit means Brexit! The EU should stop their bullshit and accept the new reality #MakeBritainGreatAgain""",
         """Brexit just means that we’ll have to wait in longer cues at passport control. So much for #MakeBritainGreatAgain... “Great” Britain my ass...""",
         """How are we gaining back sovereignty with Brexit?!? This is the greatest bullshit I’ve heard in a long time. I mean, Boris is completely detached from reality.""",
         """”Brexit means Brexit” – what does that even mean?!?!"""]

# Parse every tweet with spacy and keep only the document vector of the
# parsed result. A list comprehension collects these vectors directly,
# so we do not need an empty list and repeated .append() calls.
doc_vecs = [nlp(tweet_text).vector for tweet_text in tweets]

# The outcome is a list of 5 elements, one per tweet. Each element is a
# 300-dimensional document vector, i.e. the average of all the word
# vectors in that tweet based on spacy's pre-trained word embeddings.
doc_vecs

[array([-5.73753268e-02,  1.84684217e-01, -1.43384309e-02, -4.46681902e-02,
         1.41866267e-01, -9.53433663e-02, -9.45333019e-03, -1.51649630e-02,
         1.71476081e-02,  1.87934303e+00, -2.24433094e-01, -3.18026654e-02,
        -3.59606892e-02,  2.73565259e-02, -2.55980203e-03, -6.50298223e-02,
        -1.13845274e-01,  1.04823828e+00, -9.46318358e-02,  4.28462476e-02,
         8.88825879e-02, -7.09237978e-02,  5.09749725e-02, -6.00767927e-03,
        -9.68245789e-02,  2.80729849e-02, -2.99674328e-02, -3.58855948e-02,
        -3.15343104e-02,  4.87818308e-02, -1.03439681e-01,  1.69913247e-01,
        -1.34167382e-02,  1.31881967e-01,  7.10295066e-02, -3.43867168e-02,
        -3.98880243e-03,  1.86727606e-02, -5.58238178e-02, -4.19734381e-02,
         4.44168299e-02,  8.90220553e-02,  3.05377916e-02, -7.31205707e-03,
         5.38969561e-02,  7.95087963e-02, -5.91682382e-02, -5.87873685e-04,
        -6.48069307e-02,  1.02574393e-01, -2.10222844e-02,  1.23302080e-01,
        -6.2

In [36]:
# (5) Let's try to recreate the dimensions projection from the Kozlowski
# article. Create a new vector called leave_remain that averages the 
# difference between the vectors from the tweets above. Use np.mean()
# over a list of all the possible combinations of leave - remain tweets
# you can find. 
# Tip: Use the axis argument of np.mean() to make sure that you retain the
# 300-dimensional shape of the vectors.

# These 300-dimensional document vectors support plain arithmetic. If we
# subtract one vector from another, we obtain the difference for each
# of the numbers in the vector, which hopefully captures some of the
# semantic and syntactic difference between the two documents:
leave_vec, remain_vec = doc_vecs[0], doc_vecs[3]
leave_vec - remain_vec

array([-0.01456737,  0.00026548,  0.08464395,  0.03684132,  0.05973561,
       -0.07610595, -0.05820116,  0.11105946, -0.01186613, -0.14228678,
       -0.04856575, -0.06812349, -0.02447839,  0.00828139,  0.07108555,
       -0.02412979, -0.05622398,  0.05548227,  0.04681283,  0.05150695,
        0.02510512,  0.03780162,  0.05026171, -0.02900234, -0.05973253,
        0.04943634,  0.0137206 , -0.04708568, -0.04366104,  0.01840056,
       -0.04563695,  0.04358657,  0.05429286,  0.07643047, -0.02471695,
       -0.02302156, -0.04564812,  0.00467395,  0.04765047,  0.03465876,
       -0.0047119 ,  0.04900329,  0.02146515, -0.05601452,  0.02651796,
       -0.05637574,  0.08207415,  0.0576821 , -0.03189872,  0.08818001,
        0.02134376,  0.01880169,  0.06353053,  0.01421843,  0.07747495,
       -0.00875435,  0.02215547,  0.1488476 , -0.14048816,  0.00790052,
        0.02452351,  0.02418052,  0.08279602, -0.02371045, -0.0222975 ,
        0.09650191,  0.01192756,  0.05136617,  0.0626407 ,  0.02

In [38]:
# Subtracting the document vector of a remain tweet from the document
# vector of a leave tweet should give a new vector that carries
# information on the semantic and syntactic difference between such
# tweets. So we build every possible 'leave minus remain' pair from our
# two leave tweets [0,1] and our three remain tweets [2,3,4] and collect
# the resulting difference vectors in a list (all pairs for tweet 0
# first, then all pairs for tweet 1).
leave_ids = (0, 1)
remain_ids = (2, 3, 4)
all_differences = [doc_vecs[leave] - doc_vecs[remain]
                   for leave in leave_ids
                   for remain in remain_ids]
all_differences

[array([ 0.02107936,  0.01810402,  0.07412811,  0.00901265,  0.07108568,
        -0.08516826, -0.09161538,  0.11113922,  0.06176272,  0.03466976,
        -0.02932815, -0.10588561, -0.17337392,  0.06587249,  0.10025212,
        -0.07168448, -0.06148237, -0.00498521,  0.04460984,  0.01489728,
         0.09577478, -0.04146655,  0.02889952,  0.05574762, -0.04936552,
         0.02384836,  0.01592853,  0.04330487, -0.11681211,  0.09520814,
        -0.02668843,  0.07700412, -0.05759039,  0.06493492, -0.01723377,
        -0.02243357, -0.10799462, -0.04495156, -0.04131602, -0.03369019,
         0.08099426,  0.09294347,  0.04848008,  0.0336045 ,  0.00291575,
         0.02475733,  0.06506121, -0.03850925, -0.05028093,  0.0248425 ,
         0.00922215,  0.06852964,  0.01421449, -0.03090272,  0.06530939,
        -0.03446384,  0.01266321,  0.10456324, -0.04915629, -0.02626279,
        -0.02387024,  0.04065874,  0.07915448, -0.0389961 , -0.03998384,
         0.11086792,  0.04178683, -0.04420251,  0.0

In [39]:
# This gives us a list of six 300-dimensional vectors, each representing 
# the difference between various leave and remain tweets. We can use
# numpy's np.mean() function to get an averaged version of this that 
# we can use to map tweets onto a dimension as Kozlowski did in his 
# article. Set axis to 0 to tell numpy to average vertically and thereby
# keep the 300 dimensions of the document vectors.
import numpy as np
# Stack the difference vectors into one array and average over its
# first axis (axis=0), i.e. across the vectors, so that the result keeps
# one value per vector dimension.
leave_remain = np.asarray(all_differences).mean(axis=0)
leave_remain

array([-0.00157456, -0.05217792,  0.0419127 ,  0.04672951,  0.03271747,
       -0.03591959, -0.06676342,  0.12793745,  0.02696978, -0.07790915,
       -0.01495371, -0.061717  , -0.05641   , -0.01134036,  0.09178831,
       -0.0499446 , -0.07951175, -0.00321769,  0.06590649,  0.07055821,
        0.04887135,  0.00762104,  0.00772127,  0.00753059,  0.03128977,
        0.02764205, -0.03774627, -0.06877508, -0.05621868,  0.12549369,
       -0.02654669,  0.04367043, -0.02743331,  0.08200487, -0.01299664,
       -0.0727163 , -0.07999585, -0.00343554,  0.01903851, -0.00779018,
       -0.00813048,  0.05357983, -0.0063859 ,  0.02473996, -0.05586026,
       -0.05338119,  0.14121045,  0.04695305, -0.08191665,  0.03207045,
        0.02674058,  0.00592408, -0.00333192,  0.03248364,  0.00907045,
        0.03861285, -0.01780383,  0.12166525, -0.11691978, -0.02727823,
        0.01993375,  0.05368945, -0.064578  , -0.0064636 ,  0.0178501 ,
        0.1534072 ,  0.03899955, -0.04265344,  0.07913003, -0.05

In [44]:
# (6) Use cosine_similarity to see how each of these tweets is placed on
# this newly created leave-remain dimension.

# To do so, we compare every vector in doc_vecs with the single vector
# in leave_remain. cosine_similarity expects a collection of vectors on
# both sides, though, so the leave_remain vector first goes into a
# one-element list.
leave_remain_axis = [leave_remain]
cosine_similarity(doc_vecs, leave_remain_axis)

# As we talked about in class, since we subtracted remain from leave:
# + leave - remain
# vectors that are more positively associated with this averaged 
# difference vector are related to the semantic (or syntactic) idea
# of a pro-Brexit discourse (leave), and vectors that are more negatively
# associated with this are related to the semantic (or syntactic) idea
# of an anti-Brexit discourse (remain)

array([[-0.00930596],
       [ 0.02747884],
       [-0.25965503],
       [-0.26627576],
       [-0.44336742]], dtype=float32)

In [45]:
# (7) Use cosine_similarity to see whether this leave-remain dimension is
# sufficiently valid to place the collected tweets of Boris, Theresa, and
# Donald on the right place of this axis.
# Tip: It isn't. But the exercise is useful to see how such a dimension
# projection would be implemented in python ;-)

# Extract the document vectors once more, this time for the joint tweet
# text strings in df_joint: parse each user's text with spacy and keep
# the document vector of the result.
user_vecs = [nlp(joint_text).vector for joint_text in df_joint['tweet_text']]

user_vecs

[array([-2.51690019e-02,  1.67820632e-01, -8.56930092e-02, -2.47408804e-02,
         7.73024186e-02, -5.46702780e-02,  2.14121770e-02, -5.85045367e-02,
        -1.12713091e-02,  2.04421568e+00, -2.23940551e-01,  2.94953473e-02,
         3.46307084e-02, -4.26832363e-02, -1.16740309e-01, -4.99530174e-02,
        -8.92135054e-02,  9.33650255e-01, -1.47640347e-01,  3.40509713e-02,
         1.92046687e-02, -2.24611200e-02,  3.80156189e-03, -4.08812054e-02,
         2.62281275e-03,  3.80286798e-02, -8.37209150e-02, -2.99177486e-02,
         3.17714103e-02, -3.90791260e-02, -1.28931785e-02,  6.36653677e-02,
        -4.69526388e-02,  5.47716059e-02,  7.12287277e-02, -3.94926369e-02,
         7.70881679e-03,  1.43209714e-02, -1.42360181e-02, -3.97443250e-02,
        -3.61223780e-02,  5.60499951e-02, -2.16709799e-03, -4.56131212e-02,
         1.46303969e-02,  5.81216589e-02, -7.22816810e-02,  3.20975520e-02,
        -2.82105058e-03,  2.63937097e-03, -5.37741417e-03,  5.71525320e-02,
         3.1

In [47]:
# Placing the real tweets of Boris, Theresa, and Donald on this
# leave_remain dimension gives results that do not really match our
# prior beliefs about where they should end up. The variation is small
# to begin with, and Boris even lands slightly more on the remain side
# (-0.12) than both Theresa (-0.11) and Donald (-0.09).
leave_remain_axis = [leave_remain]
cosine_similarity(user_vecs, leave_remain_axis)

array([[-0.11716276],
       [-0.10989034],
       [-0.08986986]], dtype=float32)

In [None]:
# However, this does not necessarily invalidate the general logic of Kozlowski's
# approach or the idea of applying it to a Brexit-related dimension. If we 
# were to address this issue, we would probably try to find a more convincing
# set of explicitly leave or remain tweets, and try to extract an averaged
# difference vector that is better able to map tweets onto this ideological space
# in a way that lives up to our prior beliefs. 

# Even though word vectors are rather opaque in the way they function,
# you could set this up as a quasi-supervised learning task (even though that
# would probably be the wrong terminology): Hand-code a sample of tweets according
# to this dimension. Use a 'train'-set to create the difference vector, and use the
# test-set to validate whether this difference vector is able to replicate the coding
# in an out-of-sample scenario. If it works, you might have found an approach to
# map all tweets in our dataset onto this rather interesting ideological dimension.