In [None]:
##########################################################################
# author: patricewangen
# created: 21 February 2020
# last_edited: 25 February 2020
##########################################################################

# TODO
# (1) Homework Solutions
# (2) Document-Term Matrices
# (3) Pre-Processing: Tokenizing, Removing-Stuff, Stemming
# (4) Pairwise Cosine Similarity Scores

In [228]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Exercise 03
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Browse through the raw Twitter object, try to understand its 
# structure, and extract the following information about the status 
# update: "user_id", "user_handle", "user_loc", "user_desc", "tweet_text", 
# "tweet_id", "tweet_time"

# For this, we need some functionalities from the json package. Let's 
# load it into our current python session
import json

# Let's open the JSON file with the 25.000 Twitter objects. First, we
# read it into our python session as a simple text file.
json_data = open("DATA/2019-12-06_16-43-32.json").read()

# Then we use the json.loads() function to recognize the python-like
# json structures encoded into this string. In this case, it should
# return a list ([]) of strings ("")
json_data = json.loads(json_data)

In [229]:
# Each of these strings contains a raw Twitter object that is again
# encoded into python-like json structures. We keep these as strings
# because the complex nested structures of the raw Twitter object
# would slow python down. So, if we want to process these Tweets, let's
# do so one by one. Let's look at the first one (list index = 0):
tweet = json.loads(json_data[0])
tweet

{'created_at': 'Fri Dec 06 14:24:28 +0000 2019',
 'id': 1202956978286452738,
 'id_str': '1202956978286452738',
 'text': 'RT @jeremycorbyn: If some accuse me of talking to both sides in the Brexit debate then so be it. I’m proud of it.\n\nWhy would I only want to…',
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 2496109354,
  'id_str': '2496109354',
  'name': 'robert',
  'screen_name': 'rb218702',
  'location': 'Northampton, England',
  'url': None,
  'description': 'British liberal, artist, check out my instagram page rob_burch_arts.',
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 230,
  'friends_count': 614,
  'listed_count': 2,
  'favourites_count': 30169,
  'statuses_count': 40775,
  'cre

In [230]:
# Once we used the json package to convert this string into a 
# python-like data structure, we see that we are dealing with
# a complex and nested dictionary that we can subset with the
# tools we learned in the last classes.
# We are supposed to find the following information about this
# Twitter object: "user_id", "user_handle", "user_loc", 
# "user_desc", "tweet_text", "tweet_id", "tweet_time"

# Here are the solutions
print("The user_id is: " + tweet['user']['id_str']) # Or user str(tweet['id']) to convert an integer object into a string we can use with print()
print("The user_handle is: " + tweet['user']['screen_name'])
print("The user_loc(ation) is: " + tweet['user']['location']) # note that this is self-declared and not based on GPS
print("The user_desc(ription) is: " + tweet['user']['description']) 
print("The tweet_text is: " + tweet['text']) # see exercise 3 for further complications to this
print("The tweet_id is: " + tweet['id_str'])
print("The tweet_time is: " + tweet['created_at'])

The user_id is: 2496109354
The user_handle is: rb218702
The user_loc(ation) is: Northampton, England
The user_desc(ription) is: British liberal, artist, check out my instagram page rob_burch_arts.
The tweet_text is: RT @jeremycorbyn: If some accuse me of talking to both sides in the Brexit debate then so be it. I’m proud of it.

Why would I only want to…
The tweet_id is: 1202956978286452738
The tweet_time is: Fri Dec 06 14:24:28 +0000 2019


In [231]:
# (2) Check out the time package and try to convert Twitter's time 
# signature into the format "29/05/2019 07:04"

# Let's load the time package into our current python session
import time

# It has two functions that are relevant here:
# - strptime() takes a string with date-time information and 
# creates a standardized date-time object (another object type 
# next to lists, dictionaries, etc.)
# - strftime() takes a data-time object and creates a string
# version of the date-time according to your own specification

# Let's store the time-string from the Twitter dictionary into a
# separate object for processing.
tweet_time = tweet['created_at']
tweet_time

'Fri Dec 06 14:24:28 +0000 2019'

In [232]:
# The following creates a structured date-time object based on
# where we tell the function to look for each specific information
# in the string. We do so by comparing the string structure with
# the documentation of the strptime function in the time package:
# https://docs.python.org/3/library/time.html
tweet_time = time.strptime(tweet_time,'%a %b %d %H:%M:%S +0000 %Y')
tweet_time

time.struct_time(tm_year=2019, tm_mon=12, tm_mday=6, tm_hour=14, tm_min=24, tm_sec=28, tm_wday=4, tm_yday=340, tm_isdst=-1)

In [233]:
# Now we can turn this structured time object into a new string that
# lives up to whatever we need:
tweet_time_str = time.strftime("%Y-%m-%d %H:%M:%S", tweet_time)
tweet_time_str

'2019-12-06 14:24:28'

In [234]:
# You can use the structured time objects in python if you want to 
# do, e.g., time-series analysis or plot a timeline in python. If you
# move to another programm (R or STATA), you might want to turn this 
# into the respectively useful string format. Alternatively, you could 
# also use this to aggregate Tweets per day:
tweet_data_str = time.strftime("%Y-%m-%d", tweet_time)
tweet_data_str

'2019-12-06'

In [235]:
# (3) Go through multiple Twitter objects and try to understand the 
# inconsistencies in which Tweet texts are stored depending on Tweet type
# and text length.

# The tweet text is a bit of a complicated story. Usually, you 
# will be able to find the text in tweet['text']... (see exercise 3 
# for further complications)
print("The tweet_text is: " + tweet['text']) 

# But there are some exceptions to be wary about:
# - Longer texts: If the tweet text exceeds a certain length, 
# you will find the non-abbreviated text in tweet['extended_tweet']['full_text']

# - Retweets: You will find the full non-abbreviated original Tweet text
# in tweet['retweeted_status']['text'], unless the original Tweet
# was a longer text, in which case it is the same story as above 
# tweet['retweeted_status']['extended_tweet']['full_text']

# - Quotes: If we are dealing with a quoted text, the tweet['text']
# refers to the comment by the user, but you can get the original
# quoted text with tweet['quoted_status']['text'] or 
# tweet['quoted_status']['extended_tweet']['full_text'] depending on 
# the length of the quoted tweet. 

# You can check with which type of Twitter object you are dealing with
# by using some nested conditional questions:
tweet_type = "Tweet"
if 'quoted_status' in tweet:
    tweet_type = "Quote"
if 'retweeted_status' in tweet:
    tweet_type = "Retweet"
    if 'quoted_status' in tweet:
        tweet_type = "Re_Quote"
            
print("\nMost likely, we are dealing with a: " + tweet_type)
print("\nSo the full length original tweet is: \n " + tweet['retweeted_status']['extended_tweet']['full_text'])


The tweet_text is: RT @jeremycorbyn: If some accuse me of talking to both sides in the Brexit debate then so be it. I’m proud of it.

Why would I only want to…

Most likely, we are dealing with a: Retweet

So the full length original tweet is: 
 If some accuse me of talking to both sides in the Brexit debate then so be it. I’m proud of it.

Why would I only want to talk to half the country? I don’t want to live in half a country.

A prime minister must talk and listen to everyone - and bring our divided country together.


In [236]:
# (4) Create a new pandas dataframe to store the information extracted in
# the first task. Try to create a pandas dataframe without any content
# a.k.a. an empty dataframe. This will be your master dataframe to which
# you append information from Twitter objects row-by-row. (call it "df")

# Let's load the pandas package into our python session under the name pd
import pandas as pd

# Let's define the relevant columns for these exercises (later one, you
# might add columns depending on what you want to extract from the raw
# Twitter data)
selected_cols = ["user_id", "user_handle", "user_loc", "user_desc", 
                 "tweet_text", "tweet_id", "tweet_time"]

# Now, let's create a DataFrame that does not contain any data to which
# we will add a new row for each Twitter object we process. To create
# a dataframe with 0 rows, we can use an empty list ([]) as the first
# argument of the DataFrame() function
df = pd.DataFrame([], columns=selected_cols)
df

Unnamed: 0,user_id,user_handle,user_loc,user_desc,tweet_text,tweet_id,tweet_time


In [237]:
# (5) Write a for-loop that runs through the first 100 Twitter objects 
# from the JSON data, converts the string into a dictionary, and prints 
# the tweet_text for each. 
# Tipp: Check our if-else statements to ensure that you extract the text 
# reliably for each Tweet format.

# Let's loop through the first 100 elements in the json_data list, and
# do some stuff for each Twitter object
for ix in range(0, 100):
    # First, let's turn the string into a dictionary that we can query
    # for relevant information
    tweet = json.loads(json_data[ix])
    
    # Now let's print the tweet text, and make sure we get the 
    # extended version in case the text is too long...
    
    if 'extended_tweet' in tweet: # If you find the key 'extended_tweet' in the tweet dictionary, do the following
        print(tweet['extended_tweet']['full_text'])
    else: # If you don't find the key 'extended_tweet' in the tweet dictionary, do this instead
        print(tweet['text'])
    

RT @jeremycorbyn: If some accuse me of talking to both sides in the Brexit debate then so be it. I’m proud of it.

Why would I only want to…
RT @LaboursBlackPLP: This is massive, Former Tory Prime Minister will not be voting Tory and neither should you.

John Major breaks Tory ra…
RT @faisalislam: table in leaked Government presentation shows extraordinary new Irish Sea checks on the cards as a result of PMs Brexit de…
RT @Conservatives: "This is a Brexit election after all – and a vote for @BorisJohnson this time around is a vote to #GetBrexitDone"

🌳🗳 #V…
@KLbils @BiztheBuz @NickBoles @jeremycorbyn Please bear in mind that if brexit is the biggest issue for you, you are extremely privileged.
RT @DavidLammy: Evidence that @BorisJohnson is lying again and doing what he previously said he would never accept. Putting a border down t…
RT @LeaveEUOfficial: In a letter to the anti-Semite, Boris blasts Corbyn's "sly attempt to undermine the result of the 2016 referendum" by…
RT @jeremycorbyn

In [238]:
# (6) Extend this loop to create a new pandas dataframe with the same 
# columns as "df" and one row with np.nan for each column. (call it 
# "new_row")

import numpy as np

# First, let's define some empty data for the same columns as in the
# df DataFrame, which we can use to store the respective information
# for each Twitter object (see 03_Tasting.ipynb)
empty_data = {col: [np.nan] for col in selected_cols}

for ix in range(0, 100):
    # Get the dictionary of the Twitter object
    tweet = json.loads(json_data[ix])
    
    # Create an DataFrame with one empty row
    new_row = pd.DataFrame(empty_data)
    
new_row

Unnamed: 0,user_id,user_handle,user_loc,user_desc,tweet_text,tweet_id,tweet_time
0,,,,,,,


In [239]:
# (7) Extend this loop to fill in the cells for each new Tweet and append
# the result to the "df" dataframe.
empty_data = {col: [np.nan] for col in selected_cols}

for ix in range(0, 100):
    tweet = json.loads(json_data[ix])
    new_row = pd.DataFrame(empty_data)
    
    if 'extended_tweet' in tweet: 
        new_row.loc[0, "tweet_text"] = tweet['extended_tweet']['full_text']
    else: 
        new_row.loc[0, "tweet_text"] = tweet['text']
    
    new_row.loc[0, "user_id"] = tweet['user']['id_str']
    new_row.loc[0, "user_handle"] = tweet['user']['screen_name']
    new_row.loc[0, "user_loc"] = tweet['user']['location']
    new_row.loc[0, "user_desc"] = tweet['user']['description']
    new_row.loc[0, "tweet_id"] = tweet["id_str"]


    # For fun's sake, let's apply what we learned in exercise 2
    tweet_time = tweet['created_at']
    tweet_time = time.strptime(tweet_time,'%a %b %d %H:%M:%S +0000 %Y')
    new_row.loc[0, "tweet_time"] = time.strftime("%Y-%m-%d %H:%M:%S", tweet_time)
    
    # Now, in order to save each newly extracted row, append it to the master 
    # DataFrame created in exercise 4. Use the ignore_index option to ensure
    # a clean indexing of the master DataFrame df.
    df = df.append(new_row, ignore_index=True)
df

Unnamed: 0,user_id,user_handle,user_loc,user_desc,tweet_text,tweet_id,tweet_time
0,2496109354,rb218702,"Northampton, England","British liberal, artist, check out my instagra...",RT @jeremycorbyn: If some accuse me of talking...,1202956978286452738,2019-12-06 14:24:28
1,496433273,david707x,"Newport, Wales",http://Gov.UK/registertovote\n#RemainAlliance\...,"RT @LaboursBlackPLP: This is massive, Former T...",1202956979284647938,2019-12-06 14:24:28
2,3390733695,AndrewHemmingt2,,,RT @faisalislam: table in leaked Government pr...,1202956979838304260,2019-12-06 14:24:28
3,269708883,ferrier3,paisley,singer sometimes!,"RT @Conservatives: ""This is a Brexit election ...",1202956980949786627,2019-12-06 14:24:28
4,1038400204305850368,fran_oneill_s,"Leeds, England",Feminist. Humanist. Cyclist. Likes kindness & ...,@KLbils @BiztheBuz @NickBoles @jeremycorbyn Pl...,1202956981767720961,2019-12-06 14:24:28
...,...,...,...,...,...,...,...
95,1822726884,MatthewGreen02,,"Director, Green Planning Studio @greenplanning...",RT @mrjamesob: Two former PMs to join Final Sa...,1202957080090599426,2019-12-06 14:24:52
96,124313779,Untidy_mind,UK,,RT @Simon_Nixon: An astonishing moment. https:...,1202957080333828098,2019-12-06 14:24:52
97,431515838,mclaren_joanne,,,RT @AngusRobertson: Interesting to see that ne...,1202957080669425666,2019-12-06 14:24:52
98,104141401,angegarrod,,"#BlockTheCoup Arty farty. Artist, photographe...","RT @MatthewGreen02: If, 4 years ago, someone s...",1202957082254860292,2019-12-06 14:24:52


In [240]:
# (8) After this loop, save "df" on your disk in the feather format.
df.to_feather("DATA/processed_tweets.feather")

In [66]:
# (9) Try to process all 25.000 Twitter objects with this loop. 
# Tipp: If you run into troubles, manually check out the Twitter object
# that breaks the loop to ensure you're looking for the information at
# the right place in the dictionary.

# First let's load all the different packages that we need for this
# process
import pandas as pd
import numpy as np
import json
import time

# Then, recreate the master DataFrame that we want to store the
# processed data in:
selected_cols = ["user_id", "user_handle", "user_loc", "user_desc", 
                 "tweet_text", "tweet_id", "tweet_time"]
df = pd.DataFrame([], columns=selected_cols)
empty_data = {col: [np.nan] for col in selected_cols}

# Let's open the JSON batch of 25.000 tweets
json_data = open("DATA/2019-12-06_16-43-32.json").read()
json_data = json.loads(json_data)

# Now, loop through the list of json-formatted Twitter objects,
# extract the information we need, and add rows to the main 
# DataFrame for each Tweet.
for ix in range(0, len(json_data)):
    tweet = json.loads(json_data[ix])
    new_row = pd.DataFrame(empty_data)
    
    # EXTRACTION
    # Non problematic information
    new_row.loc[0, "user_id"] = tweet['user']['id_str']
    new_row.loc[0, "user_handle"] = tweet['user']['screen_name']
    new_row.loc[0, "user_loc"] = tweet['user']['location']
    new_row.loc[0, "user_desc"] = tweet['user']['description']
    new_row.loc[0, "tweet_id"] = tweet["id_str"]
    
    if 'extended_tweet' in tweet: 
        new_row.loc[0, "tweet_text"] = tweet['extended_tweet']['full_text']
    else: 
        new_row.loc[0, "tweet_text"] = tweet['text']
    
    tweet_time = tweet['created_at']
    tweet_time = time.strptime(tweet_time,'%a %b %d %H:%M:%S +0000 %Y')
    new_row.loc[0, "tweet_time"] = time.strftime("%Y-%m-%d %H:%M:%S", tweet_time)
    
    df = df.append(new_row, ignore_index=True)
    
    # In order to check how quickly or slowly you computer is handling this
    # let's just print something at every 100 Tweets processed:
    if ix%1000 == 0: # If the remainder of dividing ix by 100 is equal to 0, do the following
        print("Processed: " + str(ix) + " of " + str(len(json_data)))
        
df.to_feather("DATA/processed_tweets.feather")

Processed: 0 of 25000
Processed: 1000 of 25000
Processed: 2000 of 25000
Processed: 3000 of 25000
Processed: 4000 of 25000
Processed: 5000 of 25000
Processed: 6000 of 25000
Processed: 7000 of 25000
Processed: 8000 of 25000
Processed: 9000 of 25000
Processed: 10000 of 25000
Processed: 11000 of 25000
Processed: 12000 of 25000
Processed: 13000 of 25000
Processed: 14000 of 25000
Processed: 15000 of 25000
Processed: 16000 of 25000
Processed: 17000 of 25000
Processed: 18000 of 25000
Processed: 19000 of 25000
Processed: 20000 of 25000
Processed: 21000 of 25000
Processed: 22000 of 25000
Processed: 23000 of 25000
Processed: 24000 of 25000


In [68]:
# (10) Check out how to write functions in python, and write this process
# into a function that takes the string Twitter object, converts it into
# a dictionary, etc. and outputs the new_row pandas dataframe. You should
# be able to run the following for-loop executing everything from the 
# previous exercises:
# for tweet in range(0, len(json_data)):
#     new_row = process_raw(tweet)
#     df.append(new_row)

# (11) Try to find a way to time how long your computer takes to calculate
# each of these loops. Is the short version with the function quicker?
# Can you think of ways to speed this up? Why is it taking so long?

In [241]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Week 4: Natural Language Processing
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# For the purpose of the NLP sessions, we will be working with a 
# small set of Brexit Tweets from the users eucopresident, 
# BorisJohnson, and theresa_may. The data for this was extracted 
# using the process_tweet function you can find in the 
# 04_processing.py script I uploaded to Absalon. There you can
# also find the CSV and feather versions of this dataset, which
# was taken directly from DIPLOFACE's SQL server.

# Let's load the usual packages
import pandas as pd
import numpy as np

# pandas has an inbuild function to read feather files, but 
# depending on you package version, this sometimes gives you 
# some error messages. If that happens, a quick fix is to use
# the feather package directly
import feather
df = feather.read_dataframe("DATA/love-triangle.feather")
df

Unnamed: 0,ID,json_file,json_pos,processed_at,user_id,user_handle,user_loc,user_desc,tweet_text,tweet_id,...,qu_user_desc,qu_tweet_text,qu_tweet_id,qu_tweet_time,qu_tweet_geo,qu_tweet_country,qu_tweet_loc,qu_tweet_loc_type,qu_tweet_hashtags,qu_tweet_mentions
0,1259863,2019-05-06_12-11-56,8979,2019-09-19 11:47:00,3131144855,BorisJohnson,London,MP for Uxbridge and South Ruislip,,1125328351974105088,...,,,0,1900-01-01 00:00:00,False,,,,,
1,3794418,2019-09-05_21-10-31,1624,2019-09-19 14:07:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,Corbyn and his friends in Parliament don’t tru...,1169681044573962240,...,,,0,1900-01-01 00:00:00,False,,,,,
2,3895193,2019-06-19_18-33-12,8378,2019-09-19 14:13:00,3131144855,BorisJohnson,United Kingdom,MP for Uxbridge and South Ruislip #BackBoris,,1141362273941958666,...,,,0,1900-01-01 00:00:00,False,,,,,
3,5512364,2019-08-19_09-25-10,17164,2019-09-19 15:44:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,,1163346035718205440,...,,,0,1900-01-01 00:00:00,False,,,,,
4,5908998,2019-06-15_17-58-59,24268,2019-09-19 16:06:00,3131144855,BorisJohnson,United Kingdom,MP for Uxbridge and South Ruislip #BackBoris,Fantastic to address our party faithful at the...,1139923957296111617,...,,,0,1900-01-01 00:00:00,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,70388493,2019-08-27_19-22-41,9680,2019-09-23 00:43:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,Jeremy Corbyn wants to cancel the referendum a...,1166391520062300160,...,,,0,1900-01-01 00:00:00,False,,,,,
153,70751519,2019-08-29_13-36-31,7660,2019-09-23 01:13:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,,1167030319775735811,...,,,0,1900-01-01 00:00:00,False,,,,,
154,71742418,2019-09-09_14-02-14,6016,2019-09-23 02:27:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,Let’s come together and get Brexit done on Oct...,1171024527410814976,...,,,0,1900-01-01 00:00:00,False,,,,,
155,72721638,2019-06-28_18-02-17,17833,2019-09-23 03:40:00,3131144855,BorisJohnson,United Kingdom,MP for Uxbridge and South Ruislip #BackBoris,Thank you @JSHeappey for the invitation to spe...,1144626979045629952,...,,,0,1900-01-01 00:00:00,False,,,,,


In [242]:
# Note that missing values are now specified with None instead of
# np.nan – Both are fine to handle, but sometimes, you might prefer
# one over the other as the standard for missing values. You can
# easily change this with the following line
df.fillna(value=np.nan, inplace=True)
df

Unnamed: 0,ID,json_file,json_pos,processed_at,user_id,user_handle,user_loc,user_desc,tweet_text,tweet_id,...,qu_user_desc,qu_tweet_text,qu_tweet_id,qu_tweet_time,qu_tweet_geo,qu_tweet_country,qu_tweet_loc,qu_tweet_loc_type,qu_tweet_hashtags,qu_tweet_mentions
0,1259863,2019-05-06_12-11-56,8979,2019-09-19 11:47:00,3131144855,BorisJohnson,London,MP for Uxbridge and South Ruislip,,1125328351974105088,...,,,0,1900-01-01 00:00:00,False,,,,,
1,3794418,2019-09-05_21-10-31,1624,2019-09-19 14:07:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,Corbyn and his friends in Parliament don’t tru...,1169681044573962240,...,,,0,1900-01-01 00:00:00,False,,,,,
2,3895193,2019-06-19_18-33-12,8378,2019-09-19 14:13:00,3131144855,BorisJohnson,United Kingdom,MP for Uxbridge and South Ruislip #BackBoris,,1141362273941958666,...,,,0,1900-01-01 00:00:00,False,,,,,
3,5512364,2019-08-19_09-25-10,17164,2019-09-19 15:44:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,,1163346035718205440,...,,,0,1900-01-01 00:00:00,False,,,,,
4,5908998,2019-06-15_17-58-59,24268,2019-09-19 16:06:00,3131144855,BorisJohnson,United Kingdom,MP for Uxbridge and South Ruislip #BackBoris,Fantastic to address our party faithful at the...,1139923957296111617,...,,,0,1900-01-01 00:00:00,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,70388493,2019-08-27_19-22-41,9680,2019-09-23 00:43:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,Jeremy Corbyn wants to cancel the referendum a...,1166391520062300160,...,,,0,1900-01-01 00:00:00,False,,,,,
153,70751519,2019-08-29_13-36-31,7660,2019-09-23 01:13:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,,1167030319775735811,...,,,0,1900-01-01 00:00:00,False,,,,,
154,71742418,2019-09-09_14-02-14,6016,2019-09-23 02:27:00,3131144855,BorisJohnson,United Kingdom,Prime Minister of the United Kingdom and @Cons...,Let’s come together and get Brexit done on Oct...,1171024527410814976,...,,,0,1900-01-01 00:00:00,False,,,,,
155,72721638,2019-06-28_18-02-17,17833,2019-09-23 03:40:00,3131144855,BorisJohnson,United Kingdom,MP for Uxbridge and South Ruislip #BackBoris,Thank you @JSHeappey for the invitation to spe...,1144626979045629952,...,,,0,1900-01-01 00:00:00,False,,,,,


In [243]:
# (2) Document-Term Matrices
# Let's focus on the tweet_text variable for now, and filter
# out all rows without a text. Let's also keep the user handle
# so we can later on compare how these four politicians tweet
# about the topic.
df = df.loc[df['tweet_text'].notna(), ['user_handle', 'tweet_text']]

# Make sure to reset the index to avoid confusion down the line...
df = df.reset_index(drop=True)
df

Unnamed: 0,user_handle,tweet_text
0,BorisJohnson,Corbyn and his friends in Parliament don’t tru...
1,BorisJohnson,Fantastic to address our party faithful at the...
2,theresa_may,You want this stage of the Brexit process to b...
3,eucopresident,EU27 unanimously agrees on its response to UK’...
4,BorisJohnson,I’m deeply honoured to have secured more than ...
...,...,...
91,BorisJohnson,I’m standing to be Leader of the Conservative ...
92,BorisJohnson,Jeremy Corbyn wants to cancel the referendum a...
93,BorisJohnson,Let’s come together and get Brexit done on Oct...
94,BorisJohnson,Thank you @JSHeappey for the invitation to spe...


In [263]:
# If we want to turn the column 'tweet_text' into a
# document-term matrix, we can simply use the sklearn
# package that should come pre-installed with your Anaconda
# distribution. Either we use the Tfidf or Count Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Let's start with the simple CountVectorizer and create a 
# DTM using the inbuild tokenizer.

# Notice, there is something slightly odd about the name of 
# this imported thing. Rather than count_vectorizer, it's 
# spelled CountVectorizer. You can take this as a hint that 
# you did notimport a specific function, but something slightly
# different.
# What we imported is a more general object called "class", 
# which is a template for creating new objects that contain
# specific attributes and methods (see also the StreamListener 
# situation in the StreamingAPI script). With this template,
# we create a vectorizer object, on which we can now call
# certain methods.
vectorizer = CountVectorizer()
vectorizer.fit_transform

<bound method CountVectorizer.fit_transform of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)>

In [245]:
# fit_transform returns the DTM in a sparse matrix format
# from numpy that is extremely computationally efficient. 
sparse_dtm = vectorizer.fit_transform(df['tweet_text'])

# But for the sake of illustration, let's turn this into
# a nice pandas DataFrame, which works fine with such a
# small amount of documents and tokens (or features).
tokens = vectorizer.get_feature_names()
dtm = pd.DataFrame(data=sparse_dtm.toarray(), 
                   index=df.index,
                   columns=tokens)
dtm

Unnamed: 0,0kxjwwsprm,0w7ghgviel,10,100,150,16,200,2019,31st,3ke6f1fgx0,...,yet,ygrsfessfy,yorkshire,you,young,your,yykczinjbv,yzobcftvjd,zgb6dfhbhd,zvudfp7mon
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
92,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
93,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [246]:
# So, this DTM has 96 rows (documents, in this case Tweets),
# and 790 columns (features/tokens/variables). The CountVectorizer
# looks at all the unique tokens it can find across all the
# documents. It automatically uses a very simple tokenizer
# for this. Check out the documentation to see whether you
# can find out how its tokenizer splits texts into individual
# tokens. Let's look at the outcome, and see whether we can
# improve on this crude first take:
tokens

['0kxjwwsprm',
 '0w7ghgviel',
 '10',
 '100',
 '150',
 '16',
 '200',
 '2019',
 '31st',
 '3ke6f1fgx0',
 '3pypnuvpyp',
 '3vrdupnwhs',
 '42y3hi5z8p',
 '4jinkgtzyc',
 '4lj0whityp',
 '50',
 '596iosh01u',
 '7jydiszdjb',
 '8000',
 '8gkvhwud55',
 '8vbg3jz6dk',
 '8vcdlajean',
 '9sdjciimxl',
 '9vi8oqqjgj',
 'aada8qvd1x',
 'about',
 'accept',
 'across',
 'address',
 'after',
 'afternoon',
 'again',
 'agenda',
 'agree',
 'agreed',
 'agreement',
 'agrees',
 'ahead',
 'all',
 'also',
 'alternatives',
 'although',
 'altogether',
 'always',
 'am',
 'amazing',
 'amp',
 'an',
 'and',
 'andrejplenkovic',
 'another',
 'anti',
 'anyone',
 'appeal',
 'approach',
 'april',
 'are',
 'argue',
 'around',
 'art',
 'as',
 'asked',
 'asking',
 'aspects',
 'at',
 'avoid',
 'b02wiljds2',
 'b3luadnfjw',
 'back',
 'backboris',
 'backing',
 'backstop',
 'bad',
 'ballot',
 'basz4qx36s',
 'bbi0kc6cdg',
 'be',
 'become',
 'been',
 'before',
 'begin',
 'begins',
 'being',
 'belfast',
 'believe',
 'believing',
 'benches',
 '

In [1]:
# (3) Pre-Processing: Tokenizing, Removing-Stuff, Stemming
# As we can see, there is a bunch of weird stuff in there, and
# some tokens should be counted as one, which we can achieve
# by pre-processing techniques like stemming (getting rid of
# suffixes etc.). 

# There are many different packages to do this, and I hope
# that Jurafsky and Martin convinced you that there are 
# different computational approaches to pre-processing, most
# of which will give you different results. For this session,
# we will stick to a collection of tools provided by the 
# NLTK (Natural Language Tool Kit) package. This is kind of
# a hub of different techniques that comes in handy. Besides
# installing nltk via pip (google "install nltk package Windows/Mac"),
# you will als need to download individual packages. There
# are two ways to do so. Either you try to run the code, and
# let NLTK tell you which things you need to download to
# run a specific functionality (the error messages will
# provide precise instructions), or you just install all
# of their functionalities at once via the command line 
# interface. I prefer the latter, but be aware that this
# requires up to 4GB storage space on your computer. 
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [249]:
# Let's do all the pre-processing on a single tweet first,
# so we can have a look at the individual changes as they
# happen to the text.
tweet = df.loc[0, 'tweet_text']
tweet

'Corbyn and his friends in Parliament don’t trust you to make this decision - but I do. Let’s put it to the people: more delay with Corbyn’s #SurrenderBill, or Brexit delivered on October 31st ???? https://t.co/q8tIwDMkcH'

In [250]:
# [3a] Tokenizing
# The most simple way to tokenize a given text is to use the 
# python-internal string function split(), which we can
# call on a given string object. It simply splits the string
# into individual tokens at every whitespace it encounters.
tokens = tweet.split()
tokens

['Corbyn',
 'and',
 'his',
 'friends',
 'in',
 'Parliament',
 'don’t',
 'trust',
 'you',
 'to',
 'make',
 'this',
 'decision',
 '-',
 'but',
 'I',
 'do.',
 'Let’s',
 'put',
 'it',
 'to',
 'the',
 'people:',
 'more',
 'delay',
 'with',
 'Corbyn’s',
 '#SurrenderBill,',
 'or',
 'Brexit',
 'delivered',
 'on',
 'October',
 '31st',
 '????',
 'https://t.co/q8tIwDMkcH']

In [253]:
# There is a bunch of problems with this, which have
# to do with the punctuation that is directly linked
# to a word and not separated by whitespace. Plenty
# of people have worked to solve such issues, and the
# easy-to-use alternative that you see the most is
# the word_tokenize function from NLTK. Let's import it:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(tweet)
tokens

['Corbyn',
 'and',
 'his',
 'friends',
 'in',
 'Parliament',
 'don',
 '’',
 't',
 'trust',
 'you',
 'to',
 'make',
 'this',
 'decision',
 '-',
 'but',
 'I',
 'do',
 '.',
 'Let',
 '’',
 's',
 'put',
 'it',
 'to',
 'the',
 'people',
 ':',
 'more',
 'delay',
 'with',
 'Corbyn',
 '’',
 's',
 '#',
 'SurrenderBill',
 ',',
 'or',
 'Brexit',
 'delivered',
 'on',
 'October',
 '31st',
 '?',
 '?',
 '?',
 '?',
 'https',
 ':',
 '//t.co/q8tIwDMkcH']

In [264]:
# This looks slightly better in that it recognized more
# common English language style separation of two words
# like in "don't" = "do not" – However, it also has a 
# weird understanding of URLs and separated the hashtag
# from the word in #SurrenderBill. We might want to 
# keep this as the hashtag is part of the tokens underlying
# meaning in Twitter communication.

# In order to find out whether there is a tokenizer 
# more appropriate for our context, we can have a look
# at the documentation of the nltk tokenize section
# https://www.nltk.org/api/nltk.tokenize.html 

# And voilà, there is a tokenizer specifically 
# developped for parsing tweets. Again, we can import the
# general class, create an instance of this class, and
# then call certain methods from this instance.
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
tokens = tokenizer.tokenize(tweet)
tokens

['Corbyn',
 'and',
 'his',
 'friends',
 'in',
 'Parliament',
 'don',
 '’',
 't',
 'trust',
 'you',
 'to',
 'make',
 'this',
 'decision',
 '-',
 'but',
 'I',
 'do',
 '.',
 'Let',
 '’',
 's',
 'put',
 'it',
 'to',
 'the',
 'people',
 ':',
 'more',
 'delay',
 'with',
 'Corbyn',
 '’',
 's',
 '#SurrenderBill',
 ',',
 'or',
 'Brexit',
 'delivered',
 'on',
 'October',
 '31st',
 '?',
 '?',
 '?',
 'https://t.co/q8tIwDMkcH']

In [265]:
# [3b] Lowercasing
# Now that we have individual tokens, we can easilyapply more 
# pre-processing techniques to each token with list-comprehension.
# Turning every character to lowercasing is super easy in python,
# and uncontroversial for once.
tokens = [word.lower() for word in tokens]
tokens

['corbyn',
 'and',
 'his',
 'friends',
 'in',
 'parliament',
 'don',
 '’',
 't',
 'trust',
 'you',
 'to',
 'make',
 'this',
 'decision',
 '-',
 'but',
 'i',
 'do',
 '.',
 'let',
 '’',
 's',
 'put',
 'it',
 'to',
 'the',
 'people',
 ':',
 'more',
 'delay',
 'with',
 'corbyn',
 '’',
 's',
 '#surrenderbill',
 ',',
 'or',
 'brexit',
 'delivered',
 'on',
 'october',
 '31st',
 '?',
 '?',
 '?',
 'https://t.co/q8tiwdmkch']

In [276]:
# [3c] Punctuation Removal
# There are a bunch of approaches to this, but let's
# use the string package, which has a lot of other
# cool features
import string

# It contains a list of the most common punctuation 
# characters
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [278]:
# Problem is that this list contains the # symbol,
# which we do want to keep, so let's replace this
punct = punct.replace("#", "")
punct

'!"$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [286]:
# Now, we can use list comprehension to drop all
# the punctuation tokens in our list of tokens
tokens = [word for word in tokens if word not in punct]
tokens

['corbyn',
 'and',
 'his',
 'friends',
 'in',
 'parliament',
 'don',
 '’',
 't',
 'trust',
 'you',
 'to',
 'make',
 'this',
 'decision',
 'but',
 'i',
 'do',
 'let',
 '’',
 's',
 'put',
 'it',
 'to',
 'the',
 'people',
 'more',
 'delay',
 'with',
 'corbyn',
 '’',
 's',
 '#surrenderbill',
 'or',
 'brexit',
 'delivered',
 'on',
 'october',
 '31st',
 'https://t.co/q8tiwdmkch']

In [287]:
# As we see, this didn't remove the ’ – let's just add it to punct,
# and repeat the process
punct = punct + "’"
punct

'!"$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’'

In [288]:
# Now we are left with only alphanumerical characters
# and we managed to not throw away the hashtag sign.
# in the process.
tokens = [word for word in tokens if word not in punct]
tokens

['corbyn',
 'and',
 'his',
 'friends',
 'in',
 'parliament',
 'don',
 't',
 'trust',
 'you',
 'to',
 'make',
 'this',
 'decision',
 'but',
 'i',
 'do',
 'let',
 's',
 'put',
 'it',
 'to',
 'the',
 'people',
 'more',
 'delay',
 'with',
 'corbyn',
 's',
 '#surrenderbill',
 'or',
 'brexit',
 'delivered',
 'on',
 'october',
 '31st',
 'https://t.co/q8tiwdmkch']

In [289]:
##########################################################################################
# 3d Number Removal
# If
"1942".isdigit()
# any(char.isdigit() for char in inputString)

True

In [227]:
# 3e Stopword Removal
# We can use NLTK's standard stoplist for English
stop_words = stopwords.words('english')

stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

# 3f Stemming
# Let's use some different stemmers now. 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

from nltk import SnowballStemmer
stemmer = SnowballStemmer("english")

words = [porter.stem(w) for w in words]

In [None]:
# Let's write a loop that applies all of this to 
# each tweet in the dataframe and creates a new 'processed'
# variable. that we can turn into a DTM now.

In [208]:
# (4) Pairwise cosine Similarity scores:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(sparse_dtm[0,], sparse_dtm[1,])

In [None]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Exercise 04
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Unit of analysis challenge: Single tweets lead to zero-inflated
# DTMs, use the join() function to collapse all tweets from each user
# into a single string object. You should end with a list of three
# large strings.

# (2) Process them above using the word_tokenizer, lowercasing,
# punctuation removal, number removal, stopword removal, and
# stemming with the PorterStemmer.

# (3) Turn this into a DTM for these three documents (3 rows)

# (4) Our prior believe is that BorisJohnson is more similar to 
# theresa_may than to eucopresident in terms of their Brexit tweets.
# Do the pairwise cosine similarity scores confirm that prior believe?

# (5) Now repeat exercises 2 to 5, but use the Twitter tokenizer 
# instead. Is there a significant difference in the outcome? Which
# tokenizer should I use if I want to analyze these tweets?

# (6) What happens to these cosine similarity scores if I use the 
# Tfidf Vectorizer instead of the CountVectorizer from sklearn?
