In [1]:
# Dependencies
import numpy as np
import pandas as pd
import string
import re
import stanfordnlp as nlp

In [2]:
# Download english language neural network
nlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
n


In [3]:
# Setting up the default pipeline
pl = nlp.Pipeline()

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/laura/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/laura/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/laura/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/laura/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/laura/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/laura/stanfordnlp_resources/en_ewt_models/en_ewt.pr

In [4]:
# Load tweets dataset
tweets = pd.read_csv('data/database/tweets.csv', dtype={
    'id_str': str
})

tweets.head()

Unnamed: 0,created_at,id_str,text,truncated,geo,coordinates,place,retweet_count,favourite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,lang
0,Thu Dec 19 23:35:02 +0000 2019,1207806578650468352,These events around the world prompted importa...,True,,,,53,,,,en
1,Thu Dec 19 21:48:10 +0000 2019,1207779681627447296,RT @wef: Women's pay equality has slipped back...,False,,,,0,,,,en
2,Thu Dec 19 21:42:14 +0000 2019,1207778191173201920,Happy birthday @Alyssa_Milano!🎈 Thank you for ...,False,,,,191,,,,en
3,Thu Dec 19 20:35:04 +0000 2019,1207761285045260294,“This is my charge to everyone:\n\nWe have to ...,True,,,,88,,,,en
4,Thu Dec 19 18:41:46 +0000 2019,1207732772451889152,8 Posters At The CAA Protests In Delhi That Sc...,False,,,,7,,,,en


In [5]:
# Define function for cleaning tweets text

# Define punctuation
p = r'{:s}'.format(string.punctuation)

# Actual function definition
def clean_tweet(text):
    # Clean the tweet: remove everything except strings and punctuation
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # Remove links
    text = re.sub(r'[^\w\s{:s}]+'.format(p), '', text)  # Remove special characters
    text = re.sub(r'[@#]', ' ', text)
    text = re.sub(r'[\n]+', '', text)  # Remove newlines
    text = re.sub(r'[ ]+', ' ', text)  # Remove double blank spaces
    text = re.sub(r'^[ ]+|[ ]+$', '', text)  # Remove trailing blank spaces
    return(text)

In [7]:
# Define a function for processing text through Stanford NLP
def nlp_process(text, pipeline):
    # Process text through nlp pipeline and retrieve results
    res = pipeline(text)
    # Define a list of words
    words = list()
    # Loop through each retrieved sentence
    for i, sentence in enumerate(res.sentences):
        # Loop through every dependency in the sentence
        for j, word in enumerate(sentence.words):
            # Define a new dictionary for the current word
            word = {'sentence': i,
                    'index': word.index,
                    'text': word.text,
                    'upos': word.upos,
                    'xpos': word.xpos,
                    'governor': word.governor,
                    'dependency': word.dependency_relation}
            # Add word to sentences list
            words.append(word)
    # Return list of words
    return words

In [8]:
# Define a list of words
words = list()
# Loop through every tweet and extract words information throug Stanford NLP
for i, tweet in tweets.iterrows():
    # Get main tweet attributes 
    tweet_text = tweet['text']
    tweet_id = tweet['id_str']
    # Clean tweet text
    tweet_text = clean_tweet(tweet_text)
    # Parse cleaned tweet text: extract features for words
    tweet_words = nlp_process(tweet_text, pipeline=pl)
    # Edit words in current tweet: add tweet id
    for i, word in enumerate(tweet_words):
        # Add tweet id
        tweet_words[i].setdefault('tweet', tweet_id)
    # Add retrieved words to main list
    words += tweet_words

# Show 
words























[{'sentence': 0,
  'index': '1',
  'text': 'These',
  'upos': 'DET',
  'xpos': 'DT',
  'governor': 2,
  'dependency': 'det',
  'tweet': '1207806578650468352'},
 {'sentence': 0,
  'index': '2',
  'text': 'events',
  'upos': 'NOUN',
  'xpos': 'NNS',
  'governor': 6,
  'dependency': 'nsubj',
  'tweet': '1207806578650468352'},
 {'sentence': 0,
  'index': '3',
  'text': 'around',
  'upos': 'ADP',
  'xpos': 'IN',
  'governor': 5,
  'dependency': 'case',
  'tweet': '1207806578650468352'},
 {'sentence': 0,
  'index': '4',
  'text': 'the',
  'upos': 'DET',
  'xpos': 'DT',
  'governor': 5,
  'dependency': 'det',
  'tweet': '1207806578650468352'},
 {'sentence': 0,
  'index': '5',
  'text': 'world',
  'upos': 'NOUN',
  'xpos': 'NN',
  'governor': 2,
  'dependency': 'nmod',
  'tweet': '1207806578650468352'},
 {'sentence': 0,
  'index': '6',
  'text': 'prompted',
  'upos': 'VERB',
  'xpos': 'VBD',
  'governor': 0,
  'dependency': 'root',
  'tweet': '1207806578650468352'},
 {'sentence': 0,
  'index':

In [9]:
# Turn words into Pandas DataFrame objec
words = pd.DataFrame(words)
words.head()

Unnamed: 0,dependency,governor,index,sentence,text,tweet,upos,xpos
0,det,2,1,0,These,1207806578650468352,DET,DT
1,nsubj,6,2,0,events,1207806578650468352,NOUN,NNS
2,case,5,3,0,around,1207806578650468352,ADP,IN
3,det,5,4,0,the,1207806578650468352,DET,DT
4,nmod,2,5,0,world,1207806578650468352,NOUN,NN


In [10]:
# Save Pandas DataFrame object to disk as .csv file
words.to_csv('data/database/words.csv', index=False)