In [7]:
# Standard modules
import numpy as np
import pandas as pd
import string
import re

# Twitter tagger APIs
from modules.CMUTweetTagger import runtagger_parse

In [2]:
# Configuration constants

# Shell command that launches the ARK Tweet NLP POS tagger (a Java application).
# Despite the name, this is a full command line rather than a bare file path.
ARK_TWEET_NLP_PATH = (
    'java -XX:ParallelGCThreads=2 -Xmx500m '
    '-jar resources/ark-tweet-nlp-0.3.2/ark-tweet-nlp-0.3.2.jar'
)

# Destination .csv file for the tagged words dataset
WORDS_PATH = 'data/database/words.csv'

In [3]:
# Read the tweets dataset from disk.
# `id_str` is forced to `str` so the tweet ids are kept as text instead of
# being parsed as numbers.
tweets = pd.read_csv(
    'data/database/tweets.csv',
    dtype={'id_str': str},
)

# Preview the first rows of the dataset
tweets.head()

Unnamed: 0,created_at,id_str,text,truncated,geo,coordinates,place,retweet_count,favourite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,lang
0,Thu Dec 19 23:35:02 +0000 2019,1207806578650468352,These events around the world prompted importa...,True,,,,53,,,,en
1,Thu Dec 19 21:48:10 +0000 2019,1207779681627447296,RT @wef: Women's pay equality has slipped back...,False,,,,0,,,,en
2,Thu Dec 19 21:42:14 +0000 2019,1207778191173201920,Happy birthday @Alyssa_Milano!🎈 Thank you for ...,False,,,,191,,,,en
3,Thu Dec 19 20:35:04 +0000 2019,1207761285045260294,“This is my charge to everyone:\n\nWe have to ...,True,,,,88,,,,en
4,Thu Dec 19 18:41:46 +0000 2019,1207732772451889152,8 Posters At The CAA Protests In Delhi That Sc...,False,,,,7,,,,en


In [None]:
# Define function to expand contractions 

def expand_contractions(text, contractions=None):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contractions : dict, optional
        Mapping from contraction to its expanded form. When omitted, the
        project mapping ``modules.contractions.contractions_dict`` is used.

    Returns
    -------
    str
        ``text`` with every known contraction (matched case-insensitively)
        replaced by its expansion and every remaining ``'`` removed.
        The expansion is inserted exactly as stored in the mapping; the
        casing of the matched text is not carried over.
    """
    if contractions is None:
        # Imported lazily: the project mapping is only needed as the default
        from modules.contractions import contractions_dict as contractions

    if contractions:
        # Fallback table for matches whose casing differs from the stored key
        lowered = {key.lower(): value for key, value in contractions.items()}

        # Longest keys first, so that e.g. "can't've" is tried before "can't"
        # (regex alternation takes the first alternative that matches).
        # Keys are escaped because they are literal text, not regex fragments.
        alternatives = sorted(contractions, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = contractions.get(match)
            if not expanded_contraction:
                # Never return None here: re.sub would treat it as an empty
                # replacement and silently delete the matched word.
                expanded_contraction = lowered.get(match.lower(), match)
            return expanded_contraction

        text = contractions_pattern.sub(expand_match, text)

    # Drop whatever apostrophes are left (possessives, quotes, unknown forms)
    return re.sub("'", "", text)

In [4]:
# Define wrapper function for POS tagging words in tweets
def pos_tag(tweets):
    """Run the tweet POS tagger over ``tweets``.

    Delegates to ``runtagger_parse``, launching the tagger with the command
    stored in ``ARK_TWEET_NLP_PATH``.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts to tag (presumably one string per tweet; confirm against
        ``runtagger_parse``).

    Returns
    -------
    The value returned by ``runtagger_parse``, unchanged. Callers in this
    notebook treat it as one sequence of tags per tweet, each tag indexable
    as (text, POS tag, confidence).
    """
    tagged = runtagger_parse(tweets, run_tagger_cmd=ARK_TWEET_NLP_PATH)
    return tagged

In [6]:
# Expand contractions in every tweet, then POS-tag the expanded texts.
# NOTE: `Series.apply` has no `axis` parameter; extra keyword arguments are
# forwarded to the applied function, so passing `axis=1` here would raise a
# TypeError inside `expand_contractions`.
tweets.loc[:, 'text'] = tweets['text'].apply(expand_contractions)
tagged_tweets = pos_tag(tweets.loc[:, 'text'].values)

# Preview the tags of the first 5 tweets
tagged_tweets[:5]

[[]]

In [6]:
# Flatten the tagged tweets into one dictionary per tagged word

# Tweet ids in positional order. `enumerate` below yields positions, so the
# ids are looked up by position; a label lookup (`tweets.loc[i, ...]`) only
# agrees with position while `tweets` keeps its default 0..n-1 index.
tweet_ids = tweets['id_str'].tolist()

# One record per (tweet, word) pair
tagged_words = [
    {
        'tweet': tweet_ids[i],  # Id of tweet containing word
        'index': j,             # Word index in sentence
        'text': tag[0],         # Actual word text
        'pos': tag[1],          # Part Of Speech tag
        'conf': tag[2],         # Confidence for POS tag
    }
    for i, tagged_tweet in enumerate(tagged_tweets)
    for j, tag in enumerate(tagged_tweet)
]

# Show first 5 tagged word records
tagged_words[:5]

[{'tweet': '1207806578650468352',
  'index': 0,
  'text': 'These',
  'pos': 'D',
  'conf': 0.9796},
 {'tweet': '1207806578650468352',
  'index': 1,
  'text': 'events',
  'pos': 'N',
  'conf': 0.9975},
 {'tweet': '1207806578650468352',
  'index': 2,
  'text': 'around',
  'pos': 'P',
  'conf': 0.9893},
 {'tweet': '1207806578650468352',
  'index': 3,
  'text': 'the',
  'pos': 'D',
  'conf': 0.9995},
 {'tweet': '1207806578650468352',
  'index': 4,
  'text': 'world',
  'pos': 'N',
  'conf': 0.9613}]

In [7]:
# Turn tagged words list into a Pandas DataFrame object.
# Columns are named explicitly so that an empty `tagged_words` list still
# produces a frame with the expected columns (a column-less frame would make
# any later sort on 'tweet'/'index' fail with a KeyError).
words = pd.DataFrame(tagged_words, columns=['tweet', 'index', 'text', 'pos', 'conf'])
words.head()

Unnamed: 0,tweet,index,text,pos,conf
0,1207806578650468352,0,These,D,0.9796
1,1207806578650468352,1,events,N,0.9975
2,1207806578650468352,2,around,P,0.9893
3,1207806578650468352,3,the,D,0.9995
4,1207806578650468352,4,world,N,0.9613


In [8]:
# Order rows by tweet id, then by word position inside the tweet.
# Tweet ids are strings, so they are compared as text.
words = words.sort_values(by=['tweet', 'index'])
words.head(50)

Unnamed: 0,tweet,index,text,pos,conf
11751,1204199429177315329,0,RT,~,0.9979
11752,1204199429177315329,1,@unwomenjordan,@,0.9991
11753,1204199429177315329,2,:,~,0.963
11754,1204199429177315329,3,From,P,0.9991
11755,1204199429177315329,4,the,D,0.9975
11756,1204199429177315329,5,right,A,0.4388
11757,1204199429177315329,6,to,P,0.9871
11758,1204199429177315329,7,equal,A,0.7391
11759,1204199429177315329,8,pay,N,0.8749
11760,1204199429177315329,9,for,P,0.9951


In [9]:
# Persist the tagged words table as .csv, leaving out the DataFrame index
words.to_csv(path_or_buf=WORDS_PATH, index=False)