In [1]:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords as sw
from nltk.tokenize import TweetTokenizer
import nltk
import re
import string
# Fetch the NLTK resources used by this notebook, in the same order as before.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Columns kept from the development set when it is loaded.
FEATURE_LIST = [
    'created_at',
    'id',
    'full_text',
    'user',
    'retweet_count',
    'favorite_count',
    'coordinates',
    'place',
    'class',
]

# NLTK's English stop words followed by extra fragments
# (presumably contraction/stemming leftovers such as "n't" or 'wa' -- TODO confirm).
stopwords = [
    *sw.words('english'),
    "'d", "'ll", "'re", "'s", "'ve", 'doe', 'ha', "n't", 'sha', 'wa', 'wo',
]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:

# Read the development file (one JSON record per line) and keep only the
# columns named in FEATURE_LIST.
training_set = pd.read_json('development.jsonl', lines=True)[FEATURE_LIST]

# Read the evaluation file (one JSON record per line) with every column it contains.
test_set = pd.read_json('evaluation.jsonl', lines=True)

In [8]:
def transform(text):
    """Normalise a raw tweet into a cleaned string ready for tokenisation.

    The function expands common English contractions, rewrites a few
    abbreviations, canonicalises a handful of hashtags, removes thousands
    separators inside numbers and finally strips every punctuation character
    except '@' and '#' (so mentions and hashtags survive).

    Parameters
    ----------
    text : object
        The tweet text. Anything that is not a non-empty ``str`` (``None``,
        ``NaN``, numbers, lists, ...) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is missing or empty.
        Case is only changed inside the spans that are rewritten.
    """
    # A single isinstance check covers None/NaN as well as any other non-string
    # value, and avoids calling pd.isnull on containers (ambiguous truth value).
    if not isinstance(text, str) or text == '':
        return ''

    # --- Contractions and abbreviations -----------------------------------
    # "Sam's" may mean "Sam is" or a possessive; the two cannot be told apart,
    # so the "'s" suffix is simply dropped.
    text = re.sub(r"'s", " ", text)
    text = re.sub(r" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub(r"'ve", " have ", text)
    text = re.sub(r"can't", "can not", text)  # must run before the generic n't rule
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub(r"'re", " are ", text)
    text = re.sub(r"'d", " would ", text)
    text = re.sub(r"'ll", " will ", text)
    text = re.sub(r"e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub(r"b\.g\.", " bg ", text, flags=re.IGNORECASE)
    # "15k" / "15K" -> "15000". The character class [kK] is required here: a
    # group "(kK)" would only match the literal two-character string "kK".
    # The word boundary keeps units such as "15kg" untouched.
    text = re.sub(r"(\d+)[kK]\b", r" \g<1>000 ", text)
    text = re.sub(r"e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub(r"(the\s+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub(r"(the\s+)?United States?", " America ", text, flags=re.IGNORECASE)
    text = re.sub(r"\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"[c-fC-F]:/", " disk ", text)

    # --- Hashtag canonicalisation -----------------------------------------
    # Every pattern is bounded with \w* (word characters only) so that a match
    # never extends past the hashtag itself into the surrounding tweet text.
    text = re.sub(r"#{2,3}", "#", text)
    text = re.sub(r"#2020prot\w*", "#2020protest", text, flags=re.IGNORECASE)
    text = re.sub(r"#abolish\w*police", "#abolishpolice", text, flags=re.IGNORECASE)
    # The tag is reduced to a bare '#'.
    text = re.sub(r"#acab\w*", "#", text, flags=re.IGNORECASE)
    text = re.sub(r"#ahmauda\w*y", "#ahmaudaubrey", text, flags=re.IGNORECASE)
    # The optional leading '#' means un-tagged spellings are normalised too.
    text = re.sub(r"#?black_*li[fv]es*_*m\w*", "#blacklivesmatter", text, flags=re.IGNORECASE)
    text = re.sub(r"#?al{1,4}_*i[fv]es*_*m\w*", "#alllivesmatter", text, flags=re.IGNORECASE)

    # Remove the comma between digits, i.e. 15,000 -> 15000.
    text = re.sub(r"(?<=[0-9]),(?=[0-9])", "", text)

    # Strip all punctuation except '@' and '#'.
    punct = punctuation.replace('@', '').replace('#', '')
    text = text.translate(str.maketrans('', '', punct))

    # The result is a single cleaned string (tokenisation happens elsewhere).
    return text

class CustomTokenizer(object):
    """Clean a tweet with ``transform`` and split it into filtered tokens.

    Parameters
    ----------
    tokenizer : object, optional
        Any object exposing ``tokenize(str) -> list[str]``. When omitted, a
        ``TweetTokenizer(preserve_case=False)`` is created for this instance.
    """

    # Compiled once for the class instead of on every preprocess() call.
    # NOTE(review): the last range (U+24C2-U+1F251) is much wider than emoji
    # alone -- it also spans e.g. the CJK ideograph blocks -- so tokens starting
    # with such characters are kept by the emoji branch as well. Left unchanged
    # to preserve the existing filtering behaviour.
    _RE_EMOJI = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

    def __init__(self, tokenizer=None):
        # Build the default lazily: a default argument would be evaluated once
        # at definition time and shared by every instance.
        if tokenizer is None:
            tokenizer = TweetTokenizer(preserve_case=False)
        self.tokenizer = tokenizer

    def preprocess(self, tweet):
        """Return the list of tokens kept from ``tweet``.

        The tweet is first cleaned by ``transform`` and then split by
        ``self.tokenizer``. A token is kept when it starts with a character
        from the emoji character class, or when it is longer than one
        character and is not a substring of ``string.punctuation``.
        """
        cleaned = transform(tweet)
        return [
            token
            for token in self.tokenizer.tokenize(cleaned)
            if self._RE_EMOJI.match(token)
            or (token not in punctuation and len(token) > 1)
        ]

In [10]:
# Tokenise every training tweet with the custom tokenizer.
tweets = list(training_set["full_text"])
tokenizer = CustomTokenizer()

# One token list per tweet, in the same order as `tweets`.
X_train = [tokenizer.preprocess(tweet) for tweet in tweets]

# Show the tokens of the first tweet as a quick sanity check.
X_train[0]

['@bawdzisnaughty',
 '@sgfgjay',
 '@radioshadilay',
 'you',
 'are',
 'little',
 'behind',
 'the',
 'curve',
 'wednesday',
 'wednesday']