In [4]:
import pandas as pd
import numpy as np
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords

In [5]:
# Load the vocabulary table and build a lookup from each entry of the first
# column to the matching value in the column labelled '0'.
# NOTE(review): presumably first column = word, column '0' = integer id;
# confirm against the CSV.
df = pd.read_csv('./data/vocab_words.csv')
dct = dict(zip(df.iloc[:, 0], df['0']))

In [6]:
# Regexes for content that is stripped from a tweet before tokenizing.
website_pattern = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
# Raw string: `\w` must reach the regex engine untouched; the non-raw form
# relies on an invalid escape sequence, which newer Pythons warn about.
username_pattern = r'@(\w{1,15})'

# Fetch the NLTK data packages (reported as up-to-date when already present).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
# Reach the corpus through `nltk.corpus` instead of the imported `stopwords`
# name: this assignment rebinds `stopwords` to a plain list, so re-running the
# cell would otherwise fail with AttributeError on `.words`.
stopwords = nltk.corpus.stopwords.words('english')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/setone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/setone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/setone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/setone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [63]:
def tokenize_tweet(text):
    '''
    Preprocess a raw tweet into the integer ids fed to the model.

    Steps, in order: lowercase, strip URLs and @usernames, drop stop words,
    strip punctuation, lemmatize each whitespace-separated token, then map
    the surviving tokens to ids through the module-level `dct` vocabulary.

    Words that are missing from `dct` are skipped rather than raising
    KeyError, so tweets containing unseen vocabulary can still be encoded.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list
        A one-element list wrapping the list of ids, i.e. ``[[id, ...]]``.
        The inner list is empty when no token is found in the vocabulary.
    '''

    # make text lowercase
    text = text.lower()

    # remove websites and usernames, if present
    text = re.sub(website_pattern, '', text)
    text = re.sub(username_pattern, '', text)

    # remove stop words (text is already lowercase at this point)
    # NOTE(review): punctuation is stripped only afterwards, so a stop word
    # with attached punctuation (e.g. "you?") passes this filter. Order kept
    # as-is; confirm it matches the preprocessing used to build the vocabulary.
    text = ' '.join(word for word in text.split() if word not in stopwords)

    # remove punctuation
    text = ''.join(ch for ch in text if ch not in punctuation)

    # lemmatize words to their base form
    words = [lemmatizer.lemmatize(tok) for tok in w_tokenizer.tokenize(text)]

    # map words to vocabulary ids, skipping out-of-vocabulary words
    return [[dct[word] for word in words if word in dct]]

In [64]:
# Smoke test: run the preprocessing pipeline on a sample sentence and display
# the result (a one-element list holding the list of vocabulary ids).

tokenize_tweet('hello how are you this is cool moon')

[[2908, 1265, 1972]]

In [72]:
def pad_features(tweets_int, seq_length=26):
    ''' Return features of tweets_int, where each tweet is
    left-padded with 0's or truncated to the input seq_length.

    Parameters
    ----------
    tweets_int : sequence of sequences of int
        One id sequence per tweet. Each may be a list, tuple or 1-D array.
    seq_length : int, default 26
        Number of columns in the output.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (len(tweets_int), seq_length). Tweets shorter
        than seq_length are right-aligned behind leading zeros; longer tweets
        keep only their first seq_length ids.
    '''
    features = np.zeros((len(tweets_int), seq_length), dtype=int)

    for i, twt in enumerate(tweets_int):
        # Truncate first; slicing works for lists, tuples and arrays alike.
        kept = np.asarray(twt[:seq_length], dtype=int)

        # Guard the empty case: a `-0:` style slice would address the whole
        # row instead of nothing.
        if kept.size:
            features[i, seq_length - kept.size:] = kept

    return features

In [74]:
# Smoke test: tokenize a sample sentence and pad it to the default seq_length
# of 26; the ids should appear at the end of the row behind leading zeros.
pad_features(tokenize_tweet('hello how are you this is cool moon'))

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 2908, 1265, 1972]])