In [58]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import re
from unicodedata import normalize
import string
import pickle as pkl
import os
import sys
# Disable all warning include tensorflow gpu debug
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from sklearn.feature_extraction.text import CountVectorizer

import nltk
words = set(nltk.corpus.words.words())
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import math

In [67]:
def cleaning(tweet_text, df):
    temp = []
    table = str.maketrans("", "", string.punctuation)
    for tweet in tweet_text:
        # Remove links
        tweet = re.sub(r"http\S+", "", tweet)
        # Remove newline
        tweet = tweet.strip('\n')
        # Remove unicode
        tweet = normalize('NFKD', tweet).encode('ascii','ignore')
        # Remove username
        tweet = re.sub('@[^\s]+','',str(tweet))
        # Remove punctuation and change to lower case
        tweet = tweet.translate(table).lower()
        # Remove 'b' at the begining for binary
        tweet = tweet.replace('b', '', 1)
#         # lemmatize
#         tweet = " ".join(lemmatizer.lemmatize(w) for w in word_tokenize(tweet))
        # Remove non english
        tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) if w.lower()
            in words or not w.isalpha())
        temp.append(tweet)
    try:
        # Concatenate training with target
        processed_tweets = pd.concat([pd.DataFrame(temp), df['target']], axis=1)
        processed_tweets = pd.DataFrame(processed_tweets)
    except KeyError:
        processed_tweets = pd.DataFrame(temp)
#     print(processed_tweets)
    return processed_tweets

In [68]:
path = Path('.').parent.absolute()
print('=== Reading data from {} ==='.format(path))
full_train = os.path.join(path, 'raw-dataset', 'train.csv')
train_df = pd.read_csv(full_train, encoding='utf-8')

full_test = os.path.join(path, 'raw-dataset', 'test.csv')
test_df = pd.read_csv(full_test, encoding='utf-8')

print('=== Cleaning texts ===')
# Preprocess training and testing tweets
processed_tr_tweets = cleaning(train_df['text'], train_df)
processed_tst_tweets = cleaning(test_df['text'], test_df)

print(processed_tr_tweets.head())

=== Reading data from C:\Users\myins\Desktop\Programming\Programming-Challenge\kaggle-submission\disaster-tweets ===
=== Cleaning texts ===
                                                   0  target
0  our are the reason of this earthquake may forg...       1
1                         forest fire near la canada       1
2  all to shelter in place are being notified by ...       1
3                 13000 people receive evacuation in       1
4  just got sent this photo from ruby as smoke fr...       1


In [87]:
def vectorize_tweets(tokenizer, data):
    tokenizer.fit_on_texts(data)
    words = {word: index for word,
             index in sorted(tokenizer.word_index.items(),
                             key=lambda item: item[1], reverse=True)}
    print(len(words))
    print(words)
    vect_tweets = tokenizer.texts_to_sequences(data)
    vect_tweets = pad_sequences(vect_tweets)
    return vect_tweets, tokenizer, words

In [88]:
print('=== Tokenizing texts ===')
# Convert a collection of text documents to a matrix of token counts
tokenizer = Tokenizer()
# Combine both train and test
# Prevent unequal length of variables after tokenization
combined_tr_tst = pd.concat([processed_tr_tweets[0],
                                processed_tst_tweets[0]], axis=0
                            )
combined_vect, _, words = vectorize_tweets(tokenizer, combined_tr_tst)

=== Tokenizing texts ===
8790


In [89]:
len(words)

8790