In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re

import emoji

In [2]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [3]:
train = pd.read_csv('nlp-getting-started/train.csv')
test = pd.read_csv('nlp-getting-started/test.csv')

In [4]:
config = {
    'TextPreprocessor': {
        'mode_remove_stops': True,
        'mode_drop_long_words': True,
        'max_size_vocab': 50000,
        'max_doc_freq': 0.9,
        'min_count': 5,
        'pad_word': '<PAD>', 
        'text_column': 'text'
    },
}

In [5]:
regular_expression_map = {
    'url': 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    'mention': r'(?<=@)\w+',
    'hashtag': '(?<=#)\w+'
}

In [6]:
class TextPreprocessor(object):
    def __init__(self, config):
        """Preparing text features."""

        self._mode_remove_stops = config.get('mode_remove_stops', True)
        self._pad_word = config.get('pad_word', '<PAD>')
        self._text_column = config.get('text_column')

    def _clean_text(self, input_text):
        """Delete special symbols."""

        input_text = input_text.str.lower()
        input_text = input_text.str.replace(r'[^a-z ]+', ' ') 
        input_text = input_text.str.replace(r' +', ' ')
        input_text = input_text.str.replace(r'^ ', '')
        input_text = input_text.str.replace(r' $', '')

        return input_text
    
    def _remove_stop_words(self, input_sentence):
        stop_words = set(stopwords.words('english')) 
        word_tokens = word_tokenize(input_sentence) 
        
        return ' '.join(list(filter(lambda word: word not in stop_words, word_tokens)))

    def _find_url(self, input_sequence, regular_expression): 
        text = re.findall(regular_expression,
                          input_sequence)
        
        url =  "".join(text)
        return 0 if url == '' else 1
    
    def _remove_url(self, input_sequence, regular_expression):

        return re.sub(regular_expression,
                      '',
                      input_sequence)

    def transform(self, df):
        
        # check if the text has url, mention or hashtag
        df['has_url'] = df[self._text_column].apply(
            lambda x: self._find_url(x, regular_expression_map['url'])
        )
        df['has_mention'] = df[self._text_column].apply(
            lambda x: self._find_url(x, regular_expression_map['mention'])
        )
        df['has_hashtag'] = df[self._text_column].apply(
            lambda x: self._find_url(x, regular_expression_map['hashtag'])
        )
        
        # get some counts
        df['text_len'] = df['text'].astype(str).apply(len)
        df['text_counter'] = df['text'].apply(lambda x: len(str(x).split()))
        
        # df[self._text_column] = self._clean_text(df[self._text_column])
        
        # clean text
        df[self._text_column] = df[self._text_column].apply(
            lambda x: self._remove_url(x, regular_expression_map['url'])
        )
        df[self._text_column] = df[self._text_column].apply(
            lambda x: self._remove_url(x, regular_expression_map['mention'])
        )
        df[self._text_column] = df[self._text_column].apply(
            lambda x: self._remove_url(x, regular_expression_map['hashtag'])
        )
        
        
        # remove stop words
        if self._mode_remove_stops:
            df[self._text_column] = df[self._text_column].apply(self._remove_stop_words, 1)
        
        
        return df
  

In [7]:
df_train = TextPreprocessor(config['TextPreprocessor']).transform(train)

In [12]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target,has_url,has_mention,has_hashtag,text_len,text_counter
0,1,,,Our Deeds Reason # May ALLAH Forgive us,1,0,0,1,69,13
1,4,,,Forest fire near La Ronge Sask . Canada,1,0,0,0,38,7
2,5,,,All residents asked 'shelter place ' notified ...,1,0,0,0,133,22
3,6,,,"13,000 people receive # evacuation orders Cali...",1,0,0,1,65,8
4,7,,,Just got sent photo Ruby # smoke # pours school,1,0,0,1,88,16
