In [1]:
import pandas as pd
import numpy as np
import matplotlib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import defaultdict
import string
import re

STOPWORDS = set(stopwords.words('english'))
pd.set_option('display.max_colwidth', -1)

In [2]:
train = pd.read_csv('data/train_pre_processing.csv')
test = pd.read_csv('data/test_pre_processing.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 54 columns):
keyword                              7552 non-null object
location                             5080 non-null object
text                                 7613 non-null object
keyword_grouped                      7552 non-null object
text_contain_keyword                 7613 non-null bool
total_words                          7613 non-null int64
total_upper_chars                    7613 non-null int64
total_numbers_chars                  7613 non-null int64
total_special_chars                  7613 non-null int64
contain_question                     7613 non-null bool
contain_link                         7613 non-null bool
contain_hashtag                      7613 non-null bool
contain_upper_words                  7613 non-null bool
total_3_words                        7613 non-null int64
total_4_words                        7613 non-null int64
total_5_words                        76

## Text Cleaning

In [3]:
def remove_url(text):
    text = re.compile(r'https?://\S+|www\.\S+').sub(r' ', text)
    text = re.compile(r'http\S+').sub(r'', text)
    text = re.compile(r'www\S+').sub(r'', text)
    text = re.compile(r'pic.twitter.com\S+').sub(r' ', text)
    return text

In [4]:
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r' ',text)

In [5]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [6]:
def remove_punctuation(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [7]:
def remove_stopwords(text):
    words = word_tokenize(text)
    words = [w for w in words if not w in STOPWORDS]
    return ' '.join(words)

In [8]:
spell = SpellChecker()
def spellcheck(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [9]:
def clean_non_ascii(text):
    return re.compile(r'[^A-Za-z0-9\.\'!\?,\$]').sub(r' ', text)

In [10]:
def clean_spaces(text):
    # Reemplazar multiples espacios en uno
    return re.sub('\s{2,}', ' ', text)

In [11]:
def clean_text(text):
    text = remove_url(text)
    text = remove_html(text)
    text = remove_emoji(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = clean_non_ascii(text)
    text = clean_spaces(text)
#    text = spellcheck(text)
    return text

In [12]:
train['text_clean'] = train.text.transform(lambda x: clean_text(str(x)))
test['text_clean'] = test.text.transform(lambda x: clean_text(str(x)))

In [13]:
train.text_clean.sample(5)

7141    shoes Asics GTII Super Red 20 11 Ronnie Fieg Kith Red White 3M x gel grey volcano 2               
1987    fitness Knee Damage Solution                                                                      
7256    I stand alone dont piss moan choices made If I must reap whirlwind Ill demeanor calm staid        
5942    I screamed fuck hond                                                                              
4943    ltmeltdown proportions commences I manage calm long enough turn waters hot wait steam cloud vision
Name: text_clean, dtype: object

## Mean Word Length

In [14]:
train['mean_word_len'] = train.text.transform(lambda x: np.mean([len(word) for word in str(x).split()]))
test['mean_word_len'] = test.text.transform(lambda x: np.mean([len(word) for word in str(x).split()]))

## N-gram

In [15]:
def get_ngrams(text, n_gram=1):
    token = []
    words = word_tokenize(text)
    for w in words:
        filter_words = STOPWORDS.union(set(string.punctuation))
        if w not in filter_words:
            token.append(w)
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]

## Bigrams

In [16]:
disaster_bigrams = defaultdict(int)
nondisaster_bigrams = defaultdict(int)

for tweet in train.loc[train['target'] == 1]['text_clean']:
    for word in get_ngrams(tweet, 2):
        disaster_bigrams[word] += 1
        
for tweet in train.loc[train['target'] == 0]['text_clean']:
    for word in get_ngrams(tweet, 2):
        nondisaster_bigrams[word] += 1

In [17]:
# Nos mantenemos con los que tengan una frecuencia mayor o igual a 10 en desastre
delete_keys = [k for k, v in disaster_bigrams.items() if v < 10]
for key in delete_keys:
    del disaster_bigrams[key]

delete_keys = [k for k, v in nondisaster_bigrams.items() if v < 10]
for key in delete_keys:
    del nondisaster_bigrams[key]

In [18]:
# Feature booleano para saber si el texto tiene un bigrama de desastre o no

train['text_disaster_bigram'] = train.text_clean.transform(lambda x: any(disngram in disaster_bigrams.keys() for disngram in get_ngrams(x, 2)))
train['text_nondisaster_bigram'] = train.text_clean.transform(lambda x: any(disngram in nondisaster_bigrams.keys() for disngram in get_ngrams(x, 2)))
test['text_disaster_bigram'] = test.text_clean.transform(lambda x: any(disngram in disaster_bigrams.keys() for disngram in get_ngrams(x, 2)))
test['text_nondisaster_bigram'] = test.text_clean.transform(lambda x: any(disngram in nondisaster_bigrams.keys() for disngram in get_ngrams(x, 2)))

In [19]:
train['text_disaster_bigram'].value_counts()

False    6626
True     987 
Name: text_disaster_bigram, dtype: int64

In [20]:
train['text_nondisaster_bigram'].value_counts()

False    6693
True     920 
Name: text_nondisaster_bigram, dtype: int64

## Trigrams

In [21]:
disaster_trigrams = defaultdict(int)
nondisaster_trigrams = defaultdict(int)

for tweet in train.loc[train['target'] == 1]['text_clean']:
    for word in get_ngrams(tweet, 3):
        disaster_trigrams[word] += 1
        
for tweet in train.loc[train['target'] == 0]['text_clean']:
    for word in get_ngrams(tweet, 2):
        nondisaster_trigrams[word] += 1

In [22]:
# Nos mantenemos con los que tengan una frecuencia mayor o igual a 3
delete_keys = [k for k, v in disaster_trigrams.items() if v < 3]
for key in delete_keys:
    del disaster_trigrams[key]

delete_keys = [k for k, v in nondisaster_trigrams.items() if v < 3]
for key in delete_keys:
    del nondisaster_trigrams[key]

In [23]:
# Feature booleano para saber si el texto tiene un bigrama de desastre o no

train['text_disaster_trigram'] = train.text_clean.transform(lambda x: any(disngram in disaster_trigrams.keys() for disngram in get_ngrams(x, 3)))
train['text_nondisaster_trigram'] = train.text_clean.transform(lambda x: any(disngram in nondisaster_trigrams.keys() for disngram in get_ngrams(x, 2)))
test['text_disaster_trigram'] = test.text_clean.transform(lambda x: any(disngram in disaster_trigrams.keys() for disngram in get_ngrams(x, 3)))
test['text_nondisaster_trigram'] = test.text_clean.transform(lambda x: any(disngram in nondisaster_trigrams.keys() for disngram in get_ngrams(x, 2)))

In [24]:
train['text_disaster_trigram'].value_counts()

False    6408
True     1205
Name: text_disaster_trigram, dtype: int64

In [25]:
train['text_nondisaster_trigram'].value_counts()

False    5306
True     2307
Name: text_nondisaster_trigram, dtype: int64

## TF IDF

In [26]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    norm='l2',
    min_df=0,
    smooth_idf=False,
    max_features=5000)
X = vectorizer.fit_transform(train['text_clean'])

In [27]:
df_tfidf = pd.DataFrame(X.todense(), columns = vectorizer.get_feature_names())
df_tfidf.drop(labels = ['location','text', 'target'], axis=1, inplace=True)
train = train.join(df_tfidf)
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 5057 entries, keyword to zss
dtypes: bool(21), float64(5000), int64(31), object(5)
memory usage: 292.7+ MB
None


In [30]:
X = vectorizer.transform(test['text_clean'])
df_tfidf = pd.DataFrame(X.todense(), columns = vectorizer.get_feature_names())
df_tfidf.drop(labels = ['id','location','text', 'target'], axis=1, inplace=True)
test = test.join(df_tfidf)
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Columns: 5056 entries, id to zss
dtypes: bool(21), float64(4999), int64(31), object(5)
memory usage: 125.4+ MB
None


In [31]:
target = train.target
train.drop(columns=['target'], inplace=True)
train['target'] = target
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 5057 entries, keyword to target
dtypes: bool(21), float64(5000), int64(31), object(5)
memory usage: 292.7+ MB


In [32]:
train.to_csv('data/train_pre_processing_nlp_5000.csv', index=False)
test.to_csv('data/test_pre_processing_nlp_5000.csv', index=False)