Import the modules and load the raw data

In [191]:
%time
import os
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import sklearn
import nltk

nltk.download('punkt')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords

from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer
text_blob = Blobber(analyzer=NaiveBayesAnalyzer())

from utils import *
from ..helpers import *

os.environ['KAGGLE_CONFIG_DIR'] = "../.kaggle/"
# !kaggle competitions download -c nlp-getting-started
# !unzip -n 'nlp-getting-started'

print("Python version:", sys.version)
print("Version info.:", sys.version_info)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("skearn version:", sklearn.__version__)

for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.2 µs


[nltk_data] Downloading package punkt to /home/jbrunner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jbrunner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ImportError: attempted relative import with no known parent package

Load the raw data

In [None]:
# Load the raw training set.
# NOTE(review): `id="id"` presumably selects the index column (cf. the
# commented-out pd.read_csv equivalent) — confirm in load_csv_data.
train_df = load_csv_data(filename="train.csv", id="id")
# train_df = pd.read_csv("train.csv", index_col="id")

# Only the last expression of a cell is rendered, so a bare `train_df.head(5)`
# here would be silently discarded; display the preview explicitly.
display(train_df.head(5))

# NOTE(review): save_data_as_csv presumably writes "cleaned_train.csv", which
# is re-loaded on the next line — confirm in the helper.
save_data_as_csv(train_df)
train_df = load_csv_data(filename="cleaned_train.csv", id="id")
train_df.head(5)


Convert the text to lower case

In [None]:
# Start the cleaned column from a lower-cased copy of the raw text.
# The vectorised `.str` accessor replaces a per-row Python lambda and
# propagates missing values instead of raising on them.
train_df["text_clean"] = train_df["text"].str.lower()

# Spot-check a single row (positional slice).
train_df.iloc[1:2]

Expand contractions into their full form (e.g. "I'd" -> "I would")

In [None]:
# Apply contractions.fix to every cleaned text; the function is passed
# directly instead of being wrapped in a lambda.
train_df["text_clean"] = train_df["text_clean"].apply(contractions.fix)

# Spot-check one row (positional slice).
train_df.iloc[67:68]


Remove any URLs from the text

In [None]:
# Run the remove_URL helper over every cleaned text.
train_df["text_clean"] = train_df["text_clean"].apply(remove_URL)

# Spot-check one row (positional slice).
train_df.iloc[197:198]

Remove HTML tags

In [None]:
# Run the remove_html helper over every cleaned text.
train_df["text_clean"] = train_df["text_clean"].apply(remove_html)

# Spot-check one row (positional slice).
train_df.iloc[62:63]

Remove non-ASCII

In [None]:
# Run the remove_non_ascii helper over every cleaned text.
train_df["text_clean"] = train_df["text_clean"].apply(remove_non_ascii)

# Spot-check one row (positional slice).
train_df.iloc[38:39]

Remove special characters

In [None]:
# Run the remove_special_characters helper over every cleaned text.
train_df["text_clean"] = train_df["text_clean"].apply(remove_special_characters)

# Spot-check one row (positional slice).
train_df.iloc[143:144]

Remove punctuation

In [None]:
# Run the remove_punct helper over every cleaned text.
train_df["text_clean"] = train_df["text_clean"].apply(remove_punct)

# Spot-check one row (positional slice).
train_df.iloc[5:6]

Clean the rest

In [None]:
%%time
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: other_clean(x))
train_df[1844:1845]

Correct spelling errors

In [None]:
%%time
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: text_blob(x).correct())
print("Null values...")
train_df.text.isnull().sum()
train_df["text_clean"] = train_df["text_clean"].fillna("")


Tokenize the text into a list of words

In [None]:
%%time
train_df['tokenized'] = train_df['text_clean'].apply(word_tokenize)

In [None]:
%%time
stop = set(stopwords.words('english'))
train_df['stopwords_removed'] = train_df['tokenized'].apply(lambda x: [word for word in x if word not in stop])

Try to reduce words to their root form (i.e. stemming)

In [None]:
%%time
train_df['stemmer'] = train_df['stopwords_removed'].apply(lambda x: stemmer(x))


Look at the data

In [None]:
# Preview ten random rows; a fixed seed keeps the displayed sample identical
# across "Restart & Run All" executions.
train_df.sample(10, random_state=42)