Import the modules and load the raw data

In [16]:
%time
import os
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import sklearn
import textblob
import nltk

from helpers import *

os.environ['KAGGLE_CONFIG_DIR'] = "../.kaggle/"
!kaggle competitions download -c nlp-getting-started
!unzip -n 'nlp-getting-started'

print("Python version:", sys.version)
print("Version info.:", sys.version_info)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("skearn version:", sklearn.__version__)

for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.53 µs
nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  nlp-getting-started.zip
Python version: 3.8.5 (default, May 27 2021, 13:30:53) 
[GCC 9.3.0]
Version info.: sys.version_info(major=3, minor=8, micro=5, releaselevel='final', serial=0)
pandas version: 1.2.3
numpy version: 1.19.5
skearn version: 0.24.1
./nlp-getting-started.zip
./test.csv
./helpers.py
./main.py
./sample_submission.csv
./main.ipynb
./train.csv
./__pycache__/helpers.cpython-38.pyc


Load the raw data

In [17]:
train_df = pd.read_csv("train.csv", index_col="id")
train_df.sample(5)

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2075,casualty,"Toledo, OH",Casualty Team: Ice Cream Recall Sends Chill Th...,0
9059,structural%20failure,USA,Virgin galactic crash: early unlocking of brak...,1
5178,fatalities,,Injuries Illnesses and Fatalities Latest Numbe...,1
10702,wreck,,the sunset boys wreck my bed original 1979 u...,1
3074,deaths,,@Eazzy_P we will never know what would have ha...,1


Convert the text to lower case

In [18]:
train_df["text_clean"] = train_df["text"].apply(lambda x: x.lower())
train_df[1:2]

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada


Convert contractions to non-contraction form (eg. I'd -> I had)

In [19]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: contractions.fix(x))
train_df[67:68]


Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
96,accident,CLVLND,'I can't have kids cuz I got in a bicycle acci...,0,'i can not have kids cuz i got in a bicycle ac...


Remove any URLs from the text

In [20]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_URL(x))
train_df[197:198]

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
277,ambulance,L. A.,http://t.co/pWwpUm6RBj Twelve feared killed in...,1,twelve feared killed in pakistani air ambulan...


Remove HTML tags

In [21]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_html(x))
train_df[62:63]

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
89,ablaze,Twitter Lockout in progress,Rene Ablaze &amp; Jacinta - Secret 2k13 (Falle...,0,rene ablaze jacinta - secret 2k13 (fallen ski...


Remove non-ASCII

In [22]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_non_ascii(x))
train_df[38:39]

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
56,ablaze,,Barbados #Bridgetown JAMAICA ÛÒ Two cars set ...,1,barbados #bridgetown jamaica two cars set abl...


Remove special characters

In [23]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_special_characters(x))
train_df[143:144]

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
205,airplane%20accident,Hyderabad Telangana INDIA,Horrible Accident Man Died In Wings of Airpla...,1,horrible accident man died in wings of airpla...


Remove punctuation

In [24]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_punct(x))
train_df[5:6]

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,,,#RockyFire Update => California Hwy. 20 closed...,1,rockyfire update california hwy 20 closed in ...


Clean the rest

In [25]:
%%time
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: other_clean(x))
train_df[1844:1845]

CPU times: user 1.05 s, sys: 0 ns, total: 1.05 s
Wall time: 1.05 s


Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2651,crashed,Buenos Aires,MH370: Intact part lifts odds plane glided not...,1,malaysia airlines flight 370 intact part lifts...


Remove spelling errors

In [26]:
# %%time
# train_df["text_clean"] = train_df["text_clean"].apply(lambda x: textblob.TextBlob(x).correct())

Break words into a list

In [27]:
%%time
train_df['tokenized'] = train_df['text_clean'].apply(nltk.tokenize.word_tokenize)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/jbrunner/nltk_data'
    - '/home/jbrunner/git-repos/kaggle/venv/nltk_data'
    - '/home/jbrunner/git-repos/kaggle/venv/share/nltk_data'
    - '/home/jbrunner/git-repos/kaggle/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [28]:
%%time
train_df['stopwords_removed'] = train_df['tokenized'].apply(lambda x: [word for word in x if word not in nltk.corpus.stop])

KeyError: 'tokenized'

Try to break words down to their root (ie. stemming)

In [29]:
%%time
train_df['stemmer'] = train_df['stopwords_removed'].apply(lambda x: stemmer(x))


KeyError: 'stopwords_removed'

Look at the data

In [30]:
train_df.sample(10)

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9943,trouble,,Love how I don't get in any trouble for having...,0,love how i do not get in any trouble for havin...
175,aftershock,,That moment when you get on a scary roller coa...,0,that moment when you get on a scary roller coa...
8203,riot,,To All The Meat-Loving Feminists Of The World ...,0,to all the meatloving feminists of the world r...
156,aftershock,US,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/vA...,0,320 ir icemoon aftershock djicemoon dubste...
1032,bleeding,,you could slit my throat and I'd apologize for...,0,you could slit my throat and I would apologize...
2144,catastrophe,,#nar #phuket Ultimate #preparedness library: h...,1,nar phuket ultimate preparedness library prep...
6758,lightning,"Rapid City, Black Hills, SD",NWS says thunderstorms with deadly lightning w...,1,nws says thunderstorms with deadly lightning w...
8490,screamed,with Doflamingo,//kinda screamed &gt;_&lt; https://t.co/MSUY4q...,0,kind of screamed
8522,screaming,Jariana Town,@justinbieber @ArianaGrande Can you hear me sc...,0,justinbieber arianagrande can you hear me scre...
8654,sinkhole,NY,Gaping sinkhole opens up in Brooklyn New York ...,1,gaping sinkhole opens up in brooklyn new york
