In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from string import punctuation
import pandas as pd
import nltk
import contractions
import re
import pickle

# Fetch the NLTK resources this notebook relies on: the stop-word corpus and
# the Punkt tokenizer data used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load Punkt from the 'punkt_tab' package instead of the
# pickled 'punkt' models; fetch it too so word_tokenize works on a fresh
# environment regardless of the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the two CSV splits from the local data/ directory into DataFrames.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [3]:
# Show the dimensions of the training frame as a (rows, columns) tuple.
train.shape

(7613, 5)

In [4]:
# Extract the 'text' column of the training frame as a plain Python list.
# Series.tolist() walks the column in positional order, so unlike a
# label-based .at[i, ...] loop it does not depend on the index being 0..n-1.
train_texts = train['text'].tolist()
train_texts[:5]

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Forest fire near La Ronge Sask. Canada',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ']

In [5]:
# Extract the 'text' column of the test frame as a plain Python list.
# Series.tolist() walks the column in positional order, so unlike a
# label-based .at[i, ...] loop it does not depend on the index being 0..n-1.
test_texts = test['text'].tolist()
test_texts[:5]

['Just happened a terrible car crash',
 'Heard about #earthquake is different cities, stay safe everyone.',
 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
 'Apocalypse lighting. #Spokane #wildfires',
 'Typhoon Soudelor kills 28 in China and Taiwan']

In [6]:
# Select the training texts as the input of the cleaning steps that follow.
# This binds a second name to the same list object; no copy is made.
texts = train_texts

In [7]:
def remove_url(text):
    """Return `text` with every URL-like substring replaced by one space.

    The pattern is case-insensitive and recognises spans that start with
    ``http://`` / ``https://``, with ``www`` plus up to three digits and a
    dot, or with a dotted host name followed by ``/``. Trailing sentence
    punctuation is excluded from the match and stays in the text.
    """
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    without_urls = re.sub(url_pattern, " ", text)
    return without_urls

In [8]:
# Strip URLs from every text, rebinding `texts` to the newly built list.
texts = list(map(remove_url, texts))

In [9]:
# Split each URL-free text into a list of tokens with NLTK's word_tokenize.
texts_tokenized = list(map(word_tokenize, texts))
texts_tokenized[:5]

[['Our',
  'Deeds',
  'are',
  'the',
  'Reason',
  'of',
  'this',
  '#',
  'earthquake',
  'May',
  'ALLAH',
  'Forgive',
  'us',
  'all'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask', '.', 'Canada'],
 ['All',
  'residents',
  'asked',
  'to',
  "'shelter",
  'in',
  'place',
  "'",
  'are',
  'being',
  'notified',
  'by',
  'officers',
  '.',
  'No',
  'other',
  'evacuation',
  'or',
  'shelter',
  'in',
  'place',
  'orders',
  'are',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'in',
  'California'],
 ['Just',
  'got',
  'sent',
  'this',
  'photo',
  'from',
  'Ruby',
  '#',
  'Alaska',
  'as',
  'smoke',
  'from',
  '#',
  'wildfires',
  'pours',
  'into',
  'a',
  'school']]

In [10]:
# Build the English stop-word set once. Calling stopwords.words('english')
# inside the comprehension re-evaluates it for every single token, and a set
# gives constant-time membership tests instead of a list scan.
english_stopwords = set(stopwords.words('english'))

# Drop tokens that are stop words. The comparison is case-sensitive and runs
# before lowercasing, so capitalised forms such as 'Our' or 'No' are kept.
texts_without_stopwords = [
    [word for word in text if word not in english_stopwords]
    for text in texts_tokenized
]
texts_without_stopwords[:5]

[['Our',
  'Deeds',
  'Reason',
  '#',
  'earthquake',
  'May',
  'ALLAH',
  'Forgive',
  'us'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask', '.', 'Canada'],
 ['All',
  'residents',
  'asked',
  "'shelter",
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'No',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'California'],
 ['Just',
  'got',
  'sent',
  'photo',
  'Ruby',
  '#',
  'Alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [11]:
# Lowercase every remaining token, keeping the per-text list structure.
texts_lowercased = [list(map(str.lower, tokens)) for tokens in texts_without_stopwords]
texts_lowercased[:5]

[['our',
  'deeds',
  'reason',
  '#',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada'],
 ['all',
  'residents',
  'asked',
  "'shelter",
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#',
  'alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [12]:
spell = SpellChecker()

# Correct each distinct token only once: the correction of a word depends only
# on the word itself, and the lookup is slow, so reuse the result for repeats.
unique_words = {word for text in texts_lowercased for word in text}

# Newer pyspellchecker releases return None from correction() when no
# candidate is found; keep the original token in that case so later steps
# (e.g. ' '.join) never receive None.
corrections = {word: spell.correction(word) or word for word in unique_words}

# To skip spell correction entirely, use: texts_corrected = texts_lowercased
texts_corrected = [[corrections[word] for word in text] for text in texts_lowercased]
texts_corrected[:5]

[['our',
  'deeds',
  'reason',
  '#',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us'],
 ['forest', 'fire', 'near', 'la', 'range', 'ask', '.', 'canada'],
 ['all',
  'residents',
  'asked',
  'shelter',
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#',
  'alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [13]:
# Expand contractions token by token, then re-join and re-split on spaces so
# that a multi-word expansion becomes separate tokens.
# The original second statement rebuilt the list from texts_corrected, which
# silently discarded the result of contractions.fix; both steps now operate on
# the expanded tokens.
texts_expanded = [
    ' '.join(contractions.fix(word) for word in text).split(' ')
    for text in texts_corrected
]
texts_expanded[:5]

[['our',
  'deeds',
  'reason',
  '#',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us'],
 ['forest', 'fire', 'near', 'la', 'range', 'ask', '.', 'canada'],
 ['all',
  'residents',
  'asked',
  'shelter',
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#',
  'alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [14]:
# All ASCII punctuation marks except '!', '#' and '?', as a single string.
# Those three are deleted from string.punctuation so that the filtering step
# that uses this string leaves them in the texts.
punctuation_cleaned = punctuation.translate(str.maketrans('', '', '!#?'))
punctuation_cleaned

'"$%&\'()*+,-./:;<=>@[\\]^_`{|}~'

In [15]:
# Remove punctuation tokens from every text.
# punctuation_cleaned is a string, so `in` is a substring test: a token is
# dropped when it occurs anywhere inside that string. That covers single
# marks, the empty string, and contiguous runs such as './'.
# NOTE(review): confirm the substring behaviour is intended rather than a
# per-character membership check.
texts_without_punctuation = [
    list(filter(lambda token: token not in punctuation_cleaned, tokens))
    for tokens in texts_expanded
]
texts_without_punctuation[:5]

[['our',
  'deeds',
  'reason',
  '#',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us'],
 ['forest', 'fire', 'near', 'la', 'range', 'ask', 'canada'],
 ['all',
  'residents',
  'asked',
  'shelter',
  'place',
  'notified',
  'officers',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#',
  'alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [16]:
# Glue each token list back into one space-separated string per text.
texts_final = list(map(' '.join, texts_without_punctuation))
texts_final[:5]

['our deeds reason # earthquake may allah forgive us',
 'forest fire near la range ask canada',
 'all residents asked shelter place notified officers no evacuation shelter place orders expected',
 '13,000 people receive # wildfires evacuation orders california',
 'just got sent photo ruby # alaska smoke # wildfires pours school']

In [17]:
# Persist the cleaned texts as a pickle for later use.
with open('data/cleaned.pkl', mode='wb') as cleaned_file:
    pickle.dump(texts_final, cleaned_file)

In [18]:
# Extract the 'target' column of the training frame as a plain Python list.
# Series.tolist() reads the column positionally (no reliance on a 0..n-1
# index) and converts the values to native Python scalars, so the pickled
# list holds plain ints rather than NumPy scalar objects.
target = train['target'].tolist()
target[:5]

[1, 1, 1, 1, 1]

In [19]:
# Persist the target labels as a pickle, alongside the cleaned texts.
with open('data/target.pkl', mode='wb') as target_file:
    pickle.dump(target, target_file)