# Data Preprocessing

In [45]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import numpy as np
import seaborn as sns
import re
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/makeschoolloaner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/makeschoolloaner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load and Explore Data

In [2]:
# pandas is already imported in the first cell; the import is repeated here.
import pandas as pd
# Read the tab-separated file; the path is relative to the working directory.
df = pd.read_csv('WikiQA-train.tsv', sep='\t')
# Show the first rows as the cell output.
df.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0


### Check for Null Values

In [3]:
# Count the missing values in each column.
df.isnull().sum()

QuestionID       0
Question         0
DocumentID       0
DocumentTitle    0
SentenceID       0
Sentence         0
Label            0
dtype: int64

### Get Info about DataFrame

In [4]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20347 entries, 0 to 20346
Data columns (total 7 columns):
QuestionID       20347 non-null object
Question         20347 non-null object
DocumentID       20347 non-null object
DocumentTitle    20347 non-null object
SentenceID       20347 non-null object
Sentence         20347 non-null object
Label            20347 non-null int64
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


## Helper Functions

### Clean Text

This function uses regular expressions to remove HTML tags, punctuation, and digits from each string. It then lowercases the text and appends every non-empty result to a list.

In [5]:
def clean_data(col):
    """Normalize every string in ``col`` and return the non-empty results.

    For each entry, HTML-style tags are removed first, then every character
    that is neither a word character nor whitespace, then runs of digits;
    the remainder is lowercased. Entries that end up as the empty string are
    left out, so the returned list may be shorter than ``col``.
    """
    # Applied in this order; each match is replaced by the empty string.
    patterns = ('<.*?>', r'[^\w\s]', r'\d+')

    kept = []
    for raw in col:
        text = raw
        for pattern in patterns:
            text = re.sub(pattern, '', text)
        text = text.lower()
        if text:
            kept.append(text)
    return kept

### Tokenize Words

This function uses NLTK to convert sentences into word tokens.

In [6]:
def tokenize_w(words):
    """Tokenize each non-empty string in ``words`` with ``nltk.word_tokenize``.

    Returns a list holding one token list per non-empty input string, in
    input order; empty strings are skipped.
    """
    token_lists = []
    for sentence in words:
        if sentence == '':
            continue
        token_lists.append(nltk.word_tokenize(sentence))
    return token_lists

### Lemmatize Word Tokens

In [67]:
# Module-level WordNet lemmatizer instance; lemmatize_w reads this name.
lemmatizer = WordNetLemmatizer()

def lemmatize_w(words):
    """Lemmatize every token in a list of token lists.

    ``words`` is an iterable of token sequences. The nesting is flattened:
    the result is one list containing ``lemmatizer.lemmatize(token)`` for
    each token, in input order.
    """
    return [
        lemmatizer.lemmatize(token)
        for sentence_tokens in words
        for token in sentence_tokens
    ]

In [68]:
# Run the three helper stages on the Question column and spot-check the result.
clean_w = clean_data(df['Question'])
tokens = tokenize_w(clean_w)
# lemmatize_w flattens the per-question token lists into one list of lemmas.
lemms = lemmatize_w(tokens)
print(lemms[0:10])

['how', 'are', 'glacier', 'cave', 'formed', 'how', 'are', 'glacier', 'cave', 'formed']


### Sort Processed Text

In [103]:
def sort_processed(txt):
    """Return the distinct items of ``txt`` as a list sorted in ascending order."""
    distinct = list(set(txt))
    distinct.sort()
    return distinct

### Unique Classes

In [10]:
def unique(lst):
    """Convert each item of ``lst`` to a tuple and keep first occurrences only.

    Every item is passed through ``tuple()``, so a string item becomes a
    tuple of its characters. The result preserves the order in which each
    distinct tuple was first seen.
    """
    seen = []
    for item in lst:
        as_tuple = tuple(item)
        if as_tuple in seen:
            continue
        seen.append(as_tuple)
    return seen

### Pair Questions with Corresponding Tags

In [104]:
def tag_question(col1, col2):
    """Pair each cleaned question with the cleaned tag from the same row.

    Parameters
    ----------
    col1 : iterable of str
        Raw question strings.
    col2 : iterable of str
        Raw tag strings, positionally aligned with ``col1``.

    Returns
    -------
    list of (str, str)
        ``(cleaned_question, cleaned_tag)`` tuples in input order. A row is
        left out when either side cleans down to the empty string.
    """
    clean_docs = []

    # Clean row by row. clean_data() drops entries that become empty, so
    # cleaning the two columns separately and zipping afterwards would shift
    # every later pair whenever only one side of a row was dropped.
    for question, tag in zip(col1, col2):
        cleaned_question = clean_data([question])
        cleaned_tag = clean_data([tag])
        if cleaned_question and cleaned_tag:
            clean_docs.append((cleaned_question[0], cleaned_tag[0]))
    return clean_docs

In [105]:
# Pair cleaned questions with cleaned document titles and preview ten pairs.
test = tag_question(df['Question'], df['DocumentTitle'])
print(test[0:10])

[('how are glacier caves formed', 'glacier cave'), ('how are glacier caves formed', 'glacier cave'), ('how are glacier caves formed', 'glacier cave'), ('how are glacier caves formed', 'glacier cave'), ('how are glacier caves formed', 'glacier cave'), ('how are the directions of the velocity and force vectors related in a circular motion', 'circular motion'), ('how are the directions of the velocity and force vectors related in a circular motion', 'circular motion'), ('how are the directions of the velocity and force vectors related in a circular motion', 'circular motion'), ('how are the directions of the velocity and force vectors related in a circular motion', 'circular motion'), ('how are the directions of the velocity and force vectors related in a circular motion', 'circular motion')]


### Preprocessing Pipeline

In [106]:
def preprocessing(col):
    """Run the full text pipeline on ``col``.

    Stages, innermost call first: ``clean_data`` -> ``tokenize_w`` ->
    ``lemmatize_w`` -> ``sort_processed``. The return value is whatever
    ``sort_processed`` produces from the lemmatized tokens.
    """
    return sort_processed(lemmatize_w(tokenize_w(clean_data(col))))

In [107]:
# Build the sorted, de-duplicated list of question lemmas and preview the first 100.
words = preprocessing(df['Question'])
print(words[0:100])

['a', 'aa', 'ability', 'able', 'abolished', 'abortion', 'about', 'abraham', 'academy', 'acarina', 'accessory', 'accidentally', 'accompanied', 'according', 'account', 'accused', 'acid', 'acquire', 'acquisition', 'acre', 'acronym', 'act', 'activated', 'active', 'activity', 'actor', 'actual', 'adam', 'adapter', 'add', 'address', 'adem', 'adenosine', 'adiabatic', 'adjustment', 'administered', 'administrative', 'adult', 'advantage', 'advisor', 'advocacy', 'aerosmith', 'affair', 'affect', 'affected', 'affinity', 'afge', 'afghanistan', 'africa', 'african', 'afrotc', 'after', 'afterimage', 'against', 'age', 'agency', 'agent', 'aggression', 'agi', 'agreement', 'aid', 'air', 'aircraft', 'airline', 'airplane', 'airport', 'ala', 'alarm', 'alaska', 'album', 'alcohol', 'aleppo', 'alert', 'algoma', 'algorithm', 'alighieri', 'alive', 'alkaline', 'alkaseltzer', 'all', 'allan', 'allow', 'along', 'alpine', 'alright', 'alt', 'aluminum', 'alvin', 'always', 'am', 'amber', 'ambersnail', 'amendment', 'america

### Creating Pickle Files and Storing Variables for Training

In [108]:
# Sorted, de-duplicated lemmas from the questions (same call as the preview cell above).
words = preprocessing(df['Question'])
# Sorted, de-duplicated lemmas from the document titles.
classes = preprocessing(df['DocumentTitle'])
# NOTE(review): `classes` appears to be a list of strings, and unique() calls
# tuple() on each item, so every entry here would become a tuple of single
# characters -- confirm this is the intended shape of `unique_classes`.
unique_classes = unique(classes)
# (cleaned question, cleaned title) pairs.
documents = tag_question(df['Question'], df['DocumentTitle'])

# Persist the variables with IPython's %store magic (restorable via `%store -r`).
%store words
%store classes
%store unique_classes
%store documents
%store lemmatizer

Stored 'words' (list)
Stored 'classes' (list)
Stored 'unique_classes' (list)
Stored 'documents' (list)
Stored 'lemmatizer' (WordNetLemmatizer)


In [111]:
# Serialize the vocabularies to disk. The context managers make sure each
# file handle is flushed and closed, even if pickling raises.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)