In [4]:
## DATA MANIPULATION
import pandas as pd 
import numpy as np 

## STRING MANIPULATION AND NLP HELP FUNS
import re, string
import nltk

## FILE SAVING
import pickle

## SKLEARN
from sklearn.model_selection import train_test_split

### Load and split

In [5]:
# Load the raw training data (path is relative to the notebook's working directory).
data = pd.read_csv('../data/train.csv')

# Every column from the third one onward is treated as a label column.
labels = list(data.columns[2:])

# Replace missing comments with a placeholder. The result is assigned back to the
# column rather than calling fillna(inplace=True) on the column selection: the
# in-place form acts on an intermediate object and, under pandas Copy-on-Write,
# does not update `data` itself.
data['comment_text'] = data['comment_text'].fillna("unknown")

# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(data,
                                random_state=42,
                                test_size=0.33,
                                shuffle=True)

### Preprocess text

In [7]:
## SKLEARN COMPATIBLE TEXT PREPROCESSOR 
class preprocessComment(object):
    """Text cleaner exposing ``fit`` / ``transform`` so it can be used like a
    scikit-learn transformer.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    applies :meth:`preprocess` to every element of its input.

    Note: constructing an instance reads the NLTK English stop-word list and
    creates a WordNet lemmatizer, so the corresponding NLTK data must already
    be available locally.
    """

    def __init__(self):
        # NLTK helper functions
        # Stored as a set so the per-token membership test in `preprocess`
        # is a hash lookup instead of a linear scan of the stop-word list.
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.tokenizer = nltk.word_tokenize

    def fit(self, X, y=None):
        """No-op fit; returns ``self``.

        ``y`` is optional so the object can be fitted as ``fit(X)`` as well
        as ``fit(X, y)``.
        """
        return self

    def preprocess(self, s):
        '''
        Input:
        Sentence string (any other value is converted with ``str`` first)

        Transformations:
        Lower case -> Tokenize -> Remove stop words ->
        Remove non-alphabetic tokens -> Lemmatize

        Output:
        Preprocessed sentence string (remaining tokens joined by single spaces)
        '''
        tokens = self.tokenizer(str(s).lower())
        kept = [
            self.lemmatizer.lemmatize(tok)
            for tok in tokens
            if tok not in self.stop_words and tok.isalpha()
        ]
        return ' '.join(kept)

    def transform(self, X):
        """Apply :meth:`preprocess` to every element of ``X``.

        Objects providing ``.apply`` (e.g. a pandas Series) are transformed
        through it and keep their type and index; any other iterable of
        strings is returned as a list of cleaned strings.
        """
        if hasattr(X, 'apply'):
            return X.apply(self.preprocess)
        return [self.preprocess(s) for s in X]

In [8]:
## PREPROCESS DATA
proc = preprocessComment()

# Build new frames with the cleaned text via assign() instead of writing a
# column into the frames returned by train_test_split; column assignment on
# those split outputs is the pattern that triggers SettingWithCopyWarning.
train = train.assign(comment_text=proc.transform(train['comment_text']))
valid = valid.assign(comment_text=proc.transform(valid['comment_text']))

### Save preprocessed files

In [16]:
# Persist both splits as a single (train, valid) tuple. The context manager
# guarantees the file is flushed and closed even if pickling raises.
with open('../data/preprocessed.pkl', 'wb') as f:
    pickle.dump((train, valid), f)