In [5]:
## DATA MANIPULATION
import pandas as pd 
import numpy as np 
import json

## STRING MANIPULATION AND NLP HELP FUNS
import re, string, copy
import nltk
from nltk import WordNetLemmatizer

## FILE SAVING
import pickle

## SKLEARN
from sklearn.model_selection import train_test_split

### Load and split

In [2]:
data = pd.read_csv('../data/train.csv')             # load data
labels = list(data.columns[2:])                     # get labels
data['comment_text'].fillna("unknown", inplace=True)# fill empties
train, valid = train_test_split(data,               # split into train & test
                                random_state=42, 
                                test_size=0.33, 
                                shuffle=True)

### Preprocessor [all credit goes to fizzbuzz from kaggle](https://www.kaggle.com/fizzbuzz/toxic-data-preprocessing)

In [15]:
class PatternTokenizer(object):
    '''Preprocessor credit goes to fizzbuzz from kaggle 
    (https://www.kaggle.com/fizzbuzz/toxic-data-preprocessing)'''
    def __init__(self, lower=True, initial_filters=r"[^a-z0-9!@#\$%\^\&\*_\-,\.' ]", re_path='../data/re_patterns.json',
                 remove_repetitions=True):
        self.lower = lower
        self.re_path = re_path
        self.initial_filters = initial_filters
        self.remove_repetitions = remove_repetitions
        self.patterns = None
        
    def process_text(self, text):
        f = open(self.re_path, 'r')
        self.patterns = json.load(f)
        x = self._preprocess(text)
        for target, patterns in self.patterns.items():
            for pat in patterns:
                x = re.sub(pat, target, x)
        x = re.sub(r"[^a-z' ]", ' ', x)
        return x.split()

    def process_ds(self, ds):
        ### ds = Data series
        f = open(self.re_path, 'r')
        self.patterns = json.load(f)
        # lower
        ds = copy.deepcopy(ds)
        if self.lower:
            ds = ds.str.lower()
        # remove special chars
        if self.initial_filters is not None:
            ds = ds.str.replace(self.initial_filters, ' ')
        # fuuuuck => fuck
        if self.remove_repetitions:
            pattern = re.compile(r"(.)\1{2,}", re.DOTALL) 
            ds = ds.str.replace(pattern, r"\1")

        for target, patterns in self.patterns.items():
            for pat in patterns:
                ds = ds.str.replace(pat, target)

        ds = ds.str.replace(r"[^a-z' ]", ' ')

        return ds.str.split()

    def _preprocess(self, text):
        # lower
        if self.lower:
            text = text.lower()

        # remove special chars
        if self.initial_filters is not None:
            text = re.sub(self.initial_filters, ' ', text)

        # fuuuuck => fuck
        if self.remove_repetitions:
            pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
            text = pattern.sub(r"\1", text)
        return text
        

tokenizer = PatternTokenizer()
train["comment_text"] = tokenizer.process_ds(train["comment_text"]).str.join(sep=" ")
valid["comment_text"] = tokenizer.process_ds(valid["comment_text"]).str.join(sep=" ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
train['comment_text']

29614     sockpuppetry case you have been accused of soc...
109036    i've read the archives and various national an...
110790    wikipedia is an encyclopedia yes lyrics which ...
80583     such as when you mention azeris are geneticall...
30047     werdna's rfa hi i'm still slightly wet behind ...
                                ...                        
119879    redirect talk john loveday experimental physicist
103694     back it up post the line here with the reference
131932    i won't stop that sometimes germanic equals ge...
146867    british bands i think you've mistaken scottish...
121958    you are wrong justin thompson is mentioned in ...
Name: comment_text, Length: 106912, dtype: object

### Preprocess text
- no longer used
- now using pattern tokenizer above by Fizzbuzz from Kaggle

In [5]:
## SKLEARN COMPATIBLE TEXT PREPROCESSOR 
class preprocessComment(object):
    def __init__(self):
        # NLTK helper functions
        self.stop_words = nltk.corpus.stopwords.words('english')
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.tokenizer = nltk.word_tokenize

    def fit(self, X, y):
        return self
    
    def preprocess(self, s):
        '''
        Input:
        Sentence string

        Transformations:
        Lower case -> Remove stop words ->
        Remove non-words -> Lemmatize -> 

        Output:
        Preprocessed sentence string
        '''
        words = str(s).lower()
        words = self.tokenizer(words)
        words = [w for w in words if not w in self.stop_words]
        words = [w for w in words if w.isalpha()]
        words = [self.lemmatizer.lemmatize(w) for w in words]
        return ' '.join(words)

    def transform(self, X):
        return X.apply(self.preprocess)

## PREPROCESS DATA
# proc = preprocessComment()
# train['comment_text'] = proc.transform(train['comment_text'])
# valid['comment_text'] = proc.transform(valid['comment_text'])

### Save preprocessed files

In [6]:
f = open('../data/preprocessed.pkl','wb')
pickle.dump((train,valid),f)