In [1]:
## DATA MANIPULATION
import pandas as pd
import numpy as np

## NLP HELPER FUNS
import nltk

## SKLEARN FUNS
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
## LOAD, CLEAN, SPLIT DATA
data = pd.read_csv('../data/train.csv')
# Replace missing comments with a placeholder token. The filled column is
# assigned back rather than filled through `data[...].fillna(inplace=True)`:
# that form is chained assignment on an intermediate object, which pandas 2.x
# warns about and which leaves `data` untouched once Copy-on-Write is enabled.
data['comment_text'] = data['comment_text'].fillna("unknown")
# Fixed seed so the 67/33 train/validation split is reproducible across runs.
train, valid = train_test_split(data, random_state=42, test_size=0.33, shuffle=True)
# Only the raw comment text is used as model input.
X_train = train['comment_text']
X_valid = valid['comment_text']
print(X_train.shape, X_valid.shape)

(106912,) (52659,)


In [3]:
## GET LABELS FOR EACH CATEGORY
# Every column from the third one onwards is treated as a target label.
# NOTE(review): presumably the two skipped columns are an id and
# `comment_text` -- confirm against the header of train.csv.
labels = data.columns[2:].tolist()

In [4]:
## BUILD GLOVE EMBEDDINGS DICTIONARY
# Maps each token to its float32 vector. Every non-blank line of the file is
# split on whitespace: the first field is the token, the rest are the
# vector components.
embeddings_dict = dict()
# The context manager closes the file even if a malformed line raises
# part-way through parsing.
with open(r'../data/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vec
print('Extracted {} word vectors'.format(len(embeddings_dict)))

Extracted 400000 word vectors


In [5]:
## SKLEARN COMPATIBLE GLOVE VECTORIZER TRANSFORMER
class gloveVectorizer(object):
    '''
    Transformer that represents each text as the average of the embedding
    vectors of its words.

    Exposes `fit` / `transform` so it can be used as a step in an sklearn
    Pipeline. Nothing is learned in `fit`; all information comes from the
    embeddings dictionary passed to the constructor.

    Parameters:
    embeddings_dict -- non-empty mapping from token to a 1-D numeric vector;
                       all vectors are expected to have the same length
    '''

    def __init__(self, embeddings_dict):
        self.embeddings_dict = embeddings_dict
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(embeddings_dict[next(iter(embeddings_dict))])
        # NLTK helper functions
        # Stored as a set so the per-token stop-word test is a hash lookup
        # instead of a scan over the whole stop-word list.
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.tokenizer = nltk.word_tokenize

    def fit(self, X, y=None):
        '''
        No-op: there is nothing to learn from the data.

        `y` is optional so the transformer can be fitted with or without
        labels; it is ignored either way.

        Output:
        self
        '''
        return self

    def sentence2vec(self, s):
        '''
        Input:
        Sentence string (any other object is converted with str())

        Transformations:
        Lower case -> Tokenize -> Remove stop words ->
        Remove non-words -> Lemmatize -> Get vector for each word ->
        Average vectors

        Output:
        Vector for sentence. A vector of zeros of length `self.dim` is
        returned when no token of the sentence survives the filters or is
        present in the embeddings dictionary.
        '''
        tokens = self.tokenizer(str(s).lower())
        words = [self.lemmatizer.lemmatize(w)
                 for w in tokens
                 if w not in self.stop_words and w.isalpha()]
        # Out-of-vocabulary words are skipped explicitly; no exception is
        # raised or swallowed for them.
        vectors = [self.embeddings_dict[w] for w in words if w in self.embeddings_dict]
        if not vectors:
            # Decide the empty case up front instead of averaging an empty
            # array, which yields NaN and emits a RuntimeWarning.
            return np.zeros(self.dim)
        return np.array(vectors).mean(axis=0)

    def transform(self, X):
        '''
        Input:
        Iterable of texts (pandas Series, list, tuple, ...)

        Output:
        2-D array with one row per text and `self.dim` columns; an empty
        input gives an array of shape (0, self.dim).
        '''
        rows = [self.sentence2vec(s) for s in X]
        if not rows:
            # np.stack refuses an empty sequence, so build the empty matrix directly.
            return np.zeros((0, self.dim))
        return np.stack(rows, axis=0)

In [6]:
## SKLEARN PIPELINE
pipeline = Pipeline([('GloVe Vectorizer', gloveVectorizer(embeddings_dict)), 
                     ('Linear SVM', LinearSVC(class_weight='balanced'))])

## EMBED THE TEXT ONCE
# The vectorizer's fit is a no-op and its transform never looks at the label,
# so the sentence vectors are identical for every category. Computing them
# once here, instead of inside pipeline.fit / pipeline.predict on every loop
# iteration, avoids repeating the tokenise/lemmatise/average pass per label.
vectorizer = pipeline.named_steps['GloVe Vectorizer']
classifier = pipeline.named_steps['Linear SVM']
train_vectors = vectorizer.fit(X_train, None).transform(X_train)
valid_vectors = vectorizer.transform(X_valid)

## CREATE A MODEL FOR EACH CATEGORY
for label in labels:
    # Refit the pipeline's own classifier step on the cached vectors; after
    # the loop `pipeline` therefore holds the model for the last label.
    classifier.fit(train_vectors, train[label].values)
    # Get predictions
    preds = classifier.predict(valid_vectors)
    # Evaluate predictions
    print('Validation accuracy for {0} comments is {1:.2f}, with precision score of {2:.2f} and recall score of {3:.2f}'.format(
                                    label, 
                                    accuracy_score(valid[label], preds), 
                                    precision_score(valid[label], preds), 
                                    recall_score(valid[label], preds)))

Validation accuracy for toxic comments is 0.89, with precision score of 0.46 and recall score of 0.83
Validation accuracy for severe_toxic comments is 0.95, with precision score of 0.14 and recall score of 0.87
Validation accuracy for obscene comments is 0.92, with precision score of 0.38 and recall score of 0.84
Validation accuracy for threat comments is 0.96, with precision score of 0.05 and recall score of 0.78
Validation accuracy for insult comments is 0.91, with precision score of 0.35 and recall score of 0.85
Validation accuracy for identity_hate comments is 0.92, with precision score of 0.09 and recall score of 0.86
