In [1]:
## DATA MANIPULATION
import pandas as pd
import numpy as np

## NLTK HELPER FUNCTIONS
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords

## CLASSIFICATION
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.svm import LinearSVC

In [2]:
## LOAD, CLEAN, SPLIT DATA
data = pd.read_csv('../data/train.csv')
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, is
# deprecated in pandas 2.x, and leaves `data` unchanged under copy-on-write.
data['comment_text'] = data['comment_text'].fillna("unknown")
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(data, random_state=42, test_size=0.33, shuffle=True)
X_train = train['comment_text']
X_valid = valid['comment_text']
print(X_train.shape, X_valid.shape)

(106912,) (52659,)


In [3]:
# Target label names: every column of `data` after the first two.
labels = data.columns[2:].tolist()

In [4]:
## BUILD GLOVE EMBEDDINGS DICTIONARY
# Maps each token in the GloVe file to its float32 embedding vector.
embeddings_dict = dict()
# The context manager closes the file even if a line fails to parse.
with open(r'../data/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of raising IndexError on values[0].
            continue
        # First field is the token; the remaining fields are the vector components.
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vec
print('Extracted {} word vectors'.format(len(embeddings_dict)))

Extracted 400000 word vectors


In [5]:
## FUNCTION THAT CONVERTS COMMENTS TO VEC
stop_words = stopwords.words('english')
# Set copy of the stop-word list so membership tests inside comment2vec are O(1);
# `stop_words` itself stays a list for any other code that uses it.
_stop_word_set = frozenset(stop_words)
# Length of the fallback zero vector; matches the glove.6B.300d embeddings loaded above.
EMBEDDING_DIM = 300

def comment2vec(s):
    """Embed a comment as the L2-normalised sum of its GloVe word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``; English stop
    words, non-alphabetic tokens and tokens missing from ``embeddings_dict``
    are dropped before summing.

    Parameters
    ----------
    s : object
        The comment; it is passed through ``str()`` before processing.

    Returns
    -------
    numpy.ndarray
        1-D unit-length vector, or ``np.zeros(EMBEDDING_DIM)`` when no token
        survives filtering or the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test replaces the bare ``except`` that used to hide
    # every error (not just missing keys) raised inside the lookup loop.
    vectors = [
        embeddings_dict[w]
        for w in tokens
        if w.isalpha() and w not in _stop_word_set and w in embeddings_dict
    ]
    if not vectors:
        return np.zeros(EMBEDDING_DIM)
    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid dividing by zero, which would yield a NaN vector downstream.
        return np.zeros(EMBEDDING_DIM)
    return v / norm

In [6]:
## CONVERT COMMENTS TO VEC
# Embed every comment, then stack the per-comment vectors along axis 0 so each
# split becomes a single 2-D array (one row per comment).
X_train_glove, X_valid_glove = (
    np.stack(comments.apply(comment2vec).values, axis=0)
    for comments in (X_train, X_valid)
)

In [7]:
# Display the training embedding matrix (the notebook renders the last expression).
X_train_glove

array([[-0.01483988,  0.0263314 , -0.0679364 , ..., -0.05361094,
        -0.02683323,  0.02075227],
       [-0.02131595,  0.01925481, -0.01717966, ...,  0.00667108,
        -0.01023916,  0.00571783],
       [-0.04826125, -0.01935945, -0.03799327, ...,  0.02294068,
        -0.02901444, -0.04871771],
       ...,
       [ 0.00313657,  0.02736013, -0.01489128, ..., -0.03033283,
         0.0095365 ,  0.0128214 ],
       [-0.00935701, -0.01064557,  0.00927812, ...,  0.04194981,
        -0.05753804, -0.00346779],
       [-0.02948582,  0.0032135 , -0.03273927, ..., -0.0174906 ,
         0.00701386,  0.04236213]])

In [136]:
## CREATE A MODEL FOR EACH CATEGORY
# One independent binary LinearSVC per label, trained on the GloVe comment
# vectors and scored on the validation split.
models = {}  # fitted classifier per label, kept so it is not lost on the next iteration
for label in labels:
    # Create and fit model
    # 'balanced' weights each class by n_samples / (n_classes * np.bincount(y)).
    # random_state pins the solver's internal shuffling so reruns reproduce the scores.
    m = LinearSVC(class_weight='balanced', random_state=42)
    m.fit(X_train_glove, train[label].values)
    models[label] = m
    # Get predictions
    preds = m.predict(X_valid_glove)
    # Evaluate predictions
    y_valid = valid[label]
    print('Validation accuracy for {0} comments is {1:.2f}, with precision score of {2:.2f} and recall score of {3:.2f}'.format(
                                    label,
                                    accuracy_score(y_valid, preds),
                                    precision_score(y_valid, preds),
                                    recall_score(y_valid, preds)))

Validation accuracy for toxic comments is 0.93, with precision score of 0.81 and recall score of 0.40
Validation accuracy for severe_toxic comments is 0.99, with precision score of 0.50 and recall score of 0.00
Validation accuracy for obscene comments is 0.96, with precision score of 0.81 and recall score of 0.39
Validation accuracy for threat comments is 1.00, with precision score of 0.00 and recall score of 0.00
Validation accuracy for insult comments is 0.96, with precision score of 0.76 and recall score of 0.32
Validation accuracy for identity_hate comments is 0.99, with precision score of 1.00 and recall score of 0.01
