In [1]:
## DATA MANIPULATION
import numpy as np
import pickle

## CLASSIFICATION
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

### LOAD DATA AND GET LABELS

In [2]:
# Load the preprocessed train/validation frames.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project's own preprocessing step.
with open('../data/preprocessed.pkl', 'rb') as f:
    # The context manager closes the handle even if unpickling raises.
    train, valid = pickle.load(f)
# The label columns are every column from the third one onward.
labels = train.columns[2:]

### GloVe vectorizer

In [3]:
## BUILD GLOVE EMBEDDINGS DICTIONARY
# Maps each word (first token of a line) to its float32 vector (remaining tokens).
embeddings_dict = dict()
# `with` guarantees the file is closed even if a malformed line raises mid-parse.
with open(r'../data/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vec
print('Extracted {} word vectors'.format(len(embeddings_dict)))

Extracted 400000 word vectors


In [4]:
## SKLEARN COMPATIBLE GLOVE VECTORIZER TRANSFORMER 
class gloveVectorizer(object):
    """Transformer that embeds a text as the mean of its word vectors.

    Exposes ``fit`` / ``transform`` so it can be used like a scikit-learn
    transformer. Words that are missing from ``embeddings_dict`` are ignored;
    a text with no known words maps to a vector of zeros.
    """

    def __init__(self, embeddings_dict):
        """
        Input:
        embeddings_dict -- mapping from word to its embedding vector
        """
        self.embeddings_dict = embeddings_dict
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(embeddings_dict[next(iter(embeddings_dict))])

    def fit(self, X, y=None):
        """Nothing is learned; returns self. ``y`` is optional and ignored."""
        return self

    def sentence2vec(self, s):
        '''
        Input:
        Sentence string

        Transformations:
        Get vector for each word -> Average vectors

        Output:
        Vector for sentence (zeros of length ``self.dim`` when no word of
        the sentence is in the embeddings dictionary)
        '''
        vectors = []
        for w in s.split():
            try:
                vectors.append(self.embeddings_dict[w])
            except KeyError:
                # Out-of-vocabulary word: skip it. Only KeyError is caught so
                # that unrelated errors are not silently swallowed.
                continue
        if not vectors:
            # Explicit guard instead of averaging an empty array, which would
            # emit a RuntimeWarning and produce NaN.
            return np.zeros(self.dim)
        return np.array(vectors).mean(axis=0)

    def transform(self, X):
        """
        Input:
        X -- iterable of sentence strings (e.g. a pandas Series or a list)

        Output:
        2-D array with one row per sentence and ``self.dim`` columns
        """
        rows = [self.sentence2vec(text) for text in X]
        if not rows:
            # np.stack rejects an empty sequence; return an empty matrix with
            # the right number of columns instead.
            return np.zeros((0, self.dim))
        return np.stack(rows, axis=0)

In [5]:
# Embed the comment text of both splits with a single shared vectorizer.
glove = gloveVectorizer(embeddings_dict)
X_train, X_valid = (
    glove.transform(frame['comment_text']) for frame in (train, valid)
)

### SVM

In [6]:
## CREATE A MODEL FOR EACH CATEGORY
# One independent binary classifier is trained and scored per label column.
for label in labels:
    y_valid = valid[label]
    # Create and fit model.
    # random_state pins the solver's data shuffling so that re-running the
    # cell reproduces the same scores.
    m = LinearSVC(class_weight='balanced', random_state=42)
    m.fit(X_train, train[label].values)
    # Get predictions
    preds = m.predict(X_valid)
    # Evaluate predictions
    print('Validation accuracy for {0} comments is {1:.2f}, with precision score of {2:.2f} and recall score of {3:.2f}'.format(
                                    label, 
                                    accuracy_score(y_valid, preds), 
                                    precision_score(y_valid, preds), 
                                    recall_score(y_valid, preds)))

Validation accuracy for toxic comments is 0.89, with precision score of 0.46 and recall score of 0.83
Validation accuracy for severe_toxic comments is 0.95, with precision score of 0.14 and recall score of 0.87
Validation accuracy for obscene comments is 0.92, with precision score of 0.38 and recall score of 0.84
Validation accuracy for threat comments is 0.96, with precision score of 0.05 and recall score of 0.77
Validation accuracy for insult comments is 0.91, with precision score of 0.35 and recall score of 0.85
Validation accuracy for identity_hate comments is 0.93, with precision score of 0.10 and recall score of 0.86


### LOGISTIC REGRESSION

In [7]:
## CREATE A MODEL FOR EACH CATEGORY
# One independent binary classifier is trained and scored per label column.
for label in labels:
    y_valid = valid[label]
    # Create & Fit model.
    # The 'saga' solver shuffles the data, so random_state is fixed to make
    # the reported scores reproducible across runs.
    m = LogisticRegression(solver='saga', class_weight='balanced', random_state=42)
    m.fit(X_train, train[label])
    # Get predictions
    preds = m.predict(X_valid)
    # Evaluate predictions
    print('Validation accuracy for {0} comments is {1:.2f}, with precision score of {2:.2f} and recall score of {3:.2f}'.format(
                                    label, 
                                    accuracy_score(y_valid, preds), 
                                    precision_score(y_valid, preds), 
                                    recall_score(y_valid, preds)))

Validation accuracy for toxic comments is 0.89, with precision score of 0.45 and recall score of 0.84
Validation accuracy for severe_toxic comments is 0.88, with precision score of 0.07 and recall score of 0.92
Validation accuracy for obscene comments is 0.91, with precision score of 0.37 and recall score of 0.85
Validation accuracy for threat comments is 0.92, with precision score of 0.03 and recall score of 0.84
Validation accuracy for insult comments is 0.91, with precision score of 0.34 and recall score of 0.85
Validation accuracy for identity_hate comments is 0.89, with precision score of 0.07 and recall score of 0.89


### XGBOOST

In [8]:
## CREATE A MODEL FOR EACH CATEGORY
# One independent binary classifier is trained and scored per label column.
for label in labels:
    y_train, y_valid = train[label], valid[label]
    # Count classes with the vectorized Series.sum(); the builtin sum()
    # iterates the Series element by element in Python.
    n_neg = int((y_train == 0).sum())
    n_pos = int((y_train == 1).sum())
    # Weight positives by the negative/positive ratio to offset class
    # imbalance; fall back to a neutral weight of 1 if a label has no
    # positive training example, which would otherwise divide by zero.
    pos_weight = n_neg / n_pos if n_pos else 1.0
    # Create & Fit model
    m = XGBClassifier(n_estimators=100,
                      scale_pos_weight=pos_weight,
                      n_jobs=-1)
    m.fit(X_train, y_train)
    # Get predictions
    preds = m.predict(X_valid)
    # Evaluate predictions
    print('Validation accuracy for {0} comments is {1:.2f}, with precision score of {2:.2f} and recall score of {3:.2f}'.format(
                                    label, 
                                    accuracy_score(y_valid, preds), 
                                    precision_score(y_valid, preds), 
                                    recall_score(y_valid, preds)))

Validation accuracy for toxic comments is 0.93, with precision score of 0.61 and recall score of 0.72
Validation accuracy for severe_toxic comments is 0.99, with precision score of 0.35 and recall score of 0.42
Validation accuracy for obscene comments is 0.97, with precision score of 0.68 and recall score of 0.70
Validation accuracy for threat comments is 1.00, with precision score of 0.58 and recall score of 0.29
Validation accuracy for insult comments is 0.96, with precision score of 0.58 and recall score of 0.67
Validation accuracy for identity_hate comments is 0.99, with precision score of 0.41 and recall score of 0.34
