In [1]:
## DATA MANIPULATION
import numpy as np
import pickle

## CLASSIFICATION
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

### LOAD DATA AND GET LABELS

In [2]:
# Load the pickled (train, valid) pair. The context manager guarantees the
# file handle is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project's own preprocessing step.
with open('../data/preprocessed.pkl', 'rb') as f:
    train, valid = pickle.load(f)
# Every column from the third one onward is used as a label column below.
labels = train.columns[2:]

### GloVe vectorizer

In [3]:
## BUILD GLOVE EMBEDDINGS DICTIONARY
# Each line is parsed as: a token followed by whitespace-separated vector
# components. The result maps token -> float32 numpy vector.
embeddings_dict = dict()
# The context manager closes the file even if a line fails to parse.
with open(r'../data/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vec
print('Extracted {} word vectors'.format(len(embeddings_dict)))

Extracted 400000 word vectors


In [4]:
## SKLEARN COMPATIBLE GLOVE VECTORIZER TRANSFORMER 
class gloveVectorizer(object):
    """Transformer that represents a text as the mean of its word vectors.

    Exposes ``fit`` / ``transform`` so it can be used like a scikit-learn
    transformer step.

    Parameters
    ----------
    embeddings_dict : dict
        Mapping from word to a 1-D numeric vector. Must be non-empty, because
        the vector dimensionality is read from its first entry.
    """

    def __init__(self, embeddings_dict):
        self.embeddings_dict = embeddings_dict
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(embeddings_dict[next(iter(embeddings_dict))])

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so that both ``fit(X)`` and ``fit(X, y)`` work.
        """
        return self

    def sentence2vec(self, s):
        '''
        Input:
        Sentence string

        Transformations:
        Get vector for each known word -> Average vectors
        (words missing from the embeddings dictionary are ignored)

        Output:
        Vector for sentence; a zero vector of length ``self.dim`` when the
        sentence has no word present in the embeddings dictionary
        '''
        # Explicit membership test instead of a bare ``except`` so that
        # unrelated errors are not silently swallowed.
        vectors = [self.embeddings_dict[w]
                   for w in s.split()
                   if w in self.embeddings_dict]
        if not vectors:
            # Guard up front: averaging an empty array would yield NaN and
            # emit a RuntimeWarning.
            return np.zeros(self.dim)
        return np.array(vectors).mean(axis=0)

    def transform(self, X):
        """Vectorize every text in ``X``.

        Parameters
        ----------
        X : iterable of str
            For example a pandas Series or a list of sentences.

        Returns
        -------
        numpy.ndarray
            2-D array with one row per text and ``self.dim`` columns.
        """
        # Plain iteration works for a pandas Series as well as for lists.
        return np.stack([self.sentence2vec(s) for s in X], axis=0)

In [5]:
# Build the vectorizer once, then turn the comment text of each split
# (train first, then valid) into a matrix of averaged word vectors.
glove = gloveVectorizer(embeddings_dict)
X_train, X_valid = (
    glove.transform(split['comment_text'])
    for split in (train, valid)
)

### SVM

In [6]:
## CREATE A MODEL FOR EACH CATEGORY
# One independent binary LinearSVC is trained and scored per label column.
for label in labels:
    # Look the label columns up once instead of on every metric call
    y_train = train[label].values
    y_valid = valid[label]
    # Create and fit model (fixed seed so re-running the cell is reproducible)
    m = LinearSVC(class_weight='balanced', random_state=42)
    m.fit(X_train, y_train)
    # Get predictions
    preds = m.predict(X_valid)
    # Evaluate predictions
    print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
        label,
        accuracy_score(y_valid, preds),
        precision_score(y_valid, preds),
        recall_score(y_valid, preds),
        f1_score(y_valid, preds)))

Results for toxic comments: Accuracy - 0.90; Precision - 0.50; Recall - 0.85; F1 - 0.63
Results for severe_toxic comments: Accuracy - 0.96; Precision - 0.17; Recall - 0.89; F1 - 0.29
Results for obscene comments: Accuracy - 0.94; Precision - 0.46; Recall - 0.86; F1 - 0.60
Results for threat comments: Accuracy - 0.97; Precision - 0.06; Recall - 0.74; F1 - 0.12
Results for insult comments: Accuracy - 0.92; Precision - 0.38; Recall - 0.87; F1 - 0.53
Results for identity_hate comments: Accuracy - 0.94; Precision - 0.11; Recall - 0.85; F1 - 0.19


### LOGISTIC REGRESSION

In [7]:
## CREATE A MODEL FOR EACH CATEGORY
# One independent binary LogisticRegression is trained and scored per label column.
for label in labels:
    # Look the label columns up once instead of on every metric call
    y_train = train[label]
    y_valid = valid[label]
    # Create & Fit model (the saga solver shuffles the data, so fix the seed
    # to make re-running the cell reproducible)
    m = LogisticRegression(solver='saga', class_weight='balanced', random_state=42)
    m.fit(X_train, y_train)
    # Get predictions
    preds = m.predict(X_valid)
    # Evaluate predictions
    print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
        label,
        accuracy_score(y_valid, preds),
        precision_score(y_valid, preds),
        recall_score(y_valid, preds),
        f1_score(y_valid, preds)))

Results for toxic comments: Accuracy - 0.90; Precision - 0.49; Recall - 0.85; F1 - 0.62
Results for severe_toxic comments: Accuracy - 0.94; Precision - 0.13; Recall - 0.94; F1 - 0.23
Results for obscene comments: Accuracy - 0.93; Precision - 0.43; Recall - 0.87; F1 - 0.58
Results for threat comments: Accuracy - 0.97; Precision - 0.07; Recall - 0.74; F1 - 0.12
Results for insult comments: Accuracy - 0.92; Precision - 0.38; Recall - 0.88; F1 - 0.53
Results for identity_hate comments: Accuracy - 0.91; Precision - 0.09; Recall - 0.91; F1 - 0.16


### XGBOOST

In [8]:
## CREATE A MODEL FOR EACH CATEGORY
# One independent binary XGBClassifier is trained and scored per label column.
for label in labels:
    # Look the label columns up once instead of on every use
    y_train = train[label]
    y_valid = valid[label]
    # Class counts via the vectorized .sum(); the builtin sum() would walk
    # the Series element by element in Python.
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()
    # Create & Fit model; positives are re-weighted by the negative/positive
    # ratio, and the seed is fixed so re-running the cell is reproducible.
    m = XGBClassifier(n_estimators=100,
                      scale_pos_weight=n_negative / n_positive,
                      n_jobs=-1,
                      random_state=42)
    m.fit(X_train, y_train)
    # Get predictions
    preds = m.predict(X_valid)
    # Evaluate predictions
    print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
        label,
        accuracy_score(y_valid, preds),
        precision_score(y_valid, preds),
        recall_score(y_valid, preds),
        f1_score(y_valid, preds)))

Results for toxic comments: Accuracy - 0.94; Precision - 0.65; Recall - 0.75; F1 - 0.70
Results for severe_toxic comments: Accuracy - 0.99; Precision - 0.43; Recall - 0.43; F1 - 0.43
Results for obscene comments: Accuracy - 0.97; Precision - 0.75; Recall - 0.73; F1 - 0.74
Results for threat comments: Accuracy - 1.00; Precision - 0.48; Recall - 0.28; F1 - 0.36
Results for insult comments: Accuracy - 0.96; Precision - 0.63; Recall - 0.68; F1 - 0.66
Results for identity_hate comments: Accuracy - 0.99; Precision - 0.52; Recall - 0.36; F1 - 0.42
