In [1]:
## DATA MANIPULATION
import numpy as np, pandas as pd
import pickle

## CLASSIFICATION
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

### Load data & grab labels

In [2]:
# Load the preprocessed (train, valid) pair; the `with` block closes the file
# handle as soon as unpickling finishes (the bare open() used to leak it).
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/preprocessed.pkl', 'rb') as f:
    train, valid = pickle.load(f)

# Every column from position 2 onward is treated as a target label.
labels = train.columns[2:]

### Convert pretrained word2vec embeddings (binary format) to text file using gensim.

In [3]:
## Convert word2vec to .txt file
# from gensim.models.keyedvectors import KeyedVectors
# model = KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)
# model.save_word2vec_format('../data/GoogleNews-vectors-negative300.txt', binary=False)

### Extract embedding vectors from word2vec

In [4]:
## BUILD WORD2VEC EMBEDDINGS DICTIONARY
# Maps each token (first field of a line) to its float32 vector (remaining fields).
embeddings_dict = dict()
# `with` guarantees the handle is closed even if a line fails to parse.
with open(r'../data/GoogleNews-vectors-negative300.txt', encoding='utf8') as f:
    for line_no, line in enumerate(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        if line_no == 0 and len(values) == 2 and values[0].isdigit() and values[1].isdigit():
            # The word2vec text format begins with a "<vocab_size> <vector_size>"
            # header; without this guard it is stored as a bogus one-dimensional
            # "word" that breaks averaging for any sentence containing that token.
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vec
print('Extracted {} word vectors'.format(len(embeddings_dict)))

Extracted 3000000 word vectors


### Word2vec vectorizer

In [5]:
## SKLEARN COMPATIBLE WORD2VEC VECTORIZER TRANSFORMER
class w2cVectorizer(object):
    """Embed sentences as the average of their word vectors.

    Parameters
    ----------
    embeddings_dict : mapping of str -> np.ndarray
        Lookup table from a token to its embedding vector.
    dim : int, default 300
        Length of the zero vector returned for a sentence in which no token
        has an embedding. It should equal the length of the vectors stored in
        ``embeddings_dict`` so that ``transform`` can stack all rows.
    """

    def __init__(self, embeddings_dict, dim=300):
        self.embeddings_dict = embeddings_dict
        # If a text has no known words we return a vector of zeros with the
        # same dimensionality as all the other vectors.
        self.dim = dim

    def fit(self, X, y=None):
        """No-op fit (nothing is learned); returns ``self``.

        ``y`` is optional so the object can be fitted without targets.
        """
        return self

    def sentence2vec(self, s):
        '''
        Input:
        Sentence string

        Transformations:
        Split on whitespace -> look up a vector for each word (words missing
        from the embeddings are skipped) -> average the vectors

        Output:
        Vector for sentence; a zero vector of length ``self.dim`` when none of
        the words has an embedding (including the empty string)
        '''
        vectors = []
        for w in s.split():
            try:
                vectors.append(self.embeddings_dict[w])
            except KeyError:
                # Out-of-vocabulary word: ignore it. Any other error is a real
                # problem and is allowed to propagate.
                continue
        if not vectors:
            # Explicit guard instead of averaging an empty array, which emits
            # a RuntimeWarning and yields NaN.
            return np.zeros(self.dim)
        return np.mean(np.array(vectors), axis=0)

    def transform(self, X):
        """Vectorize every sentence in ``X``.

        ``X`` may be any iterable of strings (pandas Series, list, ...).
        Returns a 2-D array with one row per sentence; an empty input gives an
        array of shape ``(0, self.dim)``.
        """
        rows = [self.sentence2vec(s) for s in X]
        if not rows:
            return np.empty((0, self.dim))
        return np.stack(rows, axis=0)

In [6]:
# One shared vectorizer embeds the comment text of both splits
# (train first, then valid).
w2c = w2cVectorizer(embeddings_dict)
X_train, X_valid = (w2c.transform(split['comment_text']) for split in (train, valid))

### Loop through Logistic Regression, SVM, XGBoost

In [7]:
## CREATE RESULTS TABLE
# Rows are collected in a plain list and turned into a DataFrame once, after the
# loop: DataFrame.append was removed in pandas 2.0 and copying the frame on every
# iteration is quadratic.
result_rows = []

## CREATE MODELS
# XGBoost has no shared base estimator because its scale_pos_weight depends on
# the class balance of each label; it is built inside the loop instead.
models = {
    'Logistic Regression': LogisticRegression(solver='saga',class_weight='balanced'),
    'SVM': LinearSVC(class_weight='balanced'),
    'XGBoost': None}

## LOOP THROUGH MODELS
for m_label, base_model in models.items():
    for label in labels:
        y_train, y_valid = train[label], valid[label]

        if m_label == 'XGBoost':
            # Weight positives by the negative/positive ratio of this label.
            m = XGBClassifier(n_estimators=100,
                              scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
                              n_jobs=-1)
        else:
            # clone base model (fresh, unfitted copy for every label)
            m = clone(base_model)

        # Fit model
        m.fit(X_train, y_train)

        # Get predictions
        preds = m.predict(X_valid)

        # Evaluate predictions
        acc = accuracy_score(y_valid, preds)
        prec = precision_score(y_valid, preds)
        recall = recall_score(y_valid, preds)
        f1 = f1_score(y_valid, preds)

        # Save results for this (model, label) pair
        result_rows.append({'Label': label,
                            'Accuracy': acc,
                            'Recall': recall,
                            'Precision': prec,
                            'F1': f1,
                            'Vectorizer': 'word2vec',
                            'model': m_label})

        # print results
        print('{0} Results for {1} comments: Accuracy - {2:.2f}; Precision - {3:.2f}; Recall - {4:.2f}; F1 - {5:.2f}'.format(
                                        m_label,
                                        label,
                                        acc,
                                        prec,
                                        recall,
                                        f1))

# Same column order as before so the saved CSV layout is unchanged.
results = pd.DataFrame(result_rows,
                       columns=['Label','Accuracy', 'Recall', 'Precision', 'F1', 'Vectorizer', 'model'])

Logistic Regression Results for toxic comments: Accuracy - 0.91; Precision - 0.51; Recall - 0.87; F1 - 0.64
Logistic Regression Results for severe_toxic comments: Accuracy - 0.95; Precision - 0.17; Recall - 0.90; F1 - 0.28
Logistic Regression Results for obscene comments: Accuracy - 0.94; Precision - 0.45; Recall - 0.88; F1 - 0.60
Logistic Regression Results for threat comments: Accuracy - 0.92; Precision - 0.03; Recall - 0.90; F1 - 0.06
Logistic Regression Results for insult comments: Accuracy - 0.92; Precision - 0.39; Recall - 0.88; F1 - 0.54
Logistic Regression Results for identity_hate comments: Accuracy - 0.93; Precision - 0.10; Recall - 0.88; F1 - 0.17
SVM Results for toxic comments: Accuracy - 0.91; Precision - 0.52; Recall - 0.86; F1 - 0.65
SVM Results for severe_toxic comments: Accuracy - 0.95; Precision - 0.16; Recall - 0.90; F1 - 0.27
SVM Results for obscene comments: Accuracy - 0.94; Precision - 0.46; Recall - 0.87; F1 - 0.61
SVM Results for threat comments: Accuracy - 0.95

In [8]:
## SAVE RESULTS
# Write the metrics table to CSV; the DataFrame index is left out of the file.
results.to_csv(path_or_buf='../artifacts/word2vec.csv', index=False)