In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

  from pandas import Panel


In [8]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [9]:
from bs4 import BeautifulSoup
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text
train.cleaned_contents = train.cleaned_contents.apply(cleanText)
test.cleaned_contents = test.cleaned_contents.apply(cleanText)

In [10]:
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_contents']), tags=[r.Discrimination_Label]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_contents']), tags=[r.Discrimination_Label]), axis=1)

In [11]:
train.head()

Unnamed: 0,docid,cleaned_contents,Discrimination_Label
0,73277,sentence\n\n\t1.\tyou are charged as follows:\...,0
1,79776,"sentence\n\n\t1.\tjosefa kotobalavu, you were ...",1
2,75870,sentence\n\n1. the director of public prosecut...,1
3,79299,"sentence\n\n\t1.\tmohommed nabi ud- dean, you ...",1
4,80603,judgment of the court\n\nbackground\n\n[1] the...,0


In [12]:
train_tagged.values[30]

TaggedDocument(words=['sentence', '1.', 'imanueli', 'senikuba', 'you', 'have', 'been', 'found', 'guilty', 'and', 'convicted', 'of', 'the', 'following', 'offence', 'for', 'which', 'you', 'were', 'charged', 'statement', 'of', 'offence', 'rape', 'contrary', 'to', 'section', '207', 'and', 'and', 'of', 'the', 'crimes', 'act', 'no', '44', 'of', '2009.', 'particulars', 'of', 'offence', 'imanueli', 'senikuba', 'sometime', 'between', '14th', 'day', 'of', 'august', '2015', 'and', 'the', '31st', 'day', 'of', 'august', '2015', 'at', 'vitina', 'village', 'in', 'dogotuki', 'in', 'the', 'northern', 'division', 'penetrated', 'the', 'vulva', 'of', 'sl', 'child', 'under', 'the', 'age', 'of', '13', 'years', 'with', 'his', 'tongue', '2.', 'you', 'pleaded', 'not', 'guilty', 'to', 'the', 'charge', 'and', 'the', 'ensuing', 'trial', 'lasted', 'for', 'days', 'the', 'complainant', 'sl', 'amelia', 'the', 'grandmother', 'of', 'sl', 'and', 'police', 'officer', 'who', 'was', 'involved', 'with', 'the', 'arrest', 'an

In [13]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [15]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 647/647 [00:00<00:00, 697971.88it/s]


In [16]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 647/647 [00:00<00:00, 1176295.92it/s]
100%|██████████| 647/647 [00:00<00:00, 1544516.04it/s]
100%|██████████| 647/647 [00:00<00:00, 952849.26it/s]
100%|██████████| 647/647 [00:00<00:00, 1853630.25it/s]
100%|██████████| 647/647 [00:00<00:00, 1288563.48it/s]
100%|██████████| 647/647 [00:00<00:00, 1295947.80it/s]
100%|██████████| 647/647 [00:00<00:00, 1133075.03it/s]
100%|██████████| 647/647 [00:00<00:00, 828362.24it/s]
100%|██████████| 647/647 [00:00<00:00, 1241974.69it/s]
100%|██████████| 647/647 [00:00<00:00, 1352798.95it/s]
100%|██████████| 647/647 [00:00<00:00, 965734.76it/s]
100%|██████████| 647/647 [00:00<00:00, 1193891.20it/s]
100%|██████████| 647/647 [00:00<00:00, 1666900.91it/s]
100%|██████████| 647/647 [00:00<00:00, 1529715.16it/s]
100%|██████████| 647/647 [00:00<00:00, 1473243.59it/s]
100%|██████████| 647/647 [00:00<00:00, 1115836.63it/s]
100%|██████████| 647/647 [00:00<00:00, 848035.84it/s]
100%|██████████| 647/647 [00:00<00:00, 1265725.13it/s]
100%|█████████

CPU times: user 1min 17s, sys: 874 ms, total: 1min 18s
Wall time: 13.1 s


In [19]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [20]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.5555555555555556
Testing F1 score: 0.5262626262626262


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 647/647 [00:00<00:00, 1027144.09it/s]


In [22]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 647/647 [00:00<00:00, 744503.34it/s]
100%|██████████| 647/647 [00:00<00:00, 1302166.36it/s]
100%|██████████| 647/647 [00:00<00:00, 1296566.98it/s]
100%|██████████| 647/647 [00:00<00:00, 1650678.03it/s]
100%|██████████| 647/647 [00:00<00:00, 1662815.37it/s]
100%|██████████| 647/647 [00:00<00:00, 1026755.46it/s]
100%|██████████| 647/647 [00:00<00:00, 1137348.99it/s]
100%|██████████| 647/647 [00:00<00:00, 877089.43it/s]
100%|██████████| 647/647 [00:00<00:00, 1273446.59it/s]
100%|██████████| 647/647 [00:00<00:00, 1006197.51it/s]
100%|██████████| 647/647 [00:00<00:00, 1119519.26it/s]
100%|██████████| 647/647 [00:00<00:00, 1222944.88it/s]
100%|██████████| 647/647 [00:00<00:00, 1076870.91it/s]
100%|██████████| 647/647 [00:00<00:00, 1244823.25it/s]
100%|██████████| 647/647 [00:00<00:00, 1195995.90it/s]
100%|██████████| 647/647 [00:00<00:00, 1244823.25it/s]
100%|██████████| 647/647 [00:00<00:00, 1151342.68it/s]
100%|██████████| 647/647 [00:00<00:00, 1244823.25it/s]
100%|███████

CPU times: user 1min 40s, sys: 1.28 s, total: 1min 42s
Wall time: 23.6 s


In [23]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.691358024691358
Testing F1 score: 0.6943587105624143


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
