In [1]:
import spacy
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import torch.nn.functional as F

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize 
import math
import pandas

## Loading Data

In [7]:
data = pandas.read_csv('./sexism-data.csv')

In [8]:
data

Unnamed: 0,texts,scores,class
0,"So begin today. Each criticism that you face, ...",0,0
1,It's Sunday. Y'all dirty minded people should ...,0,0
2,Whoever decided the phrase smash your backdoor...,0,0
3,Trying to convince Adam to make me some flower...,1,2
4,i wore this dress last year but now its too sh...,0,0
...,...,...,...
5539,Follow us for more Feminist stories & quotes. . .,0,0
5540,rich white kids - - - - - - - - - - sivememes,0,0
5541,Got rid of the drabby brown and those shiney r...,0,0
5542,piggy longevily starred at me twice as I passe...,0,0


In [9]:
train_data, test_data = train_test_split(data)

In [21]:
words = Counter()
word2idx = {}
idx2word = {}

def tokenizeText(sentence):
    tokens = word_tokenize(sentence)
    
    return tokens

def sent2idx(split_text):
    sent2idx = []
    for w in split_text:
        if w.lower() in word2idx:
            sent2idx.append(word2idx[w.lower()])
        else:
            sent2idx.append(word2idx['_UNK'])
            
    return sent2idx

def processTextData(df,isTrain):
    global words
    global word2idx
    global idx2word
    df = df.copy()
    
    df['tokenized'] = df.texts.apply(lambda x: (tokenizeText(x.lower())))
    
    if isTrain:
        for sent in tqdm(df.tokenized.values):
            words.update(w for w in sent)

        words = sorted(words, key=words.get, reverse=True)
        words = ['_PAD','_UNK'] + words

        word2idx = {o:i for i,o in enumerate(words)}
        idx2word = {i:o for i,o in enumerate(words)}
        
    df['vectorized'] = df.texts.apply(lambda x: sent2idx(x))
    
    return df

In [22]:
train_data = processTextData(train_data,True)
test_data = processTextData(test_data,False)

100%|██████████| 4158/4158 [00:00<00:00, 72578.01it/s]


In [23]:
train_data

Unnamed: 0,texts,scores,class,tokenized,vectorized
5504,Well his specialisation must be in the anatomy...,0,0,"[well, his, specialisation, must, be, in, the,...","[3883, 754, 939, 939, 1, 1836, 8, 396, 1, 396,..."
3769,r*pe i am so fucking sick and tired of victim...,0,0,"[r*pe, i, am, so, fucking, sick, and, tired, o...","[918, 241, 1767, 754, 1, 1, 8, 1, 7, 865, 1, 3..."
165,We were raised to believe that many things in ...,0,0,"[we, were, raised, to, believe, that, many, th...","[3883, 754, 1, 3883, 754, 918, 754, 1, 918, 7,..."
1302,"""Shine bright like a diamond!"" IKHTIYAAR",0,0,"[``, shine, bright, like, a, diamond, !, '', i...","[1, 396, 1836, 8, 841, 754, 1, 1434, 918, 8, 3..."
3545,The tragic and sad story of the boat that coul...,0,0,"[the, tragic, and, sad, story, of, the, boat, ...","[1156, 1836, 754, 1, 1156, 918, 7, 3165, 8, 12..."
...,...,...,...,...,...
3929,Time just flies by doesn't it?! Can't believe ...,0,0,"[time, just, flies, by, does, n't, it, ?, !, c...","[1156, 8, 865, 754, 1, 1, 394, 396, 1156, 1, 2..."
5310,"This is story number 811. This takes , , and ....",0,0,"[this, is, story, number, 811., this, takes, ,...","[1156, 1836, 8, 396, 1, 8, 396, 1, 396, 1156, ..."
4225,This guys IQ level is,0,0,"[this, guys, iq, level, is]","[1156, 1836, 8, 396, 1, 3165, 394, 1190, 396, ..."
2797,How many times do we have to share our stories...,0,0,"[how, many, times, do, we, have, to, share, ou...","[1836, 800, 3883, 1, 865, 7, 841, 1190, 1, 115..."


In [37]:
class VectorizeData(Dataset):
    def __init__(self, df, maxlen=10):
        self.maxlen = maxlen
        self.df = df
        self.df['text_padded'] = self.df.vectorized.apply(lambda x: self.pad_data(x))
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        text = self.df.text_padded.values[idx]
        sexism_label = self.df.scores.values[idx]
        sexism_type = self.df['class'].values[idx]
        return text,sexism_label,sexism_type
    
    def pad_data(self, s):
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: padded[:] = s[:self.maxlen]
        else: padded[:len(s)] = s
        return padded

In [38]:
trainDataset = VectorizeData(train_data)
testDataset = VectorizeData(test_data)

In [39]:
trainLoader = DataLoader(dataset=trainDataset, batch_size=100, shuffle=True)
testLoader = DataLoader(dataset=testDataset, batch_size=100, shuffle=False)

In [40]:
for i, samples in enumerate(trainLoader):
    print(i)
    print(samples[0])
    print(samples[1])
    print(samples[2])
    break

0
tensor([[   1,  394,  396, 1156,    1, 1767,  939,    7, 1190,    1],
        [3883,    7,    8, 1156,    1, 2633,  800,  918,    1,  865],
        [2633,  918,  800,  865,    1, 3883,    8, 1156, 1836,    1],
        [ 396,  800,    1, 3165,  918,    7, 1156,  754, 2633,  394],
        [3883,    7,  396,    1, 3883,  800,  841, 1320,  754,  918],
        [ 841,  800,    2,    1,    1,  394,  396, 1156,    1,  841],
        [ 800,    1,  841,    1,  754,    1, 1320,    1,    8,    1],
        [   8,    1, 1320,  800,  841, 1156,    1, 1320,  754, 1434],
        [1320,    8, 1320,    1, 1836,  754,    1,    1,  394,  396],
        [3883, 1836,    7, 1156,    1,    8,  396,    1, 3165,  800],
        [1320,    8,  396, 1156,  918,    8, 1434,  394, 1156,  754],
        [   8,    1, 1767,  394, 1156,    1, 1156, 1836,  754,    1],
        [ 939,    8, 2633,  754,    1,    8,  396,    1,    7,  865],
        [   7,    1,  939,    8, 1156, 1156,  939,  754,    1, 1156],
        [1836,    

In [45]:
vectors = []

In [46]:
wordVectors = {}
with open(f'./glove.6B.300d.txt', 'rb') as f:
    for l in tqdm(f,total=400001):
        line = l.decode().split()
        word = line[0]
        
        vect = np.array(line[1:]).astype(np.float)
        wordVectors[word] = vect


  0%|          | 0/400001 [00:00<?, ?it/s][A
  0%|          | 468/400001 [00:00<01:25, 4678.16it/s][A
  0%|          | 938/400001 [00:00<01:25, 4684.24it/s][A
  0%|          | 1411/400001 [00:00<01:24, 4696.34it/s][A
  0%|          | 1883/400001 [00:00<01:24, 4701.02it/s][A
  1%|          | 2355/400001 [00:00<01:24, 4706.15it/s][A
  1%|          | 2826/400001 [00:00<01:24, 4705.98it/s][A
  1%|          | 3297/400001 [00:00<01:24, 4704.33it/s][A
  1%|          | 3764/400001 [00:00<01:24, 4692.83it/s][A
  1%|          | 4236/400001 [00:00<01:24, 4700.19it/s][A
  1%|          | 4709/400001 [00:01<01:23, 4708.54it/s][A
  1%|▏         | 5182/400001 [00:01<01:23, 4713.69it/s][A
  1%|▏         | 5645/400001 [00:01<01:24, 4654.12it/s][A
  2%|▏         | 6119/400001 [00:01<01:24, 4679.13it/s][A
  2%|▏         | 6592/400001 [00:01<01:23, 4693.17it/s][A
  2%|▏         | 7065/400001 [00:01<01:23, 4702.85it/s][A
  2%|▏         | 7537/400001 [00:01<01:23, 4707.57it/s][A
  2%|▏     

 32%|███▏      | 128042/400001 [00:27<00:57, 4712.58it/s][A
 32%|███▏      | 128517/400001 [00:27<00:57, 4721.11it/s][A
 32%|███▏      | 128990/400001 [00:27<00:57, 4722.50it/s][A
 32%|███▏      | 129463/400001 [00:27<00:57, 4719.50it/s][A
 32%|███▏      | 129935/400001 [00:27<00:58, 4632.72it/s][A
 33%|███▎      | 130406/400001 [00:27<00:57, 4654.28it/s][A
 33%|███▎      | 130879/400001 [00:28<00:57, 4674.45it/s][A
 33%|███▎      | 131347/400001 [00:28<00:57, 4661.85it/s][A
 33%|███▎      | 131819/400001 [00:28<00:57, 4678.21it/s][A
 33%|███▎      | 132291/400001 [00:28<00:57, 4688.33it/s][A
 33%|███▎      | 132760/400001 [00:28<00:57, 4680.03it/s][A
 33%|███▎      | 133232/400001 [00:28<00:56, 4690.11it/s][A
 33%|███▎      | 133705/400001 [00:28<00:56, 4700.29it/s][A
 34%|███▎      | 134177/400001 [00:28<00:56, 4703.86it/s][A
 34%|███▎      | 134648/400001 [00:28<00:56, 4699.95it/s][A
 34%|███▍      | 135119/400001 [00:28<00:56, 4684.53it/s][A
 34%|███▍      | 135589/

 64%|██████▎   | 254135/400001 [00:54<00:31, 4703.05it/s][A
 64%|██████▎   | 254606/400001 [00:54<00:30, 4703.36it/s][A
 64%|██████▍   | 255079/400001 [00:54<00:30, 4710.77it/s][A
 64%|██████▍   | 255551/400001 [00:54<00:30, 4701.37it/s][A
 64%|██████▍   | 256022/400001 [00:54<00:30, 4696.22it/s][A
 64%|██████▍   | 256492/400001 [00:54<00:30, 4693.26it/s][A
 64%|██████▍   | 256962/400001 [00:55<00:30, 4686.15it/s][A
 64%|██████▍   | 257431/400001 [00:55<00:30, 4675.73it/s][A
 64%|██████▍   | 257899/400001 [00:55<00:30, 4664.19it/s][A
 65%|██████▍   | 258366/400001 [00:55<00:30, 4660.72it/s][A
 65%|██████▍   | 258833/400001 [00:55<00:30, 4659.20it/s][A
 65%|██████▍   | 259299/400001 [00:55<00:30, 4654.35it/s][A
 65%|██████▍   | 259765/400001 [00:55<00:30, 4654.12it/s][A
 65%|██████▌   | 260233/400001 [00:55<00:29, 4660.63it/s][A
 65%|██████▌   | 260701/400001 [00:55<00:29, 4666.07it/s][A
 65%|██████▌   | 261169/400001 [00:55<00:29, 4669.28it/s][A
 65%|██████▌   | 261644/

 95%|█████████▍| 379410/400001 [01:21<00:04, 4729.04it/s][A
 95%|█████████▍| 379888/400001 [01:21<00:04, 4744.01it/s][A
 95%|█████████▌| 380365/400001 [01:21<00:04, 4751.30it/s][A
 95%|█████████▌| 380841/400001 [01:21<00:04, 4746.53it/s][A
 95%|█████████▌| 381316/400001 [01:21<00:03, 4711.20it/s][A
 95%|█████████▌| 381788/400001 [01:22<00:04, 4475.55it/s][A
 96%|█████████▌| 382252/400001 [01:22<00:03, 4523.00it/s][A
 96%|█████████▌| 382725/400001 [01:22<00:03, 4581.90it/s][A
 96%|█████████▌| 383187/400001 [01:22<00:03, 4590.68it/s][A
 96%|█████████▌| 383657/400001 [01:22<00:03, 4622.93it/s][A
 96%|█████████▌| 384129/400001 [01:22<00:03, 4649.72it/s][A
 96%|█████████▌| 384603/400001 [01:22<00:03, 4675.38it/s][A
 96%|█████████▋| 385075/400001 [01:22<00:03, 4687.77it/s][A
 96%|█████████▋| 385549/400001 [01:22<00:03, 4700.91it/s][A
 97%|█████████▋| 386023/400001 [01:22<00:02, 4712.47it/s][A
 97%|█████████▋| 386495/400001 [01:23<00:02, 4694.17it/s][A
 97%|█████████▋| 386965/

In [48]:
wordVectors['_UNK'] = wordVectors['unk']
for word in word2idx:
    if word in wordVectors:
        vectors.append(wordVectors[word])
    else:
        vectors.append(wordVectors['_UNK'])
        print(word)

_PAD
..
slut-shaming
ikhtiyaar
.weheal
youve
shouldnt
leoratan.com
womxn
.account
.x
//
-he
.mlm
4.
stickers*
*selected
f*ck
||
sivememes
hahaha
+male
blackwomanvibes
shayeris
y'all
jerah
sanuco
-she
tshirt
dress-coded
.fsf
ejiro
.i
2019.
setelan
w/
.model
ok.
evulving.com
mansplaining
call/
0240801700
bernies
crysty
lifestye
patreon
9560336156
slut-shamed
unfollow
harga
.official
feminisim
lgbtqia
hadnt
panikonga
ungala
panirtha
thappu
gaslighting
.x.official
.s
storycomment
'not
favs
aubrie
.post
*if
igtv
f*ckboi
f*ckin
.didit
kahaan
'all
werent
paytm
alices
-pants
-bust
8.
assault/harassment/rape
itll
non-binary
inktober
chisom
6.
7.
'slut
trumpty
brexit
2015.
.wearstyle
bungle.krungle
~~~~~
.memeos
guys~~~~~~~~~~~~~~
.rights.feminists
dress-coding
selfies
tireport
him-
.of.odisha
yqyg
..is
catcalled
f*cking
tafiri
'if
linktree
post-
tweetchat
selfie
tbh
cwjobs
.mcg
.france
.world
sexualising
shamers
-60cm/23.62
-52cm/20.47
36cm/14.17
-38cm/14.96
vestimos
non-monogamy
.comment.secti

scrunchiess
650/-
m*an
g*rls
r*gina
sorry..
.wlw
20.10.2k19
20/10/2019
womxns
prestigiousness
'young
.emadi
.boyd.33
demeanours
some1
****
urselves
sadventure
basketcase
.a
vtours
yaaalll
bias-
16.
shutterstock
spunjbub
unilag
17.
'skipped
idk
.silencee
.smokeyy
.moonss
.judgmentt
.prevailss
.skulls
men..
breakup..then
men..girls
tipdo
.someone
selca
hanlim
.junho
over-sexualised
.hooray
author/musical
ceo/cofounder
weunlearn
contact.org
meghna.org
.saedirad
fnck
situationship
.a.white
.thank
+905364842891
~tiyana
catcall
auce
'geli
'alf
raubal
gelis
osirus
natashas
godsister
boysu
cyber-bullied
fat-shaming
skinny-shaming
writing/editing
especislly
ketaki
miocardialinferction
braincontusion
kazuosan
jyoji
newzealand
voldman
onahole
vaginahole
no4
no5
300more
mernpunk
muah
.pribisova
killstreak
you-
+rp
keontria
ziegenhorn
rdf-
14th-storey
uterly
diffferent
willexplore
doppiosenso
diamondsand
denhollander
jesus-speak
awarness
imageny
norskis
world-changing
feetish
eyworld
repostby
'ador


### Sentence to model input

In [1]:
def pad_data(s,maxlen):
        padded = np.zeros((maxlen,), dtype=np.int64)
        if len(s) > maxlen: padded[:] = s[:maxlen]
        else: padded[:len(s)] = s
        return padded

def sentToTensor(text,word2idx,vectors):    
    padded_vector = pad_data(sent2idx(tokenizeText(text)),10)
    
    return torch.tensor(padded_vector).reshape(1,-1)

In [58]:
sentToTensor('I am a test sentence.',word2idx,vectors)

tensor([[-0.1329,  0.1699, -0.1436,  ..., -0.2378,  0.1477,  0.6290],
        [ 0.4143, -0.1587,  0.4222,  ...,  0.2209,  0.3912,  0.5694],
        [-0.2971,  0.0940, -0.0967,  ...,  0.0597, -0.2285,  0.2960],
        ...,
        [ 0.3007, -0.4687, -0.2062,  ...,  0.4927, -0.1128, -0.2777],
        [ 0.3007, -0.4687, -0.2062,  ...,  0.4927, -0.1128, -0.2777],
        [ 0.3007, -0.4687, -0.2062,  ...,  0.4927, -0.1128, -0.2777]],
       dtype=torch.float64)

# Extrapolating to MultiClass Problem

In [2]:
class VectorizeDataMultiClass(Dataset):
    def __init__(self, df, maxlen=10):
        self.maxlen = maxlen
        self.df = df
        self.df['text_padded'] = self.df.vectorized.apply(lambda x: self.pad_data(x))
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        text = self.df.text_padded.values[idx]
        sexism_label = self.df.scores.values[idx]
        sexism_type = self.df['class'].values[idx]
        
        if sexism_label == 0 and sexism_type == 0:
            return text,0
        if sexism_label == 1 and sexism_type == 1:
            return text,1
        if sexism_label == 1 and sexism_type == 2:
            return text,2
            
    def pad_data(self, s):
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: padded[:] = s[:self.maxlen]
        else: padded[:len(s)] = s
        return padded

NameError: name 'Dataset' is not defined

In [None]:
trainDatasetMC = VectorizeDataMultiClass(train_data)
testDatasetMC = VectorizeDataMultiClass(test_data)
trainLoaderMC = DataLoader(dataset=trainDatasetMC, batch_size=100, shuffle=True)
testLoaderMC = DataLoader(dataset=testDatasetMC, batch_size=100, shuffle=False)

In [None]:
print('Multiclass data')

In [None]:
for i, samples in enumerate(trainLoader):
    print(i)
    print(samples[0])
    print(samples[1])
    print(samples[2])
    break