In [1]:
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import torch.nn.functional as F

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize 
import math
import pandas

## Loading Data

In [4]:
data = pandas.read_csv('./../sexism-data.csv')

In [5]:
# new_data=data[data['scores']==1]

In [6]:
# new_data

In [7]:
# temp_data=data[data['scores']==0]
# temp_data=temp_data[0:500]

In [8]:
# data=new_data.append(temp_data)

In [9]:
data

Unnamed: 0,texts,scores,class
0,"So begin today. Each criticism that you face, ...",0,0
1,It's Sunday. Y'all dirty minded people should ...,0,0
2,Whoever decided the phrase smash your backdoor...,0,0
3,Trying to convince Adam to make me some flower...,1,2
4,i wore this dress last year but now its too sh...,0,0
5,Double standards can exist anywhere. Lets fix ...,0,0
6,"Before Sarah lost her phone, purse & memory",0,0
7,Women have to work much harder to make it in t...,1,2
8,my biooz BNTGU JC SUD IGBOA RMH FETN BTPKJ CBM...,0,0
9,Whenever my daughter and I talk about marriage...,1,2


In [10]:
train_data, test_data = train_test_split(data)

In [11]:
words = Counter()
word2idx = {}
idx2word = {}

def tokenizeText(sentence):
    tokens = word_tokenize(sentence)
    
    return tokens

def sent2idx(split_text):
    sent2idx = []
    for w in split_text:
        if w.lower() in word2idx:
            sent2idx.append(word2idx[w.lower()])
        else:
            sent2idx.append(word2idx['_UNK'])
            
    return sent2idx

def processTextData(df,isTrain):
    global words
    global word2idx
    global idx2word
    df = df.copy()
    
    df['tokenized'] = df.texts.apply(lambda x: (tokenizeText(x.lower())))
    
    if isTrain:
        for sent in tqdm(df.tokenized.values):
            words.update(w for w in sent)

        words = sorted(words, key=words.get, reverse=True)
        words = ['_PAD','_UNK'] + words

        word2idx = {o:i for i,o in enumerate(words)}
        idx2word = {i:o for i,o in enumerate(words)}
        
    df['vectorized'] = df.texts.apply(lambda x: sent2idx(x))
    
    return df

In [12]:
train_data = processTextData(train_data,True)
test_data = processTextData(test_data,False)

100%|██████████| 4158/4158 [00:00<00:00, 61544.68it/s]


In [13]:
def label(score):
    l=[0,0]
    l[score]=1
    return l

In [14]:
train_data['label']=train_data['scores'].apply(label)
test_data['label']=test_data['scores'].apply(label)

In [15]:
train_data

Unnamed: 0,texts,scores,class,tokenized,vectorized,label
2266,Still being surprised in 2019 about things dar...,0,0,"[still, being, surprised, in, 2019, about, thi...","[362, 1204, 8, 817, 817, 1, 1283, 791, 8, 892,...","[1, 0]"
5291,I am not perfect. I make mistakes. But when I ...,0,0,"[i, am, not, perfect, ., i, make, mistakes, .,...","[8, 1, 7, 954, 1, 892, 908, 1204, 1, 2721, 791...","[1, 0]"
281,"An Indian medical practitioner, social reforme...",0,0,"[an, indian, medical, practitioner, ,, social,...","[7, 892, 1, 8, 892, 1367, 8, 7, 892, 1, 954, 7...","[1, 0]"
4890,"The hindi text roughly translates as ""Neither ...",1,2,"[the, hindi, text, roughly, translates, as, ``...","[1204, 1780, 791, 1, 1780, 8, 892, 1367, 8, 1,...","[0, 1]"
4820,"No, its not a gag to cast a man as a woman, . ...",0,0,"[no, ,, its, not, a, gag, to, cast, a, man, as...","[892, 908, 3, 1, 8, 1204, 362, 1, 892, 908, 12...","[1, 0]"
2427,She Aint Fat Bruh...She Just A Lil Ticc!,0,0,"[she, aint, fat, bruh, ..., she, just, a, lil,...","[362, 1780, 791, 1, 7, 8, 892, 1204, 1, 3509, ...","[1, 0]"
5163,So let's open the window to ourselves. IKHTIYAAR,0,0,"[so, let, 's, open, the, window, to, ourselves...","[362, 908, 1, 817, 791, 1204, 173, 362, 1, 908...","[1, 0]"
5268,Did you miss the Speak Their Name event earlie...,0,0,"[did, you, miss, the, speak, their, name, even...","[1367, 8, 1367, 1, 1158, 908, 454, 1, 954, 8, ...","[1, 0]"
419,September 4. Shock Me Shirts Walk proudly whil...,0,0,"[september, 4., shock, me, shirts, walk, proud...","[362, 791, 2721, 1204, 791, 954, 1283, 791, 11...","[1, 0]"
2098,Well look what the cat dragged in!,0,0,"[well, look, what, the, cat, dragged, in, !]","[4894, 791, 817, 817, 1, 817, 908, 908, 2990, ...","[1, 0]"


In [16]:
class VectorizeData(Dataset):
    def __init__(self, df, maxlen=10):
        self.maxlen = maxlen
        self.df = df
        self.df['text_padded'] = self.df.vectorized.apply(lambda x: self.pad_data(x))
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        text = self.df.text_padded.values[idx]
        sexism_label = self.df.label.values[idx]
        sexism_type = self.df['class'].values[idx]
        return text,sexism_label,sexism_type
    
    def pad_data(self, s):
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: padded[:] = s[:self.maxlen]
        else: padded[:len(s)] = s
        return padded

In [17]:
trainDataset = VectorizeData(train_data)
testDataset = VectorizeData(test_data)

In [18]:
trainLoader = DataLoader(dataset=trainDataset, batch_size=100, shuffle=True)
testLoader = DataLoader(dataset=testDataset, batch_size=100, shuffle=False)

In [19]:
for i, samples in enumerate(trainLoader):
    print(i)
    print(samples[1])
    break

0
[tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 1, 1]), tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0])]


In [20]:
vectors = []

In [23]:
wordVectors = {}
with open("./glove.6B.300d.txt", 'rb') as f:
    for l in tqdm(f,total=400001):
        line = l.decode().split()
        word = line[0]
        
        vect = np.array(line[1:]).astype(np.float)
        wordVectors[word] = vect

100%|██████████| 400001/400001 [01:26<00:00, 4630.20it/s]


In [24]:
wordVectors['_UNK'] = wordVectors['unk']
for word in word2idx:
    if word in wordVectors:
        vectors.append(wordVectors[word])
    else:
        vectors.append(wordVectors['_UNK'])
        print(word)

passor
cc-curl
aludhukonde
*link
girlfriend/wife
alabama-birmingham
metoo
swishhhhhhhhh
favs
advice-
nylff
14.
tacenda
women/femmes
fuccin
77771799
rudegyal
-100-110cm/39.37
-take-it-anymore
watsup
cyberflashing
did'nt
girlslook
canallas
sexxxpertease
'real
aluvar
ankle/foot
playwright/poet
wrong..
certian
'dirty
1*shorts
kyokuformen
pd/ambulance
3tsp
memeatic
160x
xdbrp
theilluminatemvmnt
-60cm/23.62
upps
106.000
termin
thoughts/opinions
*freeshipping*
.conservative
verison
maleys
credibile
cx\rjf
feminazi
preboards
-38cm/14.96
15-34
rakhati
nilanjana
stilettos/'party
ciwr
*girl
.with
stephanies
pitaji
breven
sapnon
************
nbody
mid-thigh
gaslighting
rachel.kiddlevy
pre-shoot
berumur
edumay
*prize
yknow
t-swift
vngi
^kxwgy
.mama.streisand
musicvideo/microfilm
perverses
follow/
diagnosis/prognosis
lomdi
xxl-47
-45cm/17.71
day'to
newtoinstagram
skinny-shaming
doveri
ithst
desexualization
warfa
kalkha
overrrrrrrrrrrrrr
'someone
weponized
no4
eyes..
spizn
h.k
cenarios
dnn
'bloody
pa

'woman
deportados
wolf-whistle
f**k
nhbbwt
14th-storey
avere
seksuologen
tlabto
finger-licking-good
self-blame
bitch..
wouldve
whyvive.com
ymvgy^
*hurt*
.lamborghini
entsprach
.0official
ligma
jangid
.x
sodding
spedizioni24/48
53cm/20.9
.inc
thanx
aaaagggghhhhhh
***trigger
outfitday
***update***
molte
vmrshz
20181
formular
misgendered
elpgbi
nahhhh
automations
pornifed
parts~all
stereotypen
'was
lool
vulnerabilitys
hwth
'what
nah'sun
stehst
tag/dm
urlare
themen
babygros
millenial
fifa18
abuse/assault
escort/prostitute
josafat
dupptta*
_PAD
alienators
engelan
13.
oppressionthe
slutwalk
luenell
mensvoguemart.blogspot.com
womenwomen
hardik
*knew*
6364809777
bjp/rss
zosia
raped/molested/sexually/physically
disordered/psychopath/
quedes
2070-899
undone.the
.lukaniuk
micro-inequities
futile..
ya.stacey
2008.
.bitches
.mobeck
*continues
.burdge
popn
well-baby
diyan
andaaz
hanson-young
.bcn
management/leaders
32cm/12.59
40cm/15.74
girlchild
pheminist
thequeuellc.com
15.sec
gurrrrl
sexify
atrib


### Sentence to model input

In [25]:
def pad_data(s,maxlen):
        padded = np.zeros((maxlen,), dtype=np.int64)
        if len(s) > maxlen: padded[:] = s[:maxlen]
        else: padded[:len(s)] = s
        return padded

def sentToTensor(text,word2idx,vectors):    
    padded_vector = pad_data(sent2idx(tokenizeText(text)),10)
    
    return torch.tensor(padded_vector).reshape(1,-1)

In [26]:
sentToTensor('I am a test sentence.',word2idx,vectors)

tensor([[   8,   95,    7, 2141, 2273,    2,    0,    0,    0,    0]])

# Extrapolating to MultiClass Problem

In [27]:
class VectorizeDataMultiClass(Dataset):
    def __init__(self, df, maxlen=10):
        self.maxlen = maxlen
        self.df = df
        self.df['text_padded'] = self.df.vectorized.apply(lambda x: self.pad_data(x))
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        text = self.df.text_padded.values[idx]
        sexism_label = self.df.scores.values[idx]
        sexism_type = self.df['class'].values[idx]
        
        if sexism_label == 0 and sexism_type == 0:
            return text,0
        if sexism_label == 1 and sexism_type == 1:
            return text,1
        if sexism_label == 1 and sexism_type == 2:
            return text,2
            
    def pad_data(self, s):
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: padded[:] = s[:self.maxlen]
        else: padded[:len(s)] = s
        return padded

In [28]:
trainDatasetMC = VectorizeDataMultiClass(train_data)
testDatasetMC = VectorizeDataMultiClass(test_data)
trainLoaderMC = DataLoader(dataset=trainDatasetMC, batch_size=100, shuffle=True)
testLoaderMC = DataLoader(dataset=testDatasetMC, batch_size=100, shuffle=False)

In [29]:
print('Multiclass data')

Multiclass data


In [30]:
for i, samples in enumerate(trainLoader):
    print(i)
    print(samples[0])
    print(samples[1])
    print(samples[2])
    break

0
tensor([[ 362,  908,    1,  817,  791, 1204,  173,  362,    1,  954],
        [3509,  908, 1113,    1, 1204, 1780,  791,    1, 1204,    8],
        [ 362,  817,  454, 1204,    1,  362, 1780,    7,  954,    8],
        [1158,  908,  454,    1, 1606,    7,  892,    1, 1283,  791],
        [1606,  791, 1113,  791,    7,  817,  362,    1,    7, 1113],
        [   7,  817,  817,    1, 1113,    8, 7251, 1780, 1204,  362],
        [ 908,  954, 7251,    1, 3509, 1113,   15,    1,   17,    1],
        [4894, 1780,    7, 1204,    1,    7, 1113,  791,    1, 1158],
        [ 908,  892,  817, 1158,    1, 4894,    7,  892, 1204,  362],
        [3509, 1113,  908,  954,    1, 4894,    8,  892,  892,    8],
        [ 954,  791,  791, 1204,    1,  954, 1158,    1, 1283,    7],
        [1780,  908, 4894,    1, 1204, 1780,  791,    1, 1780,  791],
        [ 362,  908,    1,  362, 1204, 1158,  817,  791,    1,    2],
        [1204, 1113,    7,  892,  362,  817,    7, 1204,    8,  892],
        [ 817,  90