In [1]:
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np

In [112]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import torch.nn.functional as F

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize 
import math
import pandas

## Loading Data

In [5]:
data = pandas.read_csv('./../sexism-data.csv')

In [137]:
new_data=data[data['scores']==1]

In [138]:
new_data

Unnamed: 0,texts,scores,class
3,Trying to convince Adam to make me some flower...,1,2
7,Women have to work much harder to make it in t...,1,2
9,Whenever my daughter and I talk about marriage...,1,2
10,'chef Heston Blumenthal has claimed that femal...,1,1
20,Ushna shah the Pakistani superstar caught hers...,1,2
...,...,...,...
5332,SLUT How would you define the word? This four ...,1,2
5336,"Slut shaming can happen to anyone, whether the...",1,2
5431,Bloody men are like buses- You wait for about ...,1,2
5462,"To the men that tell women to smile more, ligh...",1,2


In [108]:
temp_data=data[data['scores']==0]
temp_data=temp_data[0:500]

In [109]:
data=new_data.append(temp_data)

In [115]:
data

Unnamed: 0,texts,scores,class
0,"So begin today. Each criticism that you face, ...",0,0
1,It's Sunday. Y'all dirty minded people should ...,0,0
2,Whoever decided the phrase smash your backdoor...,0,0
3,Trying to convince Adam to make me some flower...,1,2
4,i wore this dress last year but now its too sh...,0,0
...,...,...,...
5539,Follow us for more Feminist stories & quotes. . .,0,0
5540,rich white kids - - - - - - - - - - sivememes,0,0
5541,Got rid of the drabby brown and those shiney r...,0,0
5542,piggy longevily starred at me twice as I passe...,0,0


In [116]:
train_data, test_data = train_test_split(data)

In [117]:
words = Counter()
word2idx = {}
idx2word = {}

def tokenizeText(sentence):
    tokens = word_tokenize(sentence)
    
    return tokens

def sent2idx(split_text):
    sent2idx = []
    for w in split_text:
        if w.lower() in word2idx:
            sent2idx.append(word2idx[w.lower()])
        else:
            sent2idx.append(word2idx['_UNK'])
            
    return sent2idx

def processTextData(df,isTrain):
    global words
    global word2idx
    global idx2word
    df = df.copy()
    
    df['tokenized'] = df.texts.apply(lambda x: (tokenizeText(x.lower())))
    
    if isTrain:
        for sent in tqdm(df.tokenized.values):
            words.update(w for w in sent)

        words = sorted(words, key=words.get, reverse=True)
        words = ['_PAD','_UNK'] + words

        word2idx = {o:i for i,o in enumerate(words)}
        idx2word = {i:o for i,o in enumerate(words)}
        
    df['vectorized'] = df.texts.apply(lambda x: sent2idx(x))
    
    return df

In [118]:
train_data = processTextData(train_data,True)
test_data = processTextData(test_data,False)

100%|██████████| 4158/4158 [00:00<00:00, 80125.68it/s]


In [119]:
def label(score):
    l=[0,0]
    l[score]=1
    return l

In [120]:
train_data['label']=train_data['scores'].apply(label)
test_data['label']=test_data['scores'].apply(label)

In [122]:
train_data

Unnamed: 0,texts,scores,class,tokenized,vectorized,label
4449,"If youre not calling yourself a feminist, aka ...",0,0,"[if, youre, not, calling, yourself, a, feminis...","[8, 2616, 1, 1118, 1070, 445, 1115, 690, 1, 81...","[1, 0]"
4901,"So, your neighbor was left to raise kids all b...",0,0,"[so, ,, your, neighbor, was, left, to, raise, ...","[373, 1070, 3, 1, 1118, 1070, 445, 1115, 1, 81...","[1, 0]"
1640,"I dress in revealing clothes, I post 'sexy' se...",1,2,"[i, dress, in, revealing, clothes, ,, i, post,...","[8, 1, 1469, 1115, 690, 373, 373, 1, 8, 812, 1...","[0, 1]"
5338,"Mommy Shark, Daddy Shark, Baby Sharks",0,0,"[mommy, shark, ,, daddy, shark, ,, baby, sharks]","[966, 1070, 966, 966, 1118, 1, 373, 2674, 7, 1...","[1, 0]"
4659,"YouTuber, author & upcoming late night TV host...",0,0,"[youtuber, ,, author, &, upcoming, late, night...","[1118, 1070, 445, 1090, 445, 1958, 690, 1115, ...","[1, 0]"
...,...,...,...,...,...,...
2787,GbmtiF GU]RWk gIHSLw ]u[LJC tpe`VQ ryfhJH \upP...,0,0,"[gbmtif, gu, ], rwk, gihslw, ], u, [, ljc, tpe...","[4253, 1958, 966, 1090, 8, 2616, 1, 4253, 445,...","[1, 0]"
1369,Working to shine lights on Webs of patriarchy....,0,0,"[working, to, shine, lights, on, webs, of, pat...","[6370, 1070, 1115, 3176, 8, 812, 4253, 1, 1090...","[1, 0]"
2355,Mark your calendar for Dare You Take a Walk in...,0,0,"[mark, your, calendar, for, dare, you, take, a...","[966, 7, 1115, 3176, 1, 1118, 1070, 445, 1115,...","[1, 0]"
2229,My face when I leave school in less than month...,0,0,"[my, face, when, i, leave, school, in, less, t...","[966, 1118, 1, 2616, 7, 1415, 690, 1, 6370, 26...","[1, 0]"


In [123]:
class VectorizeData(Dataset):
    def __init__(self, df, maxlen=10):
        self.maxlen = maxlen
        self.df = df
        self.df['text_padded'] = self.df.vectorized.apply(lambda x: self.pad_data(x))
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        text = self.df.text_padded.values[idx]
        sexism_label = self.df.label.values[idx]
        sexism_type = self.df['class'].values[idx]
        return text,sexism_label,sexism_type
    
    def pad_data(self, s):
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: padded[:] = s[:self.maxlen]
        else: padded[:len(s)] = s
        return padded

In [124]:
trainDataset = VectorizeData(train_data)
testDataset = VectorizeData(test_data)

In [125]:
trainLoader = DataLoader(dataset=trainDataset, batch_size=100, shuffle=True)
testLoader = DataLoader(dataset=testDataset, batch_size=100, shuffle=False)

In [126]:
for i, samples in enumerate(trainLoader):
    print(i)
    print(samples[1])
    break

0
[tensor([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1]), tensor([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0])]



### Sentence to model input

In [130]:
def pad_data(s,maxlen):
        padded = np.zeros((maxlen,), dtype=np.int64)
        if len(s) > maxlen: padded[:] = s[:maxlen]
        else: padded[:len(s)] = s
        return padded

def sentToTensor(text,word2idx,vectors):    
    padded_vector = pad_data(sent2idx(tokenizeText(text)),10)
    
    return torch.tensor(padded_vector).reshape(1,-1)

# Extrapolating to MultiClass Problem

In [132]:
class VectorizeDataMultiClass(Dataset):
    def __init__(self, df, maxlen=10):
        self.maxlen = maxlen
        self.df = df
        self.df['text_padded'] = self.df.vectorized.apply(lambda x: self.pad_data(x))
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        text = self.df.text_padded.values[idx]
        sexism_label = self.df.scores.values[idx]
        sexism_type = self.df['class'].values[idx]
        
        if sexism_label == 0 and sexism_type == 0:
            return text,0
        if sexism_label == 1 and sexism_type == 1:
            return text,1
        if sexism_label == 1 and sexism_type == 2:
            return text,2
            
    def pad_data(self, s):
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: padded[:] = s[:self.maxlen]
        else: padded[:len(s)] = s
        return padded

In [133]:
trainDatasetMC = VectorizeDataMultiClass(train_data)
testDatasetMC = VectorizeDataMultiClass(test_data)
trainLoaderMC = DataLoader(dataset=trainDatasetMC, batch_size=100, shuffle=True)
testLoaderMC = DataLoader(dataset=testDatasetMC, batch_size=100, shuffle=False)

In [134]:
print('Multiclass data')

Multiclass data


In [135]:
for i, samples in enumerate(trainLoader):
    print(i)
    print(samples[0])
    print(samples[1])
    print(samples[2])
    break

0
tensor([[ 6370,  2674,  1070,  1070,  1941,   373,     2,     1,     7,   373],
        [    8,  2616,     1,  1118,  1070,   445,  1115,   690,     1,  1070],
        [16686,   445,   373,  1090,     1,  1469,  1115,  1070,  1941,  1941],
        [ 2674,     7,  3683,     8,   812,  4253,     1,  1090,  2674,   690],
        [  812,  1070,     1,  1090,  1115,   445,  1415,  3176,   690,  1115],
        [ 3176,     7,  2674,     7,     7,   812,     1,  2674,   445,   966],
        [    8,     1,  6370,     7,   373,     1,  2616,  1070,  1115,     1],
        [    8,  1090,   373,     1,  1941,  1070,   373,   373,     8,  1958],
        [ 1415,  2674,   690,  1415,  3176,     1,  1070,   445,  1090,     1],
        [ 1070,   812,     1,  1090,  2674,     8,   373,     1,  6370,   690],
        [  926,   690,  1090,   171,   373,     1,  2616,     8,   812,  1469],
        [ 2674,   690,     7,  1115,  1090,  1958,  1115,   690,     7,  3176],
        [    8,     1,   926,     8,  