In [1]:
import pandas as pd
import numpy as np
import torch
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import torch.autograd as autograd
import tqdm

In [2]:
from csv_handle import save_csv

In [19]:
source_file="data/facebook.csv"
version = "_fb"
train_filename, valid_filename, test_filename = save_csv(source_file=source_file, version=version, k=50000)

In [20]:
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

In [21]:
tv_datafields = [("comment_text", TEXT),
                 ("gender", LABEL)]

trn, vld = TabularDataset.splits(
        path="", # the root directory where the data lies
        train=train_filename, validation=valid_filename,
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=tv_datafields)

tst_datafields = [("comment_text", TEXT),
                 ("gender", LABEL)]

tst = TabularDataset(
        path=test_filename, # the file path
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=tst_datafields)

In [22]:
TEXT.build_vocab(trn, vectors="glove.6B.100d")

In [23]:
vocab = TEXT.vocab
vocab.freqs.most_common(10)

[('the', 25517),
 ('to', 20234),
 ('and', 15195),
 ('you', 13843),
 ('a', 12721),
 ('of', 11757),
 ('in', 11419),
 ('for', 9584),
 ('i', 8747),
 ('is', 8257)]

In [24]:
train_iter, val_iter = BucketIterator.splits(
        (trn, vld), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(64, 64),
        device=-1, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

test_iter = Iterator(tst, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
import torch.nn.init as init

In [26]:
class CNN_Text(nn.Module):
    
    def __init__(self, embed_num, embed_dim=100, class_num=2, kernel_num=200, kernel_sizes=[1,2,3,4]):
        super(CNN_Text, self).__init__()
        
        V = embed_num
        D = embed_dim
        C = class_num
        Ci = 1
        Co = kernel_num
        Ks = kernel_sizes

        
        self.embed = nn.Embedding(V, D, scale_grad_by_freq=True, padding_idx=1)
        
        #self.embed.weight.data.copy_(args.pretrained_weight)
        self.embed.weight.requires_grad = True
        print("dddd {} ".format(self.embed.weight.data.size()))

        print("using wide convolution")
        self.convs1 = [nn.Conv2d(in_channels=Ci, out_channels=Co, kernel_size=(K, D), stride=(1, 1),
                                     padding=(K//2, 0), dilation=1, bias=False) for K in Ks]
        print(self.convs1)

        self.dropout = nn.Dropout(p=0.5)
        self.dropout_embed = nn.Dropout(p=0.5)
        in_fea = len(Ks) * Co
        self.fc = nn.Linear(in_features=in_fea, out_features=C, bias=True)

    def forward(self, x):
        x = self.embed(x)  # (N,W,D)
        x = self.dropout_embed(x)
        x = x.unsqueeze(1)  # (N,Ci,W,D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        x = self.dropout(x)  # (N,len(Ks)*Co)
        
        logit = self.fc(x)
        
        return logit

In [27]:
class BiGRU(nn.Module):
    
    def __init__(self, embed_num, lstm_hidden_dim=200, lstm_num_layers=1, embed_dim=100, class_num=2):
        super(BiGRU, self).__init__()
        
        
        self.hidden_dim = lstm_hidden_dim
        self.num_layers = lstm_num_layers
        V = embed_num
        D = embed_dim
        C = class_num


        self.embed = nn.Embedding(V, D, padding_idx=1)
        # pretrained  embedding
        #self.embed.weight.data.copy_(args.pretrained_weight)
        # gru
        self.bigru = nn.GRU(D, self.hidden_dim, dropout=0.6, num_layers=self.num_layers, bidirectional=True)
        # linear
        self.hidden2label = nn.Linear(self.hidden_dim * 2, C)
        #  dropout
        self.dropout = nn.Dropout(p=0.6)

    def forward(self, input):
        embed = self.embed(input)
        embed = self.dropout(embed)
        input = embed.view(len(input), embed.size(1), -1)
        # gru
        gru_out, _ = self.bigru(input)
        gru_out = torch.transpose(gru_out, 0, 1)
        gru_out = torch.transpose(gru_out, 1, 2)
        # pooling
        # gru_out = F.tanh(gru_out)
        gru_out = F.max_pool1d(gru_out, gru_out.size(2)).squeeze(2)
        gru_out = F.tanh(gru_out)
        # linear
        y = self.hidden2label(gru_out)
        logit = y
        return logit

In [28]:
class BiLSTM(nn.Module):
    def __init__(self, hidden_dim, emb_dim=100,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        #self.embedding.weight.data.copy_(vocab.vectors)
        
        #print (self.embedding)
        
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 2)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [None]:
model = CNN_Text(len(vocab))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-8)
epochs = 10
is_CNN = True

In [30]:
model = BiGRU(len(vocab))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-8)
epochs = 10
is_CNN = False

  "num_layers={}".format(dropout, num_layers))


In [None]:
model = BiLSTM(len(vocab))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-8)
epochs = 10
is_CNN = False

In [31]:
for epoch in range(1, epochs+1):
    steps = 0
    running_loss = 0.0
    
    print("\n## The {} Epoch, All {} Epochs ! ##".format(epoch, epochs))
    model.train()
    for batch in tqdm.tqdm(train_iter):
        feature, target = batch.comment_text, batch.gender
        
        if is_CNN:
            feature.data.t_()
        else:
            target = autograd.Variable(target)

        optimizer.zero_grad()
        # model.zero_grad()

        logit = model(feature)
        F.cross_entropy(logit, target)
        loss = F.cross_entropy(logit, target)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.data.item() * feature.size(0)
        
    epoch_loss = running_loss / len(trn)
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for batch in tqdm.tqdm(val_iter):
        x, y = batch.comment_text, batch.gender
        preds = model(x)
        loss = F.cross_entropy(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))
        


  0%|          | 0/469 [00:00<?, ?it/s]


## The 1 Epoch, All 10 Epochs ! ##


  2%|▏         | 8/469 [00:19<18:51,  2.45s/it]  

KeyboardInterrupt: 

In [14]:
pairs = []
for batch in tqdm.tqdm(test_iter):
    x, y = batch.comment_text, batch.gender
    x.data.t_()
    preds = model(x)
    preds = preds.data.numpy()
    
    pair = [(np.argmax(item), v.item()) for item, v in zip(preds, y)]
    pairs.extend(pair)

100%|██████████| 16/16 [00:04<00:00,  3.33it/s]


In [18]:
pairs

[(0, 1),
 (0, 0),
 (1, 1),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 0),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (1, 0),
 (1, 1),
 (0, 1),
 (0, 1),
 (1, 0),
 (1, 0),
 (0, 0),
 (1, 0),
 (1, 0),
 (1, 1),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 1),
 (1, 0),
 (0, 0),
 (1, 1),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 1),
 (1, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (1, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 1),
 (1, 0),
 (0, 0),
 (0, 1),
 

In [15]:
sum([(1 if (x==y) else 0) for x,y in [((1 if (x>0.5) else 0), y) for x, y in pairs]])/len(pairs)

0.5070140280561122

In [16]:
pairs = []
for batch in tqdm.tqdm(train_iter):
    x, y = batch.comment_text, batch.gender
    x.data.t_()
    preds = model(x)
    preds = preds.data.numpy()
    
    pair = [(np.argmax(item), v.item()) for item, v in zip(preds, y)]
    pairs.extend(pair)

100%|██████████| 47/47 [00:11<00:00,  4.21it/s]


In [17]:
sum([(1 if (x==y) else 0) for x,y in [((1 if (x>0.5) else 0), y) for x, y in pairs]])/len(pairs)

0.5148580968280467