In [6]:
from data import data_dict, DIRECTORY
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import nltk
import torch
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from keras.preprocessing import sequence
import keras
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split


class PreprocessingDataset(Dataset):
    def __init__(self, file, root, x_col, y_col, meta_columns, label_idx = -1):
        self.x_col = x_col
        self.y_col = y_col
        self.data = pd.read_csv(file)
        self.data = self.data.sample(frac=1).reset_index(drop=True)
        self.data = self.data.drop(meta_columns, axis=1)

        # self.data, self.base_ref = self.tokenizer(self.data, [x_col])
        self.x_data = self.data[x_col]
        self.max_len = max([len(i) for i in self.x_data])

        self.x_data = self.word_vector(self.x_data)
        self.data[x_col] = [torch.LongTensor(i) for i in self.x_data]
        self.data = self.vectorize(self.data, [y_col])
        self.df_data = self.data
        self.data = self.data.to_numpy()

        self.root = root
        self.transform = transforms.Compose([transforms.ToTensor()])

    def format_text(self, token):
        clean_token = ''.join(chr for chr in token if chr.isalnum() and chr.isalpha())
        return clean_token

    def word_vector(self, data):
        x_data = data
        x_data = list(x_data)
        maximum_length = 0
        max_idx = 0
        for idx, i in enumerate(x_data):

            if len(i) > maximum_length:
                maximum_length = len(i)
                max_idx = idx
        
        t = Tokenizer()
        t.fit_on_texts(x_data)
        sequences = t.texts_to_sequences(x_data)
        sequences = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maximum_length)
        print(x_data[0])
        print(len(x_data[0]))
        print(sequences[0])

        return sequences


    def vectorize(self, data_inp, columns):
        data = data_inp
        for column in columns:
            labels = list(data[column].unique())
            ref = dict(zip(data[column].unique(), [i for i in range(len(labels))]))
            print(ref)
            for idx, val in enumerate(data[column]):
                vectorized = ref[data[column][idx]]
                data[column][idx] = torch.tensor(vectorized, dtype=float)
        return data

    def __len__ (self):
        return len(self.data)

    def __getitem__ (self, idx):
        
        self.transpose_data = self.data
        self.transpose_data = self.transpose_data.transpose()
        x_data = self.transpose_data[0]
        y_data = self.transpose_data[1]

        return x_data[idx], y_data[idx]

clean_truth_data = PreprocessingDataset(data_dict['politifact_clean'], DIRECTORY, 'statement', 'veracity', ['source', 'link'])

"Ninety percent of people born in the 1940s ended up doing better financially than their parents. But those born in the 1980s, the much-maligned Millennials, have only a 50-50 chance of doing better (financially) than their parents, despite being the best-educated generation in our history."
292
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    

In [7]:
clean_truth_data.df_data.head()

Unnamed: 0,statement,veracity
0,"[tensor(0), tensor(0), tensor(0), tensor(0), t...","tensor(0., dtype=torch.float64)"
1,"[tensor(0), tensor(0), tensor(0), tensor(0), t...","tensor(1., dtype=torch.float64)"
2,"[tensor(0), tensor(0), tensor(0), tensor(0), t...","tensor(2., dtype=torch.float64)"
3,"[tensor(0), tensor(0), tensor(0), tensor(0), t...","tensor(0., dtype=torch.float64)"
4,"[tensor(0), tensor(0), tensor(0), tensor(0), t...","tensor(3., dtype=torch.float64)"


In [8]:
BATCH_SIZE = 64

primary_data = clean_truth_data #secondary option of truth_data

train_len = int(len(primary_data)*0.8)
test_len = len(primary_data) - train_len

train_set, test_set = torch.utils.data.random_split(primary_data, [train_len, test_len])

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True)

print(len(train_set))
print(len(test_set))

num_feats = np.array([train_set[i][0]for i in range(len(train_set))])
num_labels = np.array([train_set[i][1]for i in range(len(train_set))])

a = iter(train_loader)
b = next(a)
b = np.asarray(b)
print(b.shape)
inp_size = (b[0].shape)[1]
print(inp_size)

8950
2238
(2,)
400


  num_feats = np.array([train_set[i][0]for i in range(len(train_set))])
  num_feats = np.array([train_set[i][0]for i in range(len(train_set))])
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


In [9]:
import itertools
ab = list(itertools.chain(*[i[0] for i in clean_truth_data]))
print(len(ab))
ab = set([int(i) for i in ab])
emb_dim = len(ab)

4475200


In [17]:
import torch.nn as nn
import torch.nn.functional as F

class RecurrentClassifier(nn.Module):
    def __init__(self, embedding_dim, input_size, hidden_size, output_size, num_layers, dropout=0.3):
        super(RecurrentClassifier, self).__init__()

        self.embedding = nn.Embedding(embedding_dim, input_size)
        self.rnn = nn.LSTM(input_size, 
                            hidden_size,
                            num_layers,
                            batch_first = True,
                            dropout=dropout)
        self.fc1 = nn.Linear(hidden_size*2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1, :, :]), dim=1))
        x = self.fc1(hidden)
        x = self.dropout(self.fc2(x))

        return x


max_len = len(train_set[1][0])
ref_check = 5
print(max_len)

net = RecurrentClassifier(emb_dim, int(inp_size), 50, ref_check, 2, dropout=0.2)
print(net)

400
RecurrentClassifier(
  (embedding): Embedding(13784, 400)
  (rnn): LSTM(400, 50, num_layers=2, batch_first=True, dropout=0.2)
  (fc1): Linear(in_features=100, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [19]:

LR = 1e-3
optimizer = torch.optim.Adam(net.parameters(), lr=LR, weight_decay=5e-3)
loss_func = torch.nn.CrossEntropyLoss()

epochs = 1000
losses = []

for step in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inp, labels = data
        inp, labels = inp.long(), labels.long()
        optimizer.zero_grad()
        outputs = net(inp)
        cost = loss_func(outputs, labels)
        cost.backward()
        optimizer.step()

        running_loss += cost.item()
    print(f'Epoch: {step}   Training Loss: {running_loss/len(train_loader)}')
print('Training Complete')  