In [13]:
from data import data_dict, DIRECTORY
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import nltk
import torch
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re


class BiasDataset(Dataset):

    def __init__(self, file, root, x_col, y_col, meta_columns, label_idx = -1):
        self.data = pd.read_csv(file)
        self.og_data = self.data
        self.data = self.data.sample(frac=1).reset_index(drop=True)

        self.tokenized_data, self.base_ref = self.tokenizer(self.data, [x_col])
        self.clean_data, self.errors = self.word_vector(self.tokenized_data.drop(meta_columns, axis=1), self.base_ref, [x_col])
        
        self.data = self.vectorize(self.clean_data, [y_col])
        self.data = self.padding(self.padding(self.data, [x_col]), [x_col])
        self.data = self.data.to_numpy()

        max_len = max([len(i[0]) for i in self.data])

        for idx, ent_item in enumerate(self.data):
            add_array = np.array([0]*(max_len-len(ent_item[0])))
            self.data[idx][0] = np.concatenate((ent_item[0], add_array))

        
        self.root = root
        self.transform = transforms.Compose([transforms.ToTensor()])

    def transform(self, data, col_names):
        for col in col_names:
            uniques = [list(set(data[col].values))][0]
            uniques = [x for x in uniques if str(x) != 'nan']
            one_hot = np.identity(len(uniques))
            one_hot = [str(i) for i in one_hot.tolist()]
            one_dict = dict(zip(uniques, one_hot))
            data = data.replace(one_dict)
        return data

    def format_text(self, token):
        clean_token = ''.join(chr for chr in token if chr.isalnum() and chr.isalpha())
        return clean_token

    def tokenizer(self, data, text_cols, window=2):
        all_text = set()
        transformed_data = data
        for x in text_cols:
            for idx, entry in enumerate(data[x]):
                clean_entry = list(map(self.format_text, (word for word in entry)))
                append_all_text = set()
                for y, char in enumerate(clean_entry):
                    if char == '':
                        clean_entry[y] = ' '
                all_words = ''.join(i.lower() for i in clean_entry)
                transformed_data[x][idx] = all_words
                
                for m in set(all_words.split(' ')):
                    all_text.add(m)

        return transformed_data, dict(zip(list(all_text), [z for z in range(len(all_text))]))

    def word_vector(self, data, ref, text_cols):
        bag_dataset = data
        errors = []
        for row in text_cols:
            for idx, entry in enumerate(data[row]):
                list_entry = entry.split(' ')
                try:
                    vector = torch.FloatTensor([ref[word.lower()] for word in list_entry])
                except:
                    errors.append([list_entry, idx])
                bag_dataset[row][idx] = vector
        
        return bag_dataset, errors

    def vectorize(self, data_inp, columns):
        data = data_inp
        for column in columns:
            labels = list(data[column].unique())
            ref = dict(zip(data[column].unique(), [i for i in range(len(labels))]))
            print(ref)
            for idx, val in enumerate(data[column]):
                vectorized = ref[data[column][idx]]
                data[column][idx] = torch.tensor(vectorized, dtype=float)
        return data
    
    def padding(self, data, x_column):
        deep_copy = data
        max_len = max([len(i) for i in deep_copy[x_column]])
        for idx, i in enumerate(deep_copy[x_column]):
            if len(i) != max_len:
                flag = False
                print(max_len-len(i))
                print(deep_copy[x_column][idx], torch.cat((deep_copy[x_column][idx], torch.FloatTensor([0]*(max_len-len(i))))))
                deep_copy[x_column][idx] = torch.cat((deep_copy[x_column][idx], torch.FloatTensor([0]*(max_len-len(i)))))
            else:
                flag = True
                pass
        else:
            return deep_copy

    
    def __len__ (self):
        return len(self.data)
    
    def __getitem__ (self, idx):
        self.transpose_data = self.data
        self.transpose_data = self.transpose_data.transpose()
        x_data = self.transpose_data[0]
        y_data = self.transpose_data[1]

        return x_data[idx], y_data[idx]
    

truth_data = BiasDataset(data_dict['politifact_clean'], DIRECTORY, 'statement', 'veracity', ['source', 'link'])
print(truth_data[0])

{'True': 0, 'False': 1, 'Mostly False': 2, 'Mostly True': 3, 'Pants on Fire!': 4}
(array([ 8502.,   787.,  7980., 10387.,  2915.,     0.,  4040.,  1729.,
           0.,     0.,     0.,  3944.,  6442.,  7842.,  6275.,  7639.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,   

In [14]:
from keras.preprocessing import sequence
import keras
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split


class PreprocessingDataset(Dataset):
    def __init__(self, file, root, x_col, y_col, meta_columns, label_idx = -1):
        self.x_col = x_col
        self.y_col = y_col
        self.data = pd.read_csv(file)
        self.data = self.data.sample(frac=1).reset_index(drop=True)
        self.data = self.data.drop(meta_columns, axis=1)

        # self.data, self.base_ref = self.tokenizer(self.data, [x_col])
        self.x_data = self.data[x_col]
        self.max_len = max([len(i) for i in self.x_data])

        self.x_data = self.word_vector(self.x_data)
        self.data[x_col] = [torch.FloatTensor(i) for i in self.x_data]
        self.data = self.vectorize(self.data, [y_col])

        self.data = self.data.to_numpy()

        self.root = root
        self.transform = transforms.Compose([transforms.ToTensor()])

    def format_text(self, token):
        clean_token = ''.join(chr for chr in token if chr.isalnum() and chr.isalpha())
        return clean_token

    def word_vector(self, data):
        x_data = data
        x_data = list(x_data)
        maximum_length = 0
        max_idx = 0
        for idx, i in enumerate(x_data):

            if len(i) > maximum_length:
                maximum_length = len(i)
                max_idx = idx
        
        t = Tokenizer()
        t.fit_on_texts(x_data)
        sequences = t.texts_to_sequences(x_data)
        sequences = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maximum_length)
        print(x_data[0])
        print(len(x_data[0]))
        print(sequences[0])

        return sequences


    def vectorize(self, data_inp, columns):
        data = data_inp
        for column in columns:
            labels = list(data[column].unique())
            ref = dict(zip(data[column].unique(), [i for i in range(len(labels))]))
            print(ref)
            for idx, val in enumerate(data[column]):
                vectorized = ref[data[column][idx]]
                data[column][idx] = torch.tensor(vectorized, dtype=float)
        return data

    def __len__ (self):
        return len(self.data)

    def __getitem__ (self, idx):
        
        self.transpose_data = self.data
        self.transpose_data = self.transpose_data.transpose()
        x_data = self.transpose_data[0]
        y_data = self.transpose_data[1]

        return x_data[idx], y_data[idx]

clean_truth_data = PreprocessingDataset(data_dict['politifact_clean'], DIRECTORY, 'statement', 'veracity', ['source', 'link'])

"The national economic recovery has led to higher than expected tax revenues and projected budget surpluses in nearly every state in the nation, including Wisconsin."
166
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0   

In [15]:
print(clean_truth_data[0])

(tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e

In [19]:

import random
BATCH_SIZE = 64

primary_data = clean_truth_data #secondary option of truth_data

train_len = int(len(primary_data)*0.8)
test_len = len(primary_data) - train_len

train_set, test_set = torch.utils.data.random_split(primary_data, [train_len, test_len])

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True)

# print(len(train_set))
# print(len(test_set))
# print(len(primary_data))

num_feats = np.array([train_set[i][0]for i in range(len(train_set))])
num_labels = np.array([train_set[i][1]for i in range(len(train_set))])


# print(num_feats.shape)
# print(num_labels.shape)

if primary_data == clean_truth_data:
    a = iter(train_loader)
    b = np.array(next(a))
    inp_size = (b[0].shape)[1]
else:
    inp_size = str(num_feats[0].shape)[-5:-2]

print(inp_size)


  num_feats = np.array([train_set[i][0]for i in range(len(train_set))])
  num_feats = np.array([train_set[i][0]for i in range(len(train_set))])


400


  b = np.array(next(a))
  b = np.array(next(a))


In [22]:
import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    def __init__(self, num_classes, input_size, kernel_size=4):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(input_size, 100)
        self.fc3 = nn.Linear(100, 75)        
        self.fc4 = nn.Linear(75, 20)
        self.fc6 = nn.Linear(20, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc6(x))
        
        return x

class RecurrentClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout=0.3):
        super(RecurrentClassifier, self).__init__()

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, 
                            hidden_size,
                            num_layers,
                            dropout)
        self.fc1 = nn.Linear(hidden_size*2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1, :, :]), dim=1))
        x = self.fc1(hidden)
        x = self.dropout(self.fc2(x))

        return x


max_len = len(train_set[1][0])
ref_check = 5
print(max_len)

# net = RecurrentClassifier(int(inp_size), 256, ref_check, 3, dropout=0.2)
net = FeedForward(ref_check, inp_size)
print(net)

400
FeedForward(
  (fc1): Linear(in_features=400, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=75, bias=True)
  (fc4): Linear(in_features=75, out_features=20, bias=True)
  (fc6): Linear(in_features=20, out_features=5, bias=True)
)


In [54]:

LR = 1e-3
optimizer = torch.optim.Adam(net.parameters(), lr=LR, weight_decay=5e-3)
loss_func = torch.nn.CrossEntropyLoss()

epochs = 500
losses = []

for step in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inp, labels = data
        inp, labels = inp.float(), labels.long()
        optimizer.zero_grad()
        outputs = net(inp)
        cost = loss_func(outputs, labels)
        cost.backward()
        optimizer.step()

        running_loss += cost.item()
    print(f'Epoch: {step}   Training Loss: {running_loss/len(train_loader)}')
print('Training Complete')  



# class CNNTextClassifier(nn.Module):
#     def __init__ (self, oshape, ishape):
#         super(CNNTextClassifier, self).__init__() 
#         self.conv1 = nn.Conv1d(ishape, 150)
#         self.maxpool1 = nn.MaxPool1d(150)
#         self.conv2 = nn.Conv1d(150, 100)
#         self.maxpool2 = nn.MaxPool1d(100)
#         self.conv3 = nn.Conv1d(100, 50)
#         self.maxpool3 = nn.MaxPool1d(50)
#         self.fc1 = nn.Linear(50, 25)
#         self.fc2 = nn.Linear(25, oshape)
    
#     def forward(self, x):
#         x = self.maxpool1(F.relu(self.conv1(x)))
#         x = self.maxpool2(F.relu(self.conv2(x)))
#         x = self.maxpool3(F.relu(self.conv3(x)))
#         x = x.flatten()
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))

#         return x


Epoch: 0   Training Loss: 1.6022040946143015
Epoch: 1   Training Loss: 1.5965734141213552
Epoch: 2   Training Loss: 1.594014415570668
Epoch: 3   Training Loss: 1.59106639112745
Epoch: 4   Training Loss: 1.5878231644630432
Epoch: 5   Training Loss: 1.584990439244679
Epoch: 6   Training Loss: 1.5818960785865783
Epoch: 7   Training Loss: 1.5791167744568415
Epoch: 8   Training Loss: 1.5764760971069336
Epoch: 9   Training Loss: 1.5728277700287956
Epoch: 10   Training Loss: 1.5707032663481577
Epoch: 11   Training Loss: 1.5673700290066854
Epoch: 12   Training Loss: 1.5629774570465087
Epoch: 13   Training Loss: 1.5592061919825417
Epoch: 14   Training Loss: 1.5561555334499904
Epoch: 15   Training Loss: 1.5495520685400281
Epoch: 16   Training Loss: 1.545477521419525
Epoch: 17   Training Loss: 1.5395560349736894
Epoch: 18   Training Loss: 1.5344181162970407
Epoch: 19   Training Loss: 1.531307884625026
Epoch: 20   Training Loss: 1.522511110986982
Epoch: 21   Training Loss: 1.519940481015614
Epoch:

In [71]:
total = 0
acc = 0

for i, data in enumerate(test_loader):
    inp, labels = data
    optimizer.zero_grad()

    output = net(inp.float())
    output = output.detach().numpy()
    output = list(output)
    output = [list(i).index(max(i)) for i in output]
    
    for idx, item in enumerate(torch.tensor(output)):
        total += 1
        if item == labels[idx]:
            acc += 1
print(f'{acc/total*100}%')



22.83288650580876%


In [100]:


# def format_text(token):
#     clean_token = ''.join(chr for chr in token if chr.isalnum() and chr.isalpha())
#     return clean_token


# def tokenizer(data, text_cols, window=2):
#     all_text = set()
#     transformed_data = data
#     for x in text_cols:
#         for idx, entry in enumerate(data[x]):
#             clean_entry = list(map(format_text, (word for word in entry)))
#             append_all_text = set()
#             for y, char in enumerate(clean_entry):
#                 if char == '':
#                     clean_entry[y] = ' '
#             all_words = ''.join(i.lower() for i in clean_entry)
#             transformed_data[x][idx] = all_words
            
#             for m in set(all_words.split(' ')):
#                 all_text.add(m)

#     return transformed_data, dict(zip(list(all_text), [z for z in range(len(all_text))]))
            
# truth_data = BiasDataset(data_dict['politifact'], DIRECTORY).data
# truth_data_clean = BiasDataset(data_dict['politifact_clean'], DIRECTORY).data
# truth_data_binary = BiasDataset(data_dict['politifact_clean_binarized'], DIRECTORY).data


# truth_preprocessed, base_ref = tokenizer(truth_data_clean, ['statement'])
# truth_preprocessed.head()
# ##Custom Vector Embeddings
# def word_vector(data, ref, text_cols):
#     bag_dataset = data
#     errors = []
#     for row in text_cols:
#         for idx, entry in enumerate(data[row]):
#             list_entry = entry.split(' ')
#             try:
#                 vector = torch.FloatTensor([ref[word.lower()] for word in list_entry])
#             except:
#                 errors.append([list_entry, idx])
#             bag_dataset[row][idx] = vector
    
#     return bag_dataset, errors



# truth_data, errors = word_vector(truth_preprocessed.drop(['source', 'link'], axis=1), base_ref, ['statement'])
# truth_data.head()

# labels_check = list(truth_data['veracity'].unique())
# ref_check = dict(zip(labels_check, [i for i in range(len(labels_check))]))
# print(ref_check)

# def vectorize(data_inp, columns):
#     data = data_inp
#     for column in columns:
#         labels = list(truth_data[column].unique())
#         ref = dict(zip(data[column].unique(), [i for i in range(len(labels))]))
#         print(ref)
#         for idx, val in enumerate(data[column]):
#             vectorized = ref[data[column][idx]]
#             data[column][idx] = torch.tensor(vectorized, dtype=float)
#     return data

# truth_processed = vectorize(truth_data, ['veracity'])

# truth_processed.head()

# truth_vector = truth_processed

# max_len = max([len(i) for i in truth_vector['statement']])
# for idx, i in enumerate(truth_vector['statement']):
#     if len(i) != max_len:
#         flag = False
#         truth_vector['statement'][idx] = torch.cat((truth_vector['statement'][idx], torch.FloatTensor([0]*(max_len-len(i)))), 0)
#         # print(len(truth_vector['statement'][idx]))
#     else:
#         flag = True

# print(flag)
# truth_vector.head() 

# processed_dataset = truth_vector

# train_len = int(len(processed_dataset)*0.8)
# test_len = len(processed_dataset) - train_len

# train_dataset = processed_dataset.sample(n=train_len)
# test_dataset = processed_dataset[~processed_dataset.isin(train_dataset)].dropna()
# test_dataset.reset_index(inplace=True)
# train_dataset.reset_index(inplace=True)

# train_dataset = BiasDataset(train_dataset, DIRECTORY)
# test_dataset = BiasDataset(test_dataset, DIRECTORY)

# print(len(train_dataset))
# print(len(test_dataset))
# print(len(processed_dataset))

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)
