## Data Preprocessing

In [None]:
import warnings, re
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load both Kaggle splits, using the `id` column as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('/kaggle/input/fake-news/train.csv',
                     '/kaggle/input/fake-news/test.csv')
)

Checking whether any **entry** is missing the values of **all** of its attributes at once

In [None]:
# Count the missing values in each column of the training data.
train_data.isna().sum()

Apparently there are no missing values in the labels, but some of the attributes have them.<br>
The next cell drops the entries in which every value is missing and re-counts, to check whether the missing values overlap.

In [None]:
# Drop only the rows in which every column is NaN, then recount the missing values per column.
train_data.dropna(how='all').isna().sum()

The same check applies to the **Test Data** too

In [None]:
# Missing values per column of the test data: first as loaded, then after
# dropping the rows in which every column is NaN.
print(test_data.isna().sum(),'\n')
print(test_data.dropna(how='all').isna().sum())

### Handling Missing Values

> **id**: *No NaNs* <br>
> **title**: *'Not Mentioned' to replace NaN*<br>
> **author**: *'Not Mentioned' to replace NaN*<br>
> **text**: *'Not Mentioned' to replace NaN*<br>
> **label**: *No NaN*<br>

In [None]:
# Substitute the placeholder string for every missing value in both frames.
train_data = train_data.fillna('Not Mentioned')
test_data = test_data.fillna('Not Mentioned')

As the data associates an **Author** with each article, let's preview the rows and see how the articles are distributed across authors.

In [None]:
# Preview the first rows of the training data after the missing values were filled.
train_data.head()

In [None]:
# Number of articles per author; the 'Not Mentioned' placeholder is counted like any other author.
train_data['author'].value_counts()

Number of fake labels and number of real labels:

In [None]:
# Row counts for label 1 and for label 0, displayed as a (count_1, count_0) pair.
is_label_one = train_data['label'] == 1
is_label_zero = train_data['label'] == 0
len(train_data.loc[is_label_one]), len(train_data.loc[is_label_zero])

It's almost a 50-50 split, so we can say the training data is balanced

### Data Cleaning

In [None]:
def text_cleaning(text):
    """
    Lower-case `text` and replace every character outside a-z with a space.

    The string length is preserved: each removed character becomes exactly
    one space, so runs of punctuation or digits turn into runs of spaces.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z]', ' ', lowered)

# Apply the same cleaning to every free-text column of the train and the test data.
for frame in (train_data, test_data):
    for column in ('text', 'title', 'author'):
        frame[column] = frame[column].apply(text_cleaning)

## Data Ingestion Pipe using TorchText

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled data for validation.
# A fixed seed makes the split reproducible across kernel restarts (the vocabulary
# built later depends on which rows land in the training part), and stratifying on
# the label keeps the class ratio the same in both parts.
SPLIT_SEED=42
train_data, val_data=train_test_split(train_data, test_size=0.1, shuffle=True,
                                      random_state=SPLIT_SEED,
                                      stratify=train_data['label'])
len(train_data), len(val_data)

In [None]:
# !mkdir cache

# train_data.to_csv('cache/train.csv', index=False)
# val_data.to_csv('cache/val.csv', index=False)
# test_data.to_csv('cache/test.csv', index=False)

# del val_data, test_data

### Word Embeddings
This ingestion pipeline is for using **Word Embeddings**

In [None]:
import spacy
from nltk.corpus import stopwords

# NOTE(review): the 'en' shortcut name presumably only resolves on older spaCy
# releases - confirm before upgrading spaCy in this environment.
spacy_nlp=spacy.load('en')
stopword_list=stopwords.words('english')
# Membership is tested once per token, so keep a hashed copy instead of scanning the list.
_stopword_set=frozenset(stopword_list)

def tokenization(text, MAX_LEN=20000):
    """
    Split `text` into tokens with the spaCy tokenizer, dropping stop words.

    Steps: replace the listed punctuation characters and newlines with spaces,
    collapse runs of spaces, keep at most the first `MAX_LEN` characters, then
    tokenize.

    Parameters:
        text: the string to tokenize.
        MAX_LEN: maximum number of characters of the cleaned string to tokenize.

    Returns:
        list of token strings, without single-space tokens and without tokens
        found in the NLTK English stop-word list.
    """
    text=re.sub(' +', ' ',
                re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]",
                       ' ',text))
    # Slicing is a no-op when the text is already short enough.
    text=text[:MAX_LEN]
    return [token.text for token in spacy_nlp.tokenizer(text)
            if token.text!=' ' and token.text not in _stopword_set]

In [None]:
from torchtext.vocab import Vectors, Vocab
from collections import Counter


# Pre-trained GloVe vectors read from the Kaggle input directory.
gloveVectors=Vectors(name='../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt')

# Token frequencies over the text, title and author of every training row.
counter = Counter()
for row_id in tqdm(train_data.index):
    combined=' '.join((train_data.loc[row_id, 'text'],
                       train_data.loc[row_id, 'title'],
                       train_data.loc[row_id, 'author']))
    counter.update(tokenization(combined))

# Keep at most 20000 tokens that occur at least 4 times, plus the two special tokens.
vocabulary=Vocab(counter, max_size=20000, min_freq=4, vectors=gloveVectors, specials=['<pad>', '<unk>'])

print('Embedding vocab size: ', vocabulary.vectors.size(0))

In [None]:
import torch, torchtext, warnings
from torchtext.data import Field, LabelField, Dataset, Example, TabularDataset, BucketIterator

# Silence every Python warning from this point on.
warnings.filterwarnings("ignore")


class NewsDataset(Dataset):
    """
    torchtext dataset built from the rows of a pandas DataFrame.

    Each row becomes one Example whose values are taken from the `title`,
    `author`, `text` and `label` columns, in that order, so `fields` must list
    its (name, field) pairs in the same order.
    """
    def __init__(self, df, fields, **kwargs):
        examples=[
            Example.fromlist([row['title'], row['author'], row['text'], row['label']],
                             fields)
            for _, row in df.iterrows()
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(x):
        """Sort examples by the number of tokens in their text."""
        return len(x.text)

    @classmethod
    def splits(cls, fields, train_df=None, val_df=None, **kwargs):
        """
        Build one dataset per DataFrame that was supplied.

        Returns a tuple holding the training dataset and/or the validation
        dataset, in that order; a frame passed as None is left out.
        """
        datasets=[cls(frame.copy(), fields, **kwargs)
                  for frame in (train_df, val_df)
                  if frame is not None]
        return tuple(datasets)
    

# The tokenizer must be passed by keyword: the first positional parameter of `Field`
# is `sequential`, so `Field(tokenization, ...)` only set that flag and left the
# field's default tokenizer in place instead of the spaCy/stop-word tokenizer.
Text=Field(tokenize=tokenization, include_lengths=True)
Title=Field(tokenize=tokenization, include_lengths=True)
Author=Field(tokenize=tokenization, include_lengths=True)
# The labels are already numeric 0/1, so they are used as they are. A label
# vocabulary would index the labels by frequency, which lets the meaning of
# 0 and 1 flip depending on how the random split falls.
Label=LabelField(dtype=torch.float, use_vocab=False)

# All three text fields share the vocabulary built from the training rows.
Text.vocab=vocabulary
Title.vocab=vocabulary
Author.vocab=vocabulary

# The order must match the value order NewsDataset passes to Example.fromlist.
fields= [('title', Title),('author', Author), ('text', Text),('label', Label)]

train_ds, val_ds= NewsDataset.splits(fields, train_df=train_data, val_df=val_data)

# train_data, val_data= TabularDataset.splits(path='cache',
#                                            train='train.csv',
#                                            validation='val.csv',
#                                            skip_header=True,
#                                            format='csv',
#                                            fields=fields)


# Free the DataFrames; from here on only the torchtext datasets are used.
# Re-running this cell therefore requires re-running the split cell first.
del train_data, val_data
#sampling random example
#print(vars(train_ds[61]))

In [None]:
#print(vars(train_data.examples[0]))
# Sizes of the two torchtext datasets built from the train/validation frames.
print(f'Number of training examples: {len(train_ds)}')
print(f'Number of validation examples: {len(val_ds)}')

Patching the examples whose **text** came out empty after tokenization

In [None]:
def _patch_empty_text(dataset):
    """Give every example whose `text` is empty a fallback token list and print the patched example."""
    for example in dataset:
        if len(example.text)<1:
            # Reuse the title tokens when there are any, otherwise a fixed placeholder.
            if len(example.title)>0:
                example.text=example.title
            else:
                example.text=['not','mentioned']
            print(vars(example))

_patch_empty_text(train_ds)
_patch_empty_text(val_ds)

In [None]:
batch_size=64
# Use the GPU when one is available, otherwise the CPU.
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Batching options shared by the training and the validation iterator.
# NOTE(review): sort_within_batch presumably orders each batch with the dataset's
# sort_key (text length), which the packed-sequence call in the model relies on - confirm.
iterator_options=dict(batch_size=batch_size,
                      sort_within_batch=True,
                      device=device)

train_iterator, valid_iterator= BucketIterator.splits((train_ds, val_ds),
                                                      **iterator_options)

#### Sanity check for zero-len texts

In [None]:
# Print every batch that still contains a zero-length text, first for the
# training iterator and then for the validation iterator.
for iterator in (train_iterator, valid_iterator):
    for batch in iterator:
        lengths=batch.text[1]
        if 0 in lengths:
            print(lengths, batch.text[0])

## Simple Single channel--text biLSTM

In [None]:
import torch.nn as nn

# Learning rate handed to the Adam optimizer.
lr=0.001

# One embedding row per vocabulary entry.
input_dims=len(vocabulary)
# Has to equal the width of the pre-trained vectors that are copied into the embedding layer.
embedding_dims=100
hidden_dims=128
# A single logit per example; the loss applies the sigmoid.
output_dims=1
n_layers=2
bidirectional=True
# Dropout probability used between the LSTM layers and around the first linear layer.
dropoutP=0.2

# NOTE(review): assumes '<pad>' sits at index 0 of `vocabulary` - confirm via vocabulary.stoi['<pad>'].
pad_idx=0

def bin_accuracy(preds, y):
    """
    Fraction of predictions that match the targets.

    `preds` holds raw logits: they go through a sigmoid and are rounded to
    0/1 before being compared element-wise with `y`. Returns a tensor.
    """
    predicted_labels=torch.round(torch.sigmoid(preds))
    hits=(predicted_labels==y).float()
    return hits.sum()/len(hits)

class biLSTM_single(nn.Module):
    """
    LSTM text classifier: embedding -> (bi)LSTM -> two linear layers.

    The final hidden state of the last LSTM layer (both directions concatenated
    when `bidirectional` is true) is passed through dropout, `fc1`, dropout and
    `fc2`; the result is one row of `output_dims` raw logits per sequence.

    Parameters:
        vocab_size: number of rows of the embedding table.
        embedding_dims: width of each embedding vector.
        hidden_dims: LSTM hidden size, also the width of `fc1`'s output.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropoutP: dropout probability (inside the LSTM and around `fc1`).
        output_dims: number of outputs per sequence.
        pad_idx: index of the padding token in the embedding table.
    """
    def __init__(self, vocab_size, embedding_dims, hidden_dims,
              n_layers,bidirectional, dropoutP, output_dims, pad_idx):

        super().__init__()
        self.bidirectional=bidirectional
        self.embeddings=nn.Embedding(vocab_size, embedding_dims, padding_idx=pad_idx)

        self.lstm=nn.LSTM(embedding_dims, hidden_dims,
                          num_layers=n_layers, bidirectional=bidirectional,
                          dropout=dropoutP)

        # fc1 consumes one final hidden state per direction.
        n_directions=2 if bidirectional else 1
        self.fc1=nn.Linear(hidden_dims*n_directions, hidden_dims)
        self.fc2=nn.Linear(hidden_dims, output_dims)
        self.drop=nn.Dropout(dropoutP)

    def forward(self, text, text_len):
        """
        Parameters:
            text: token indices, shape [seq_len, batch_size].
            text_len: length of every sequence in the batch, in decreasing order.

        Returns:
            logits of shape [batch_size, output_dims].
        """
        #[seq_len, batch_size]-->[seq_len, batch_size, embedding_dims]
        embedding=self.embeddings(text)
        # pack_padded_sequence wants the lengths on the CPU, while the batch
        # iterator may deliver them on the GPU together with the tokens.
        if torch.is_tensor(text_len):
            text_len=text_len.cpu()
        packed_embeddings=nn.utils.rnn.pack_padded_sequence(embedding, text_len)
        #hidden:[num_layers * num_dir, batch_size, hidden_dims]
        _, (hidden, _)=self.lstm(packed_embeddings)
        if self.bidirectional:
            # The last two entries are the forward and backward states of the top layer.
            #[num_layers * num_dir, batch_size, hidden_dims] -> [batch_size, hidden_dims*2]
            hidden=torch.cat((hidden[-2,:,:],hidden[-1,:,:]), dim=1)
        else:
            # Only one direction: the last entry is the state of the top layer.
            hidden=hidden[-1,:,:]
        hidden=self.drop(hidden)
        #[batch_size, hidden_dims*num_dir] --> [batch_size, hidden_dims]
        output=self.drop(self.fc1(hidden))
        #[batch_size, hidden_dims]->[batch_size, out_dims]
        return self.fc2(output)
    
# Build the model from the hyperparameters defined above.
singleChannelBiLSTM=biLSTM_single(input_dims, embedding_dims, hidden_dims,
                                  n_layers, bidirectional, dropoutP, output_dims, pad_idx)

# Initialise the embedding table with the pre-trained vectors attached to the vocabulary.
singleChannelBiLSTM.embeddings.weight.data.copy_(vocabulary.vectors)

singleChannelBiLSTM.to(device)

# The model outputs raw logits; this loss applies the sigmoid itself.
criterion=nn.BCEWithLogitsLoss()
optimizer=torch.optim.Adam(singleChannelBiLSTM.parameters(), lr=lr)
print(sum(p.numel() for p in singleChannelBiLSTM.parameters() if p.requires_grad),' trainable prams')

# Resume from a previously saved checkpoint. This overwrites the model's parameters,
# including the embedding weights copied from the vocabulary a few lines up.
# NOTE(review): the checkpoint's embedding rows presumably follow the vocabulary it was
# trained with - confirm that the vocabulary built in this run has the same entries and order.
singleChannelBiLSTM.load_state_dict(torch.load('../input/fakenewsmodelweights/singleBiLstm-bestLoss.pt',map_location='cpu'))

In [None]:
def train(model, iterator):
    """
    Run one optimisation pass over `iterator`.

    Uses the module-level `optimizer` and `criterion`. Only the `text` field
    of each batch is fed to the model.

    Returns:
        (mean of the per-batch losses, mean of the per-batch accuracies)
    """
    model.train()

    running_loss, running_acc = 0, 0
    n_batches=len(iterator)

    for batch in tqdm(iterator, total=n_batches):
        tokens, lengths = batch.text

        optimizer.zero_grad()
        # [batch_size, 1] -> [batch_size] so the logits line up with the labels.
        logits=model(tokens, lengths).squeeze(1)
        batch_loss=criterion(logits, batch.label)
        batch_acc=bin_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss+=batch_loss.item()
        running_acc+=batch_acc.item()

    return (running_loss/n_batches, running_acc/n_batches)

def evaluate(model, iterator):
    """
    Score `model` on `iterator` without updating its weights.

    Uses the module-level `criterion`; gradients are disabled for the pass.

    Returns:
        (mean of the per-batch losses, mean of the per-batch accuracies)
    """
    model.eval()

    running_loss, running_acc = 0, 0
    n_batches=len(iterator)

    with torch.no_grad():
        for batch in tqdm(iterator, total=n_batches):
            tokens, lengths = batch.text

            # [batch_size, 1] -> [batch_size] so the logits line up with the labels.
            logits=model(tokens, lengths).squeeze(1)
            batch_loss=criterion(logits, batch.label)
            batch_acc=bin_accuracy(logits, batch.label)

            running_acc+=batch_acc.item()
            running_loss+=batch_loss.item()

    return (running_loss/n_batches, running_acc/n_batches)

from time import time

startT=time()

# Per-epoch history of the training and validation metrics.
loss, val_loss, acc, val_acc=[], [], [], []

EPOCHS=10

# Lowest validation loss seen so far; starts above any loss expected in practice.
best_loss=999

for epoch in range(EPOCHS):
    train_loss, train_acc= train(singleChannelBiLSTM, train_iterator)
    print(f'Epoch {epoch}/{EPOCHS}\nTraining: Loss {train_loss} Accuracy {train_acc}')
    valid_loss, valid_acc= evaluate(singleChannelBiLSTM, valid_iterator)
    print(f'Epoch {epoch}/{EPOCHS}\nValidation: Loss {valid_loss} Accuracy {valid_acc}')

    for history, value in ((loss, train_loss), (acc, train_acc),
                           (val_acc, valid_acc), (val_loss, valid_loss)):
        history.append(value)

    # Checkpoint the weights whenever the validation loss improves.
    improved=valid_loss<best_loss
    if improved:
        best_loss=valid_loss
        torch.save(singleChannelBiLSTM.state_dict(),'singleBiLstm-bestLoss.pt')

# Wall-clock seconds spent in the loop above.
print(time()-startT)

In [None]:
# Save the weights as they are after the last epoch; the weights with the best
# validation loss are written separately by the training loop.
torch.save(singleChannelBiLSTM.state_dict(),'singleBiLstmv2.pt')

In [None]:
# Training curves on one axis. The x-range follows the recorded history rather
# than EPOCHS, so the plot still works when fewer epochs were recorded.
epochs_run=range(len(loss))
fig, ax=plt.subplots()
ax.plot(epochs_run, loss, label='train loss')
ax.plot(epochs_run, acc, label='train accuracy')
ax.plot(epochs_run, val_loss, label='validation loss')
ax.plot(epochs_run, val_acc, label='validation accuracy')
ax.set_xlabel('epoch')
ax.set_title('Training history')
ax.legend()
plt.show()

In [None]:
# Same history as two panels, so the loss and the accuracy each get their own y-scale.
# The x-range follows the recorded history rather than EPOCHS.
epochs_run=range(len(loss))
fig, (loss_ax, acc_ax)=plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(epochs_run, loss, label='train')
loss_ax.plot(epochs_run, val_loss, label='validation')
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.set_title('Loss')
loss_ax.legend()

acc_ax.plot(epochs_run, acc, label='train')
acc_ax.plot(epochs_run, val_acc, label='validation')
acc_ax.set_xlabel('epoch')
acc_ax.set_ylabel('accuracy')
acc_ax.set_title('Accuracy')
acc_ax.legend()

plt.show()

In [None]:
import json

# Write the per-epoch metrics to disk. The file handle has to be passed to
# json.dump, and the validation series need their own keys: with repeated keys
# the later entries replace the earlier ones and the training metrics are lost.
with open('history.json','w') as fp:
    json.dump({'train loss':loss,
               'train accuracy':acc,
               'val loss':val_loss,
               'val accuracy':val_acc}, fp)

## Evaluate

In [None]:
#singleChannelBiLSTM.load_state_dict(torch.load('../input/fakenewsmodelweights/singleBiLstm-bestLoss.pt',map_location='cpu'))
# singleChannelBiLSTM.eval()
# Move the model to the CPU: infer() below builds its input tensors on the CPU.
singleChannelBiLSTM.to(torch.device('cpu'))
def infer(text, author=None, title=None, preprocessed=False):
    """
    Predict the label of a single article with `singleChannelBiLSTM`.

    Parameters:
        text: the article as a string, or as a list of tokens when
            `preprocessed` is true.
        author, title: accepted but not used by this single-channel model.
        preprocessed: set to True when `text` is already tokenized.

    Returns:
        the rounded sigmoid of the model output as an int (0 or 1);
        0 when no tokens are left to feed to the model.
    """
    singleChannelBiLSTM.eval()

    tokens=text if preprocessed else tokenization(text)
    text_arr=[vocabulary.stoi[token] for token in tokens]

    if not text_arr:
        return 0

    with torch.no_grad():
        # Column vector [seq_len, 1]: a batch that holds one sequence.
        batch=torch.LongTensor(text_arr).view(-1,1)
        # The sequence length is dimension 0. Dimension 1 is the batch size (always 1
        # here); passing it as the length makes the LSTM read only the first token.
        text_len=torch.LongTensor([batch.shape[0]])
        logit=singleChannelBiLSTM(batch, text_len).squeeze(1)
        return int(torch.round(torch.sigmoid(logit)).item())

# test_preds=[]
# for i in tqdm(test_data.iterrows(), total=len(test_data)):
#     test_preds.append(infer(i[1]['text']))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Labels as stored on the validation examples, and the model's prediction for
# the already-tokenized text of each example.
true=list(val_ds.label)
predicted=[infer(example.text, preprocessed=True) for example in val_ds]


print(confusion_matrix(true , predicted),'\n\n\n')
print(classification_report(true,predicted,target_names=['real','fake']))

In [None]:
# `test_preds` is not defined by any live cell (the loop that built it is commented
# out), so predict every test article here before assembling the submission.
test_preds=[infer(article) for article in tqdm(test_data['text'], total=len(test_data))]

my_submissions=pd.DataFrame({'id':test_data.index.values,'label':test_preds})
my_submissions.to_csv('submission.csv', index=False)