In [1]:
#!pip install transformers

In [2]:
#!pip install tensorflow

In [3]:
import transformers

In [4]:
import os
import math
from sklearn.model_selection import train_test_split

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
print("GPU Available: {}".format(torch.cuda.is_available()))
n_gpu = torch.cuda.device_count()
print("Number of GPU Available: {}".format(n_gpu))
print("GPU: {}".format(torch.cuda.get_device_name(0)))

GPU Available: True
Number of GPU Available: 16
GPU: Tesla K80


In [6]:
df_true = pd.read_csv('True.csv')
df_fake = pd.read_csv('Fake.csv')

In [7]:
df_true['label'] = 0
df_fake['label'] = 1

In [8]:
print(df_true.shape,df_fake.shape)


(21417, 5) (23481, 5)


In [9]:
# pd.set_option('display.max_colwidth', -1)
# df_true[['title','text']].sample(2)

In [10]:
df_full = pd.concat([df_true,df_fake],axis=0)

In [11]:
df_full.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [12]:
df_full = df_full[['title', 'text','label']]
df_full.dropna(inplace=True)

In [13]:
df_full['fulltext'] = df_full['title'] + ' ' + df_full['text']

In [15]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [16]:
train,test = train_test_split(df_full,test_size = 0.5,random_state=42)

In [17]:
train.shape

(22449, 4)

In [18]:
test.shape

(22449, 4)

In [19]:
train_text_list = train['fulltext'].values
test_text_list = test['fulltext'].values

In [20]:
def plot_sentence_embeddings_length(text_list, tokenizer):
    tokenized_texts = list(map(lambda t: tokenizer.tokenize(t), text_list))
    tokenized_texts_len = list(map(lambda t: len(t), tokenized_texts))
    fig, ax = plt.subplots(figsize=(8, 5));
    ax.hist(tokenized_texts_len, bins=40);
    ax.set_xlabel("Length of Comment Embeddings");
    ax.set_ylabel("Number of Comments");
    return

In [21]:
#plot_sentence_embeddings_length(train_text_list, tokenizer)

In [22]:
#plot_sentence_embeddings_length(test_text_list, tokenizer)

In [23]:
num_embeddings=126

In [24]:
def tokenize_inputs(text_list, tokenizer, num_embeddings=num_embeddings):
    """
    Tokenizes the input text input into ids. Appends the appropriate special
    characters to the end of the text to denote end of sentence. Truncate or pad
    the appropriate sequence length.
    """
    # tokenize the text, then truncate sequence to the desired length minus 2 for
    # the 2 special characters
    tokenized_texts = list(map(lambda t: tokenizer.tokenize(t)[:num_embeddings-2], text_list))
    # convert tokenized text into numeric ids for the appropriate LM
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    # append special token "<s>" and </s> to end of sentence
    input_ids = [tokenizer.build_inputs_with_special_tokens(x) for x in input_ids]
    # pad sequences
    input_ids = pad_sequences(input_ids, maxlen=num_embeddings, dtype="long", truncating="post", padding="post")
    return input_ids

def create_attn_masks(input_ids):
    """
    Create attention masks to tell model whether attention should be applied to
    the input id tokens. Do not want to perform attention on padding tokens.
    """
    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    return attention_masks

In [25]:
len(train_text_list)

22449

In [26]:
train_input_ids = tokenize_inputs(train_text_list, tokenizer, num_embeddings=num_embeddings)

In [27]:
test_input_ids = tokenize_inputs(test_text_list, tokenizer, num_embeddings=num_embeddings)

In [28]:
train_attention_masks = create_attn_masks(train_input_ids)

In [29]:
test_attention_masks = create_attn_masks(test_input_ids)

In [30]:
train_input_ids.shape

(22449, 126)

In [31]:
train = train.copy()
test = test.copy()

In [32]:
# add input ids and attention masks to the dataframe
train["features"] = train_input_ids.tolist()
train["masks"] = train_attention_masks

test["features"] = test_input_ids.tolist()
test["masks"] = test_attention_masks

In [33]:
train, valid = train_test_split(train, test_size=0.2, random_state=42)

In [34]:
X_train = train["features"].values.tolist()
X_valid = valid["features"].values.tolist()

train_masks = train["masks"].values.tolist()
valid_masks = valid["masks"].values.tolist()

Y_train = [[0,1] if label==1 else [1,0]\
           for label in list(train['label']) ]
Y_valid = [[0,1] if label==1 else [1,0]\
           for label in list(valid['label']) ]

In [35]:
X_train = torch.tensor(X_train)
X_valid = torch.tensor(X_valid)

Y_train = torch.tensor(Y_train, dtype=torch.float32)
Y_valid = torch.tensor(Y_valid, dtype=torch.float32)

train_masks = torch.tensor(train_masks, dtype=torch.long)
valid_masks = torch.tensor(valid_masks, dtype=torch.long)

In [36]:
# Select a batch size for training
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on 
# memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(X_train, train_masks, Y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,\
                              sampler=train_sampler,\
                              batch_size=batch_size)

validation_data = TensorDataset(X_valid, valid_masks, Y_valid)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,\
                                   sampler=validation_sampler,\
                                   batch_size=batch_size)

In [37]:
def train(model, num_epochs,\
          optimizer,\
          train_dataloader, valid_dataloader,\
          model_save_path,\
          train_loss_set=[], valid_loss_set = [],\
          lowest_eval_loss=None, start_epoch=0,\
          device="gpu"
          ):
    """
    Train the model and save the model with the lowest validation loss
    """

    model.to(device)

    # trange is a tqdm wrapper around the normal python range
    for i in trange(num_epochs, desc="Epoch"):
        # if continue training from saved model
        actual_epoch = start_epoch + i

        # Training

        # Set our model to training mode (as opposed to evaluation mode)
        model.train()

        # Tracking variables
        tr_loss = 0
        num_train_samples = 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            if step > 20000:
                break
            if step % 100 == 0:
                print(step)
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            # store train loss
            tr_loss += loss.item()
            num_train_samples += b_labels.size(0)
            # Backward pass
            loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            #scheduler.step()

    # Update tracking variables
    epoch_train_loss = tr_loss/num_train_samples
    train_loss_set.append(epoch_train_loss)
    print(epoch_train_loss)
    print("Train loss: {}".format(epoch_train_loss))

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_loss = 0
    num_eval_samples = 0

    # Evaluate data for one epoch
    for step, batch in enumerate(valid_dataloader):
      # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
        # Forward pass, calculate validation loss
            loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        # store valid loss
        eval_loss += loss.item()
        num_eval_samples += b_labels.size(0)

    epoch_eval_loss = eval_loss/num_eval_samples
    valid_loss_set.append(epoch_eval_loss)

    print("Valid loss: {}".format(epoch_eval_loss))

    if lowest_eval_loss == None:
        lowest_eval_loss = epoch_eval_loss
      # save model
        save_model(model, model_save_path, actual_epoch,\
                 lowest_eval_loss, train_loss_set, valid_loss_set)
    else:
        if epoch_eval_loss < lowest_eval_loss:
            lowest_eval_loss = epoch_eval_loss
            # save model
            save_model(model, model_save_path, actual_epoch,\
                       lowest_eval_loss, train_loss_set, valid_loss_set)
    print("\n")

    return model, train_loss_set, valid_loss_set


def save_model(model, save_path, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist):
    """
    Save the model to the path directory provided
    """
    model_to_save = model.module if hasattr(model, 'module') else model
    checkpoint = {'epochs': epochs, \
        'lowest_eval_loss': lowest_eval_loss,\
        'state_dict': model_to_save.state_dict(),\
        'train_loss_hist': train_loss_hist,\
        'valid_loss_hist': valid_loss_hist
       }
    cwd = os.getcwd()
    save_dir = os.path.join(cwd,'Models')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    torch.save(checkpoint, save_path)
    print("Saving model at epoch {} with validation loss of {}".format(epochs,\
                                                                     lowest_eval_loss))
    return
  
def load_model(save_path):
    """
    Load the model from the path directory provided
    """
    checkpoint = torch.load(save_path)
    model_state_dict = checkpoint['state_dict']
    model = XLNetForMultiLabelSequenceClassification(num_labels=model_state_dict["classifier.weight"].size()[0])
    model.load_state_dict(model_state_dict)

    epochs = checkpoint["epochs"]
    lowest_eval_loss = checkpoint["lowest_eval_loss"]
    train_loss_hist = checkpoint["train_loss_hist"]
    valid_loss_hist = checkpoint["valid_loss_hist"]

    return model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist

In [38]:
torch.cuda.empty_cache()

In [39]:
config = XLNetConfig()
        
class XLNetForMultiLabelSequenceClassification(torch.nn.Module):
  
  def __init__(self, num_labels=2):
    super(XLNetForMultiLabelSequenceClassification, self).__init__()
    self.num_labels = num_labels
    self.xlnet = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)
    self.classifier = torch.nn.Linear(768, num_labels)

    torch.nn.init.xavier_normal_(self.classifier.weight)

  def forward(self, input_ids, token_type_ids=None,\
              attention_mask=None, labels=None):
    # last hidden layer
    last_hidden_state = self.xlnet(input_ids=input_ids,\
                                   attention_mask=attention_mask,\
                                   token_type_ids=token_type_ids)
    # pool the outputs into a mean vector
    mean_last_hidden_state = self.pool_hidden_state(last_hidden_state)
    logits = self.classifier(mean_last_hidden_state)
        
    if labels is not None:
      loss_fct = BCEWithLogitsLoss()
      loss = loss_fct(logits.view(-1, self.num_labels),\
                      labels.view(-1, self.num_labels))
      return loss
    else:
      return logits
    
  def freeze_xlnet_decoder(self):
    """
    Freeze XLNet weight parameters. They will not be updated during training.
    """
    for param in self.xlnet.parameters():
      param.requires_grad = False
    
  def unfreeze_xlnet_decoder(self):
    """
    Unfreeze XLNet weight parameters. They will be updated during training.
    """
    for param in self.xlnet.parameters():
      param.requires_grad = True
    
  def pool_hidden_state(self, last_hidden_state):
    """
    Pool the output vectors into a single mean vector 
    """
    last_hidden_state = last_hidden_state[0]
    mean_last_hidden_state = torch.mean(last_hidden_state, 1)
    return mean_last_hidden_state
    
model = XLNetForMultiLabelSequenceClassification(num_labels=len(Y_train[0]))
#model = torch.nn.DataParallel(model)
#model.cuda()

In [40]:
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, correct_bias=False)


In [41]:
torch.cuda.empty_cache()

In [42]:
num_epochs=1

cwd = os.getcwd()
model_save_path = output_model_file = os.path.join(cwd, "Models/xlnet_fake.bin")
model, train_loss_set, valid_loss_set = train(model=model,\
                                              num_epochs=num_epochs,\
                                              optimizer=optimizer,\
                                              train_dataloader=train_dataloader,\
                                              valid_dataloader=validation_dataloader,\
                                              model_save_path=model_save_path,\
                                              device="cuda")

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

0
100
200
300
400
500
600
700
800
900
1000
1100


Epoch: 100%|██████████| 1/1 [16:00<00:00, 960.23s/it]


0.0011908560708451452
Train loss: 0.0011908560708451452
Valid loss: 8.085237623173394e-05
Saving model at epoch 0 with validation loss of 8.085237623173394e-05




In [43]:
def generate_predictions(model, df, num_labels, device="gpu", batch_size=32):
    num_iter = math.ceil(df.shape[0]/batch_size)

    pred_probs = np.array([]).reshape(0, num_labels)

    model.to(device)
    model.eval()

    for i in range(num_iter):
        df_subset = df.iloc[i*batch_size:(i+1)*batch_size,:]
        X = df_subset["features"].values.tolist()
        masks = df_subset["masks"].values.tolist()
        X = torch.tensor(X)
        masks = torch.tensor(masks, dtype=torch.long)
        X = X.to(device)
        masks = masks.to(device)
        with torch.no_grad():
            logits = model(input_ids=X, attention_mask=masks)
            logits = logits.sigmoid().detach().cpu().numpy()
            pred_probs = np.vstack([pred_probs, logits])

    return pred_probs

In [44]:
num_labels = 2
pred_probs = generate_predictions(model, test, num_labels, device="cuda", batch_size=32)
pred_probs

array([[1.78410395e-04, 9.99891043e-01],
       [7.14054622e-05, 9.99921918e-01],
       [1.54350273e-04, 9.99890566e-01],
       ...,
       [1.20074728e-05, 9.99992132e-01],
       [1.00000000e+00, 2.42506471e-12],
       [1.00000000e+00, 4.48782027e-12]])

In [45]:
test["real"] = pred_probs[:,0]
test["fake"] = pred_probs[:,1]

In [46]:
test["pred"] = test["fake"] > test["real"] 

In [47]:
test["pred"].value_counts()

True     11798
False    10651
Name: pred, dtype: int64

In [48]:
from sklearn.metrics import accuracy_score
accuracy_score(test["pred"],test["label"])

0.9993763642033052

In [49]:
#test[test['label']==1]['fake'].hist(figsize = (16,18),bins=200);

In [50]:
#test[test['label']==0]['fake'].hist(figsize = (16,18),bins=200);

In [51]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# Now use scikit-learn's text classifier to train the model.
vec = TfidfVectorizer(min_df = 3)
model = MultinomialNB()

train,test = train_test_split(df_full,test_size = 0.5,random_state=42)


In [52]:
clf = make_pipeline(vec,model)
clf = clf.fit(train['fulltext'], train['label'])

In [53]:
test_y = test['label']
test_X = test['fulltext']

In [54]:
predictions = clf.predict(test_X.astype('str'))

In [55]:
accuracy_score(test_y,predictions)

0.9368346028776338

In [56]:
predictions

array([1, 1, 1, ..., 1, 0, 0])

In [57]:
neg_class_prob_sorted = model.feature_log_prob_[0, :].argsort()
pos_class_prob_sorted = model.feature_log_prob_[1, :].argsort()

print(np.take(vec.get_feature_names(), neg_class_prob_sorted[:20]))
print(np.take(vec.get_feature_names(), pos_class_prob_sorted[:20]))


['jorgeramosnews' 'ickes' 'icm' 'cdwyer0213' 'icna' 'icnc' 'cdf' 'qosi'
 'cdata' 'cd' 'thump' 'ccbc' 'cbsthismorning' 'cbssports' 'thuggery'
 'cbsnews' 'cbsla' 'cbs2' 'iconography' 'qnfpkcihzu']
['zyuganov' 'rada' 'carme' 'carmakers' 'carlyle' 'radovan' 'carles'
 'rafah' 'rabbu' 'rafale' 'caribou' 'cargoes' 'raggi' 'rai' 'caren'
 'raila' 'railroads' 'rafik' 'rainsy' 'quynh']
