In [42]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-and-real-news-dataset/True.csv
/kaggle/input/fake-and-real-news-dataset/Fake.csv


> ## Import and examine data

In [43]:
real_df = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake_df = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')
real_df['label'] = 1
fake_df['label'] = 0
df = pd.concat([real_df, fake_df], axis=0, ignore_index=True)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


## Preprocess data

In [45]:
import warnings   
warnings.filterwarnings(action = 'ignore') 

import re
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## One-hot encode subject

In [46]:
#data = pd.get_dummies(df, columns=['subject'])
data = df
data.drop(['subject'], inplace=True, axis=1)
data.drop(['date'], inplace=True, axis=1)
data.head()

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [49]:


def preprocess(df):
    lemmatizer = WordNetLemmatizer()

    text_processed = []
    for text in df.text:
        # remove punctuation and lowercase
        text = re.sub(r'[^a-zA-Z]', ' ', text) 
        text = text.lower()
        
        # tokenize and lemmatize tokens
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(x) for x in tokens]
        text_processed.append(' '.join(tokens))
 

    title_processed = []
    for title in df.title:
        # remove punctuation and lowercase
        title = re.sub(r'[^a-zA-Z]', ' ', title) 
        title = title.lower()
        
        # tokenize and lemmatize tokens
        tokens = word_tokenize(title)
        tokens = [lemmatizer.lemmatize(x) for x in tokens]
        title_processed.append(' '.join(tokens))
        
    # vectorize
    text_vectorizer = CountVectorizer(stop_words='english', max_features=4000)
    title_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
    text_matrix = text_vectorizer.fit_transform(text_processed).toarray()
    title_matrix = title_vectorizer.fit_transform(title_processed).toarray()
    
    # save vectorizers
    pickle.dump(text_vectorizer, open('text_vectorizer.pkl','wb'))
    pickle.dump(title_vectorizer, open('title_vectorizer.pkl','wb'))
    
    # store label then drop old text columns and label
    y = np.array(df.label)
    df.drop(['title','text','label'], inplace=True, axis=1)
    
    # return np matrix
    X = np.concatenate([title_matrix, text_matrix], axis=1)
    return X, y

In [50]:
X, y = preprocess(data)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(31428, 5000)
(13470, 5000)
(31428,)
(13470,)


## Create model

In [12]:
import torch
from torch import nn

In [13]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        
        # input layer
        self.l1 = nn.Linear(5000, 2000) # input shape (5008,) -> output to layer 2000 units
        self.relu1 = nn.ReLU()
        
        # hidden layer 1
        self.l2 = nn.Linear(2000, 500)  # input shape 2000 -> output to layer 1000 units
        self.relu2 = nn.ReLU()
        
        # hidden layer 2
        self.l3 = nn.Linear(500, 100)    # input shape 500 -> output to layer 100 units
        self.relu3 = nn.ReLU()
        
        # hidden layer 3
        self.l4 = nn.Linear(100, 20)    # input shape 100 -> output to layer 20 units
        self.relu4 = nn.ReLU()
        
        # output layer
        self.l5 = nn.Linear(20, 2)      # input shape 20 -> output layer 2 units (binary classifier)
        
    def forward(self, X):
        out = self.l1(X)
        out = self.relu1(out)
        
        out = self.l2(out)
        out = self.relu2(out)
        
        out = self.l3(out)
        out = self.relu3(out)
        
        out = self.l4(out)
        out = self.relu4(out)
        
        out = self.l5(out)
        return out

## Optimizer and loss function

In [14]:
model = MLP()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
error = nn.CrossEntropyLoss()

## Training

In [15]:
X_train = torch.Tensor(X_train)
y_train = torch.Tensor(y_train).type(torch.LongTensor)

X_test = torch.Tensor(X_test)
y_test = torch.Tensor(y_test).type(torch.LongTensor)

In [16]:
epochs = 20

for epoch in range(epochs):
    
    # clear gradients
    optimizer.zero_grad()
    
    # forward pass
    out = model(X_train)
    
    # compute loss
    loss = error(out, y_train)
    
    # backprop
    loss.backward()
    
    # update parameters
    optimizer.step()
    
    # print train loss
    print(f'Epoch {epoch} Loss: {loss}')
    

Epoch 0 Loss: 0.695485532283783
Epoch 1 Loss: 0.6704117655754089
Epoch 2 Loss: 0.5986687541007996
Epoch 3 Loss: 0.4957452714443207
Epoch 4 Loss: 0.38455671072006226
Epoch 5 Loss: 0.27917471528053284
Epoch 6 Loss: 0.19155257940292358
Epoch 7 Loss: 0.13460403680801392
Epoch 8 Loss: 0.12006562948226929
Epoch 9 Loss: 0.11622913926839828
Epoch 10 Loss: 0.06967885047197342
Epoch 11 Loss: 0.08273657411336899
Epoch 12 Loss: 0.04991612210869789
Epoch 13 Loss: 0.06637027114629745
Epoch 14 Loss: 0.03500920534133911
Epoch 15 Loss: 0.04353638365864754
Epoch 16 Loss: 0.030849652364850044
Epoch 17 Loss: 0.02082207426428795
Epoch 18 Loss: 0.025929506868124008
Epoch 19 Loss: 0.017286110669374466


## Evaluate model

In [40]:
from sklearn.metrics import accuracy_score
print(X_test[0].shape)
# y_pred = model(torch.Tensor(X_test_single))
# y_pred_max = torch.max(y_pred,1)[1]
# print(y_pred_max)
# test_accuracy = accuracy_score(y_pred_max, y_test)
# print(f'Test accuracy: {test_accuracy}')

(5000,)


In [27]:
import pickle
pickle.dump(model, open('model.pkl', 'wb'))

/kaggle/working
