# Libraries

In [2]:
# Data management
import pandas as pd
import numpy as np

# scikit-learn
# -----------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# features preprocessing and engineering
# -----------------------
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

In [3]:
fake_df=pd.read_csv('../data/Fake.csv')
true_df=pd.read_csv('../data/True.csv')

In [4]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
fake_df.info(), true_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


(None, None)

# Data Preprocessing

In [6]:
fake_df.drop(['subject', 'date'], axis=1, inplace=True)
true_df.drop(['subject', 'date'], axis=1, inplace=True)

In [7]:
fake_df['class'] = 0
true_df['class'] = 1

In [8]:
news_df = pd.concat([fake_df, true_df], ignore_index=True, sort=False)

In [9]:
news_df['text'] = news_df['title'] + news_df['text']
news_df.drop(['title'], axis=1, inplace=True)

In [16]:
def wordopt(text):
    text = text.lower()                               # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Remove URLs
    text = re.sub(r'\W', ' ', text)                   # Replace non-word characters with a space
    text = re.sub(r'\n', '', text)                    # Remove newline characters
    text = re.sub(r' +', ' ', text)                   # Replace multiple spaces with a single space
    text = re.sub(r'^ ', '', text)                    # Remove leading space
    text = re.sub(r' $', '', text)                    # Remove trailing space
    return text

news_df['text'] = news_df['text'].apply(wordopt)

# Feature Engineering

In [18]:
# TF-IDF
vectorization = TfidfVectorizer()
vect_news_df = vectorization.fit_transform(news_df['text'])

# Model building and evaluation

In [22]:
X_train, X_test, y_train, y_test = train_test_split(vect_news_df, news_df['class'], test_size=0.2, random_state=24)

In [27]:
def model_evaluation(model):
    
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test, y_pred)
    cm_matrix = confusion_matrix(y_test, y_pred)

    print("Evaluation Metrics:")
    print("-------------------")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1score:.4f}")

    print("\nConfusion Matrix:")
    print("-----------------")
    print(f"True Positive:   {cm_matrix[1, 1]}")
    print(f"True Negative:   {cm_matrix[0, 0]}")
    print(f"False Positive:  {cm_matrix[0, 1]}")
    print(f"False Negative:  {cm_matrix[1, 0]}")



### Logistic Regression

In [25]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [28]:
model_evaluation(lr)

Evaluation Metrics:
-------------------
Accuracy:  0.9861
Precision: 0.9842
Recall:    0.9863
F1 Score:  0.9853

Confusion Matrix:
-----------------
True Positive:   4185
True Negative:   4670
False Positive:  67
False Negative:  58


### Decision Tree Classifier

In [29]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [30]:
model_evaluation(dtc)

Evaluation Metrics:
-------------------
Accuracy:  0.9957
Precision: 0.9969
Recall:    0.9939
F1 Score:  0.9954

Confusion Matrix:
-----------------
True Positive:   4217
True Negative:   4724
False Positive:  13
False Negative:  26


### AdaBoostClassifier

In [31]:
adb = AdaBoostClassifier()
adb.fit(X_train, y_train)

In [32]:
model_evaluation(adb)

Evaluation Metrics:
-------------------
Accuracy:  0.9967
Precision: 0.9969
Recall:    0.9960
F1 Score:  0.9965

Confusion Matrix:
-----------------
True Positive:   4226
True Negative:   4724
False Positive:  13
False Negative:  17


# Save the model

In [33]:
# Import the pickle module
import pickle

# Save the linear regressor model 
pickle.dump(lr, open('../models/LinearRegressor.pkl', 'wb'))

# Save the DecisionTreeClassifier model 
pickle.dump(dtc, open('../models/DecisionTreeClassifier.pkl', 'wb'))

# Save the AdaBoostClassifier 
pickle.dump(adb, open('../models/AdaBoostClassifier.pkl', 'wb'))

# Save the vectorization model
pickle.dump(vectorization, open('../models/vectorization.pkl', 'wb'))


# Testing the model with a single input

In [34]:
# Load the vectorization model
vect_model = pickle.load(open('../models/vectorization.pkl', 'rb'))

# Load the classiier model
adb_model = pickle.load(open('../models/AdaBoostClassifier.pkl', 'rb'))

In [35]:
def test(text):
    text = wordopt(text)
    text = vect_model.transform(pd.Series(text))
    prediction = adb_model.predict(text)

    if prediction == 0:
        return 'Fake News'
    elif prediction == 1:
        return 'Real News'

In [36]:
# Example: Indonesia to buy $1.14 billion worth of Russian jetsJAKARTA (Reuters) - Indonesia will buy 11 Sukhoi fighter jets worth $1.14 billion from Russia in exchange for cash and Indonesian commodities, two cabinet ministers said on Tuesday. The Southeast Asian country has pledged to ship up to $570 million worth of commodities in addition to cash to pay for the Suhkoi SU-35 fighter jets, which are expected to be delivered in stages starting in two years. Indonesian Trade Minister Enggartiasto Lukita said in a joint statement with Defence Minister Ryamizard Ryacudu that details of the type and volume of commodities were  still being negotiated . Previously he had said the exports could include palm oil, tea, and coffee. The deal is expected to be finalised soon between Indonesian state trading company PT Perusahaan Perdangangan Indonesia and Russian state conglomerate Rostec. Russia is currently facing a new round of U.S.-imposed trade sanctions. Meanwhile, Southeast Asia s largest economy is trying to promote its palm oil products amid threats of a cut in consumption by European Union countries. Indonesia is also trying to modernize its ageing air force after a string of military aviation accidents. Indonesia, which had a $411 million trade surplus with Russia in 2016, wants to expand bilateral cooperation in tourism, education, energy, technology and aviation among others.
news = str(input())
test(news)

'Real News'