In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-and-real-news-dataset/True.csv
/kaggle/input/fake-and-real-news-dataset/Fake.csv


# Data Wrangling

In [5]:
import numpy as np
import pandas as pd

In [6]:
data_true=pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")
data_false=pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")
data= pd.concat([data_true,data_false]).reset_index(drop=True)
data

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [7]:
y=[1 for i in range (data_true.shape[0])] +  [0 for i in range(data_false.shape[0])]
y=pd.Series(y)
y

0        1
1        1
2        1
3        1
4        1
        ..
44893    0
44894    0
44895    0
44896    0
44897    0
Length: 44898, dtype: int64

In [8]:
def transform(c) :
    return c["subject"]+ " " +c["title"]+ " "+ c["text"]
data=data.apply(transform,axis=1)
data

0        politicsNews As U.S. budget fight looms, Repub...
1        politicsNews U.S. military to accept transgend...
2        politicsNews Senior U.S. Republican senator: '...
3        politicsNews FBI Russia probe helped by Austra...
4        politicsNews Trump wants Postal Service to cha...
                               ...                        
44893    Middle-east McPain: John McCain Furious That I...
44894    Middle-east JUSTICE? Yahoo Settles E-mail Priv...
44895    Middle-east Sunnistan: US and Allied ‘Safe Zon...
44896    Middle-east How to Blow $700 Million: Al Jazee...
44897    Middle-east 10 U.S. Navy Sailors Held by Irani...
Length: 44898, dtype: object

# Text Preprocessing - Cleaning

In [9]:
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language="english")

from nltk.stem.porter import PorterStemmer
stemmer2 = PorterStemmer()

from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import string
punct = string.punctuation


In [10]:
def process(s):
    for p in punct:
        s = s.replace(p, '')
    s = s.lower()
    s = word_tokenize(s)
    s = [w for w in s if not w in stop_words] #optional
    s = [stemmer.stem(word) for word in s]
    return s

In [11]:
list(stop_words)[:20]

['am',
 'each',
 'where',
 'be',
 'few',
 'haven',
 'yours',
 'my',
 'if',
 'it',
 'them',
 'being',
 'doesn',
 "hadn't",
 "mightn't",
 'nor',
 'then',
 'has',
 'too',
 'have']

In [12]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
number_of_samples = data.shape[0]

from tqdm import tqdm

for i in tqdm(range(number_of_samples)):
    data[i] = process(data[i])

100%|██████████| 44898/44898 [06:15<00:00, 119.73it/s]


In [14]:
for i in tqdm(range(number_of_samples)):
    data[i] = " ".join(data[i])

100%|██████████| 44898/44898 [00:01<00:00, 33001.37it/s]


In [15]:
data

0        politicsnew us budget fight loom republican fl...
1        politicsnew us militari accept transgend recru...
2        politicsnew senior us republican senat let mr ...
3        politicsnew fbi russia probe help australian d...
4        politicsnew trump want postal servic charg muc...
                               ...                        
44893    middleeast mcpain john mccain furious iran tre...
44894    middleeast justic yahoo settl email privaci cl...
44895    middleeast sunnistan us alli ‘ safe zone ’ pla...
44896    middleeast blow 700 million al jazeera america...
44897    middleeast 10 us navi sailor held iranian mili...
Length: 44898, dtype: object

# Text Representation

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## 1 - Bag of Words (BoW)

In [24]:
vectorizer = CountVectorizer(max_features=5000)
bow_data = vectorizer.fit_transform(data).toarray()
features = vectorizer.get_feature_names_out()
bow_data = pd.DataFrame(bow_data, columns=features)

In [25]:
bow_data

Unnamed: 0,10,100,1000,10000,100000,11,12,120,13,130,...,zanupf,zarrab,zealand,zero,zika,zimbabw,zink,zone,zuckerberg,zuma
0,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44894,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44895,0,2,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,14,0,0
44896,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## 2 - Term Frequency - Inverse Document Frequency (TF-IDF)

In [26]:
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_data = vectorizer.fit_transform(data).toarray()
features = vectorizer.get_feature_names_out()
tfidf_data = pd.DataFrame(tfidf_data, columns=features)

In [27]:
tfidf_data

Unnamed: 0,10,100,1000,10000,100000,11,12,120,13,130,...,zanupf,zarrab,zealand,zero,zika,zimbabw,zink,zone,zuckerberg,zuma
0,0.062085,0.000000,0.0,0.0,0.0,0.035354,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,0.028905,0.000000,0.0,0.0,0.0,0.032919,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
44894,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
44895,0.000000,0.019646,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.011733,0.0,0.0,0.0,0.154932,0.0,0.0
44896,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.04811,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0


# Naive Bayes Modelling

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [30]:
def train_and_evaluate(x, y):
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
    nb.fit(xtrain, ytrain)
    ypred_tr = nb.predict(xtrain)
    ypred_ts = nb.predict(xtest)
    print("Training Results:\n")
    print(classification_report(ytrain, ypred_tr))
    print("\n\nTesting Results:\n")
    print(classification_report(ytest, ypred_ts))


In [31]:
train_and_evaluate(bow_data, y)

Training Results:

              precision    recall  f1-score   support

           0       0.96      0.95      0.96     18782
           1       0.95      0.96      0.96     17136

    accuracy                           0.96     35918
   macro avg       0.96      0.96      0.96     35918
weighted avg       0.96      0.96      0.96     35918



Testing Results:

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      4699
           1       0.95      0.96      0.95      4281

    accuracy                           0.95      8980
   macro avg       0.95      0.96      0.95      8980
weighted avg       0.96      0.95      0.95      8980



In [39]:
def predict_news():
    news_idk = input("Give me the news: ")
    
    # Preprocess the user input news string
    news_idk_processed = process(news_idk)
    news_idk_processed = " ".join(news_idk_processed)
    
    # Transform the preprocessed string into a TF-IDF vector
    news_idk_tfidf = vectorizer.transform([news_idk_processed]).toarray()
    
    # Predict the class of the news string
    prediction = nb.predict(news_idk_tfidf)
    print(f"The prediction for the given news is: {'Real' if prediction[0] == 1 else 'Fake'}")


In [44]:
predict_news()

Give me the news:  Former President Donald Trump was found guilty on 34 felony counts of falsifying business records as part of a scheme to influence the 2016 election by concealing payments he made to an adult-film star. Manhattan District Attorney Alvin Bragg, who guided the prosecution of the case against Trump, announced the verdict reached by a New York State Supreme Court jury on May 30.  The district attorney’s office will provide a memo to Justice Juan Merchan with recommendations for Trump’s sentencing, which is scheduled for July 11.  Bragg’s role in the historic case, in which Trump became the first current or former U.S. president to be convicted of a criminal offense, has made him a target of unsupported claims on social media about his financial holdings.    A widely shared meme posted May 30 on Facebook, showing a photo of Bragg, claims, “Net Worth: $42 Million[.] His Net Worth Has Grown 300% The Past 5 Years[.] He Owns 12 Properties, 8 Cars, and 3 Luxury Yachts[.] How D

The prediction for the given news is: Fake




In [45]:
predict_news()

Give me the news:  Neutral-Atom Breakthrough Surpasses Quantum Error-Correcting Thresholds, A Critical Step to Achieving Quantum Viability in Commercial Applications  BOSTON, MASSACHUSETTS, October 12, 2023 – QuEra Computing, the leader in neutral-atom quantum computers, today announced that a team of researchers from Harvard, MIT and QuEra successfully demonstrated two-qubit entangling gates with an unprecedented 99.5% fidelity on 60 neutral atom qubits in parallel. The quantum breakthrough is the result of an extensive test conducted by Harvard University’s Department of Physics and John A. Paulson School of Engineering and Applied Sciences, QuEra, and MIT’s Department of Physics and Research Laboratory of Electronics. The breakthrough was first reported in ArXiv, and the full research paper can be found here.  Performing entangling quantum operations with low error rates in a scalable fashion is a central element of useful quantum information processing. Neutral atom arrays have rec

The prediction for the given news is: Real


