In [1]:
import pandas as pd
df = pd.read_csv('Fake.csv')
df['label'] = 0  # Fake news

df_real = pd.read_csv('True.csv')
df_real['label'] = 1  # Real news

data = pd.concat([df, df_real], axis=0).sample(frac=1).reset_index(drop=True)


In [2]:
data.head()

Unnamed: 0,title,text,subject,date,label
0,Obamacare Vs Trumpcare Meme DESTROYS GOP’s He...,Donald Trump and the Republican Party s push t...,News,"March 7, 2017",0
1,UPDATE: WHY UNIV OF MICHIGAN REPLACED SCHEDULE...,HUGE NEWS: The head coach of the University of...,politics,"Apr 8, 2015",0
2,Suicide attack targets area southeast of Baghdad,BAGHDAD (Reuters) - Two attackers shot several...,worldnews,"November 27, 2017",1
3,PANIC! REUTERS/IPSOS Will Radically Change Way...,B..b..but That can t be right Maybe we should ...,politics,"Jul 30, 2016",0
4,U.N. refugee agency hopes resettlement resumes...,GENEVA (Reuters) - The United Nations refugee ...,politicsNews,"January 30, 2017",1


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower().split()
    text = [stemmer.stem(word) for word in text if word not in stop_words]
    return ' '.join(text)

data['text_clean'] = data['text'].apply(clean_text)



[nltk_data] Downloading package stopwords to C:\Users\Genius
[nltk_data]     Khunte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
data['text_clean']

0        donald trump republican parti push repeal repl...
1        huge news head coach univers michigan footbal ...
2        baghdad reuter two attack shot sever civilian ...
3        b b right mayb chang way poll reuter ipso poll...
4        geneva reuter unit nation refuge agenc voic al...
                               ...                        
44893                                                     
44894               http www youtub com watch v rur pyr ax
44895                                                     
44896    austin texa reuter bill would restrict access ...
44897    stori liber new york time hyster rex tillerson...
Name: text_clean, Length: 44898, dtype: object

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['text_clean'])
y = data['label']


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9977728285077951
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4720
           1       1.00      1.00      1.00      4260

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [8]:
import pickle
pickle.dump(model, open('fake_news_model.pkl', 'wb'))
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))
