In [41]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [42]:
real = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")

real['label'] = 0
fake['label'] = 1

df = pd.concat([real, fake], axis=0)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",1
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",1
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",1
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",0
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",1


In [43]:
df['News'] = df['title'] + " " + df['text']
df = df.drop(columns=['title', 'text', 'date', 'subject'])
df.head()

Unnamed: 0,label,News
0,1,BREAKING: GOP Chairman Grassley Has Had Enoug...
1,1,Failed GOP Candidates Remembered In Hilarious...
2,1,Mike Pence’s New DC Neighbors Are HILARIOUSLY...
3,0,California AG pledges to defend birth control ...
4,1,AZ RANCHERS Living On US-Mexico Border Destroy...


In [44]:
df['News'] = df['News'].str.lower()
df['News']

0         breaking: gop chairman grassley has had enoug...
1         failed gop candidates remembered in hilarious...
2         mike pence’s new dc neighbors are hilariously...
3        california ag pledges to defend birth control ...
4        az ranchers living on us-mexico border destroy...
                               ...                        
44893    nigeria says u.s. agrees delayed $593 million ...
44894    boiler room #62 – fatal illusions tune in to t...
44895    atheists sue governor of texas over display on...
44896    republican tax plan would deal financial hit t...
44897    u.n. refugee commissioner says australia must ...
Name: News, Length: 44898, dtype: object

In [45]:
df['News'] = df['News'].apply(word_tokenize)
df['News']

0        [breaking, :, gop, chairman, grassley, has, ha...
1        [failed, gop, candidates, remembered, in, hila...
2        [mike, pence, ’, s, new, dc, neighbors, are, h...
3        [california, ag, pledges, to, defend, birth, c...
4        [az, ranchers, living, on, us-mexico, border, ...
                               ...                        
44893    [nigeria, says, u.s., agrees, delayed, $, 593,...
44894    [boiler, room, #, 62, –, fatal, illusions, tun...
44895    [atheists, sue, governor, of, texas, over, dis...
44896    [republican, tax, plan, would, deal, financial...
44897    [u.n., refugee, commissioner, says, australia,...
Name: News, Length: 44898, dtype: object

In [46]:
stop_words = set(stopwords.words("english"))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [47]:
for i in range(len(df)):
    words = df.loc[i, 'News']
    filtered = []
    for w in words:
        if w.isalpha() and w not in stop_words:
            filtered.append(w)

    df.at[i, 'News'] = filtered

df['News']

0        [breaking, gop, chairman, grassley, enough, de...
1        [failed, gop, candidates, remembered, hilariou...
2        [mike, pence, new, dc, neighbors, hilariously,...
3        [california, ag, pledges, defend, birth, contr...
4        [az, ranchers, living, border, destroy, nancy,...
                               ...                        
44893    [nigeria, says, agrees, delayed, million, figh...
44894    [boiler, room, fatal, illusions, tune, alterna...
44895    [atheists, sue, governor, texas, display, capi...
44896    [republican, tax, plan, would, deal, financial...
44897    [refugee, commissioner, says, australia, must,...
Name: News, Length: 44898, dtype: object

In [48]:
lem = WordNetLemmatizer()

for i in range(len(df)):
    words = df.loc[i, 'News']
    lemmatized = []

    for w in words:
        lemmatized.append(lem.lemmatize(w, "v"))

    df.at[i, 'News'] = lemmatized 

df.head()

Unnamed: 0,label,News
0,1,"[break, gop, chairman, grassley, enough, deman..."
1,1,"[fail, gop, candidates, remember, hilarious, m..."
2,1,"[mike, pence, new, dc, neighbor, hilariously, ..."
3,0,"[california, ag, pledge, defend, birth, contro..."
4,1,"[az, ranchers, live, border, destroy, nancy, p..."


In [56]:
texts = ["".join(tokens) for tokens in df['News']]

In [57]:
vectorizer = TfidfVectorizer(max_features=5000)

In [58]:
X = vectorizer.fit_transform(texts)
y = df['label']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [61]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [62]:
y_pred = model.predict(X_test)

In [64]:
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100}")
print(f"F1 Score: {f1_score(y_test, y_pred)*100}")

Classification Report: 
              precision    recall  f1-score   support

           0       0.58      1.00      0.73      4284
           1       1.00      0.34      0.50      4696

    accuracy                           0.65      8980
   macro avg       0.79      0.67      0.62      8980
weighted avg       0.80      0.65      0.61      8980

Accuracy: 65.28953229398664
F1 Score: 50.32669322709163


In [66]:
label_map = {
    0: "Real",
    1: "Fake"
}

while True:
    user_input = input("Enter a news article (or type 'quit' to exit): ")
    if user_input.lower() == "quit":
        print("Exiting... Goodbye!")
        break

    user_vec = vectorizer.transform([user_input])

    prediction = model.predict(user_vec)[0]

    category = label_map.get(prediction, "Unknown")
    print(f"Predicted Category: {category}\n")

Predicted Category: Real

Predicted Category: Real

Predicted Category: Real

Predicted Category: Real

Exiting... Goodbye!
