In [7]:
import pandas as pd
import numpy as np
import zipfile
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [8]:
# Location of the zipped dataset (this is a Colab-style path; update it if
# the archive lives somewhere else).
zip_path = "/content/news.zip"

# Directory the archive contents are extracted into.
extract_path = "/content/news_data"

In [9]:
# Unpack every member of the archive into `extract_path`; the context manager
# closes the archive handle even if extraction fails. ZipFile opens in read
# mode by default.
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path=extract_path)

In [10]:
# Locate the CSV file inside the extracted archive.
# os.listdir() returns entries in arbitrary order, so the candidates are sorted
# to guarantee the same file is selected on every run.
csv_files = sorted(
    name for name in os.listdir(extract_path) if name.endswith(".csv")
)

# Fail here with a clear message instead of leaving `dataset_path` undefined
# (which would surface later as a NameError, or silently reuse a stale value
# left over in the kernel from a previous run).
if not csv_files:
    raise FileNotFoundError(f"No .csv file found in {extract_path!r}")

dataset_path = os.path.join(extract_path, csv_files[0])

In [11]:
# Read the CSV located in the extraction directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [12]:
# Show the heading followed by the first five rows; a newline separator
# produces the same two-part output as two consecutive print calls.
print("Dataset Sample:", df.head(), sep="\n")

Dataset Sample:
   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


In [13]:
# The loaded dataset already exposes the 'text' and 'label' columns, so no
# renaming is needed (the previous rename used template placeholder names that
# do not exist in the data and was therefore a silent no-op).
# Instead, verify that the columns the rest of the notebook depends on exist,
# so a differently-shaped CSV fails here with a clear message.
required_columns = {'text', 'label'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}; "
        f"available columns: {list(df.columns)}"
    )

In [14]:
# Encode the string labels as integers: FAKE -> 0, REAL -> 1.
label_encoding = {'FAKE': 0, 'REAL': 1}  # Adjust according to your dataset

# Only encode when the column is not already made up of the target codes.
# Series.map() turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-encoded labels would otherwise wipe them out.
if not df['label'].isin(list(label_encoding.values())).all():
    encoded_labels = df['label'].map(label_encoding)

    # Surface unexpected label values immediately rather than letting NaN
    # targets reach the classifier.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label value(s): {unexpected}")

    df['label'] = encoded_labels

In [15]:
# Hold out 25% of the articles for evaluation; the fixed random_state makes
# the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.25,
    random_state=7,
)

In [16]:
# TF-IDF features: English stop words are removed, and terms that appear in
# more than 70% of the documents are ignored (max_df=0.7).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Learn the vocabulary and IDF weights from the training split only, then
# apply that fitted transformation to the held-out split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
# Linear Passive-Aggressive classifier trained on the TF-IDF features.
# The estimator shuffles the training data between epochs, so a fixed
# random_state is required for the fit (and every metric derived from it) to
# be reproducible under Restart & Run All. The seed matches the one used for
# the train/test split.
# NOTE: outputs recorded before the seed was fixed may differ slightly from a
# fresh run.
model = PassiveAggressiveClassifier(max_iter=50, random_state=7)
model.fit(X_train_tfidf, y_train)

In [18]:
# Predict a label for every document in the held-out TF-IDF matrix.
y_pred = model.predict(
    X_test_tfidf
)

In [19]:
# Fraction of held-out documents whose predicted label matches the true one.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report as a percentage with two decimals.
print(f"Model Accuracy: {acc * 100:.2f}%")

Model Accuracy: 92.11%


In [20]:
# Rows are true classes and columns are predicted classes, in sorted label
# order.
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)

Confusion Matrix:
 [[746  62]
 [ 63 713]]
