In [2]:
# ---------------------------
# Hindi Fake News Detection Project
# ---------------------------

import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Download punkt (only once)
nltk.download('punkt')

# ---------------------------
# Step 0: Define Hindi Stopwords (Custom List)
# ---------------------------
hindi_stopwords = set([
    "‡§î‡§∞","‡§ï‡§æ","‡§ï‡•á","‡§ï‡•Ä","‡§ï‡§ø","‡§ï‡•ã","‡§∏‡•á","‡§§‡•ã","‡§π‡•Ä","‡§™‡§∞","‡§Æ‡•á‡§Ç","‡§Ø‡§π",
    "‡§•‡§æ","‡§•‡•á","‡§•‡•Ä","‡§π‡•à","‡§π‡•à‡§Ç","‡§ó‡§Ø‡§æ","‡§ó‡§à","‡§ó‡§è","‡§π‡•ã","‡§π‡•Å‡§Ü",
    "‡§ï‡§∞","‡§ï‡§∞‡§®‡§æ","‡§ï‡§∞‡§®‡•á","‡§ï‡§ø‡§Ø‡§æ","‡§ï‡§ø‡§è","‡§ú‡§æ‡§®‡§æ","‡§ú‡§æ‡§§‡•Ä","‡§ú‡§æ‡§§‡•á","‡§ú‡§æ‡§§‡§æ",
    "‡§π‡•ã‡§§‡§æ","‡§π‡•ã‡§§‡•Ä","‡§π‡•ã‡§§‡•á","‡§Ö‡§™‡§®‡•á","‡§Ü‡§™","‡§π‡§Æ","‡§π‡§Æ‡§æ‡§∞‡§æ","‡§π‡§Æ‡§æ‡§∞‡•Ä","‡§π‡§Æ‡§æ‡§∞‡•á",
    "‡§§‡•Å‡§Æ","‡§Ü‡§™‡§ï‡§æ","‡§Ü‡§™‡§ï‡•Ä","‡§Ü‡§™‡§ï‡•á","‡§ï‡•ç‡§Ø‡§æ","‡§ï‡•ç‡§Ø‡•ã‡§Ç","‡§ï‡•å‡§®","‡§ï‡§π‡§æ‡§Å","‡§ï‡§¨",
    "‡§Ø‡•á","‡§â‡§®","‡§â‡§®‡§ï‡§æ","‡§â‡§®‡§ï‡•Ä","‡§â‡§®‡§ï‡•á","‡§µ‡§π","‡§µ‡§π‡•Ä","‡§µ‡•á","‡§á‡§∏","‡§á‡§∏‡§ï‡§æ","‡§á‡§∏‡§ï‡•á","‡§á‡§∏‡•á",
    "‡§â‡§∏","‡§â‡§∏‡§ï‡§æ","‡§â‡§∏‡§ï‡•Ä","‡§â‡§∏‡§ï‡•á","‡§∏‡§¨","‡§∏‡§≠‡•Ä","‡§•‡•ã‡§°‡§º‡§æ","‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ","‡§ï‡§≠‡•Ä","‡§π‡§Æ‡•á‡§∂‡§æ",
    "‡§Ö‡§≠‡•Ä","‡§´‡§ø‡§∞","‡§≠‡•Ä","‡§®‡§π‡•Ä‡§Ç"
])

# ---------------------------
# Step 1: Load Dataset
# ---------------------------
# Make sure you have a CSV file: hindi_fake_news.csv with columns: "text", "label"
# label = 0 (fake), 1 (real)

data = pd.read_csv("hindi_fake_news.csv")

# ---------------------------
# Step 2: Preprocessing Function
# ---------------------------
def clean_text(text):
    # Remove punctuation, numbers, English words
    text = re.sub(r"[^‡§Ä-‡•ø ]", " ", str(text))  
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in hindi_stopwords]
    return " ".join(tokens)

data["clean_text"] = data["text"].apply(clean_text)

# ---------------------------
# Step 3: Split Data
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    data["clean_text"], data["label"], test_size=0.2, random_state=42
)

# ---------------------------
# Step 4: Vectorization
# ---------------------------
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ---------------------------
# Step 5: Train Model
# ---------------------------
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# ---------------------------
# Step 6: Evaluate
# ---------------------------
y_pred = model.predict(X_test_vec)

print("‚úÖ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---------------------------
# Step 7: Test with New Input
# ---------------------------
def predict_news(news_text):
    cleaned = clean_text(news_text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)[0]
    return "üì∞ Real News" if pred == 1 else "‚ö†Ô∏è Fake News"

# Example test
print(predict_news("‡§™‡•ç‡§∞‡§ß‡§æ‡§®‡§Æ‡§Ç‡§§‡•ç‡§∞‡•Ä ‡§®‡•á ‡§Ü‡§ú ‡§®‡§à ‡§Ø‡•ã‡§ú‡§®‡§æ ‡§ï‡•Ä ‡§ò‡•ã‡§∑‡§£‡§æ ‡§ï‡•Ä‡•§"))
print(predict_news("‡§ö‡§Æ‡§§‡•ç‡§ï‡§æ‡§∞‡•Ä ‡§§‡•á‡§≤ ‡§≤‡§ó‡§æ‡§®‡•á ‡§∏‡•á ‡§ï‡•ã‡§∞‡•ã‡§®‡§æ ‡§ñ‡§§‡•ç‡§Æ ‡§π‡•ã ‡§ú‡§æ‡§è‡§ó‡§æ‡•§"))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


‚úÖ Accuracy: 0.25

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.25      1.00      0.40         1

    accuracy                           0.25         4
   macro avg       0.12      0.50      0.20         4
weighted avg       0.06      0.25      0.10         4

üì∞ Real News
üì∞ Real News


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
