In [19]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# 🔹 Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 🔹 Load stopwords and initialize lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# 🔹 Function to clean text (Stopwords removal, Lowercasing, Lemmatization, Punctuation Removal)
def clean_text(text):
    if isinstance(text, str):  # Make sure it's a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and special chars
        words = word_tokenize(text)  # Tokenize
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Remove stopwords and lemmatize
        return " ".join(words)
    return text

# 🔹 Load your CSV file
file_path = "C:/Users/Dell/Desktop/PROJECTS/NLP-PROJECT/new/archive/final_en.csv"
df = pd.read_csv(file_path)

# 🔹 Apply cleaning to all object (text) columns
text_columns = df.select_dtypes(include=["object"]).columns
for col in text_columns:
    df[col] = df[col].astype(str).map(clean_text)

# 🔹 Save the cleaned DataFrame to a new CSV file
cleaned_file_path = "C:/Users/Dell/Desktop/PROJECTS/NLP-PROJECT/new/archive/cleaned_final_en.csv"
df.to_csv(cleaned_file_path, index=False)

print("✅ Text cleaned and saved to 'cleaned_fake_news_dataset_1000.csv'")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Text cleaned and saved to 'cleaned_fake_news_dataset_1000.csv'


In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# 🔹 Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 🔹 Stopwords and lemmatizer setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# 🔹 Text cleaning function
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        words = word_tokenize(text)
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        return " ".join(words)
    return text

# 🔹 Load dataset
file_path = "C:/Users/Dell/Desktop/PROJECTS/NLP-PROJECT/new/archive/final_en.csv"
df = pd.read_csv(file_path)

# 🔹 Clean all object (text) columns
text_columns = df.select_dtypes(include=["object"]).columns
for col in text_columns:
    df[col] = df[col].astype(str).map(clean_text)

# 🔹 Save cleaned version
cleaned_file_path =  "C:/Users/Dell/Desktop/PROJECTS/NLP-PROJECT/new/archive/final_en.csv"
df.to_csv(cleaned_file_path, index=False)

# 🔹 Define feature column and labels
# Use a specific column like 'content' or 'text' if available
text_column = text_columns[1]  # Or replace with actual, e.g., 'content'
X_text = df[text_column]

# 🔹 Dummy labels (binary: 0 for fake, 1 for real — replace with real ones if you have)
if 'label' in df.columns:
    y = df['label']
else:
    y = [0 if i < len(df) // 2 else 1 for i in range(len(df))]  # Dummy binary labels

# 🔹 TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_features = tfidf.fit_transform(X_text)

# 🔹 Train/test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Dataset cleaned, TF-IDF features extracted, and train/test split completed.")


In [None]:
import pandas as pd
import nltk
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve
)

# 🔹 Suppress warnings
warnings.filterwarnings("ignore")

# 🔹 Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 🔹 Setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# 🔹 Text cleaning function
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        words = word_tokenize(text)
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        return " ".join(words)
    return text

# 🔹 Load dataset
file_path = "C:/Users/Dell/Desktop/PROJECTS/NLP-PROJECT/new/archive/final_en.csv"
df = pd.read_csv(file_path)

# 🔹 Clean all object (text) columns
text_columns = df.select_dtypes(include=["object"]).columns
for col in text_columns:
    df[col] = df[col].astype(str).map(clean_text)

# 🔹 Save cleaned version
df.to_csv(file_path, index=False)

# 🔹 Feature and label selection
text_column = text_columns[3]  # Adjust if needed
X_text = df[text_column]

# 🔹 Dummy labels (replace with real if available)
if 'label' in df.columns:
    y = df['label']
else:
    y = [0 if i < len(df) // 2 else 1 for i in range(len(df))]

# 🔹 TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_features = tfidf.fit_transform(X_text)

# 🔹 Train/Test Split with Stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)

# 🔹 Check class distribution
print("📊 Class distribution:\n", pd.Series(y).value_counts())

# 🔹 Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# 🔹 Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # For ROC-AUC

# 🔹 Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Accuracy: {accuracy:.4f}")

# 🔹 Classification Report
try:
    print("\n📄 Classification Report:\n")
    print(classification_report(y_test, y_pred))
except Exception as e:
    print(f"⚠️ Error generating classification report: {e}")

# 🔹 Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('🧱 Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

# 🔹 ROC Curve and AUC Score
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('📈 ROC Curve')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

print(f"\n🎯 ROC-AUC Score: {roc_auc:.4f}")
