In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora needed for stop-word removal and lemmatization
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Read the news articles and discard every row containing a missing value
data = pd.read_csv('news_dataset.csv').dropna()

# Cleaning the text
def clean_text(text):
    """Normalize raw article text into lowercase, single-spaced words.

    Non-word characters (anything other than letters, digits and the
    underscore) become spaces, digits are deleted outright, runs of
    whitespace collapse to one space, and the result is lowercased and
    trimmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, free of consecutive, leading or trailing
        whitespace.
    """
    text = re.sub(r'\W', ' ', text)
    # Delete digits *before* collapsing whitespace: removing a standalone
    # number (e.g. "a 1 b") afterwards would leave a double space behind.
    text = re.sub(r'\d', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Normalize every article in place with clean_text
data['text'] = data['text'].map(clean_text)

# Resources shared by the stop-word filtering and lemmatization step
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Drop stop words from *text* and lemmatize what remains.

    The text is split on whitespace; tokens found in the module-level
    ``stop_words`` set are discarded and each surviving token is passed
    through the module-level ``lemmatizer``. The resulting tokens are
    re-joined with single spaces.
    """
    kept = (
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )
    return ' '.join(kept)

# Replace each article with its stop-word-free, lemmatized form
data['text'] = data['text'].map(preprocess)


Step 1: Data Collection

The dataset consists of labeled news articles, typically with columns such as title, text, and label, where label is a binary value indicating whether the article is real or fake. The code in this notebook relies on the text and label columns.
Step 2: Data Preprocessing

Preprocessing is crucial to clean and prepare the data for analysis. The following steps are involved:

    Removing Null Values: Drop rows with null values as they can affect the analysis.
    Text Cleaning: Replace punctuation and other non-word characters with spaces, remove digits, lowercase the text, remove English stop words, and lemmatize the remaining words. This step helps in standardizing the text.
    Tokenization: Convert the text into a list of words (tokens).



In [None]:
# Feature extraction: TF-IDF vectorization
# Limit the vocabulary to at most 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
# fit_transform returns a SciPy sparse matrix. Keep it sparse: densifying it
# with .toarray() would allocate n_documents x 5000 floats even though almost
# all entries are zero, and both train_test_split and LogisticRegression
# accept sparse input directly.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so test-set vocabulary/IDF statistics leak into the features.
# Consider fitting on the training rows only and calling transform() on the
# test rows.
X = tfidf.fit_transform(data['text'])
y = data['label']


In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# Fit a logistic-regression classifier (default settings) on the training portion
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out samples
y_pred = model.predict(X_test)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print each metric under its heading
for heading, metric in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(heading, metric)


Analysis Steps

    Data Understanding: Understand the structure and content of the dataset. Check for class imbalance.
    Data Cleaning: Remove irrelevant characters and stop words, and normalize the text (lowercasing and lemmatization).
    Feature Engineering: Convert text data into numerical features using TF-IDF vectorization.
    Model Selection: Choose a suitable model for classification. Logistic Regression is a good start due to its simplicity and effectiveness for text classification.
    Model Training and Testing: Train the model on the training set and evaluate it on the test set.
    Performance Evaluation: Use metrics like accuracy, confusion matrix, and classification report to evaluate the model.