<a href="https://colab.research.google.com/github/fjadidi2001/fake_news_detection/blob/main/DANES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle


In [None]:

# Read the raw fact-check dataset from disk
DATA_PATH = 'facebook-fact-check.csv'
df = pd.read_csv(DATA_PATH)

# The 'text' column is assumed to hold the news content and 'label' the
# target (fake/real). Missing text is replaced by an empty string.
text_column = df['text'].fillna('')
texts = text_column.values
labels = df['label'].values

# Text preprocessing
def clean_text(text):
    """Normalize one raw text value before tokenization.

    The value is lowercased, URL-like tokens (anything starting with
    ``http`` or ``www`` up to the next whitespace) are removed, every
    character that is not an ASCII letter or whitespace is deleted, and
    leading/trailing whitespace is trimmed. Interior whitespace is left
    as-is, so removed tokens may leave consecutive spaces behind.

    Args:
        text: The raw value. ``None`` and NaN are treated as empty text;
            any other non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ''
        text = str(text)
    text = text.lower()
    # 'http\S+' already covers 'https...'; no anchors are used, so no flags are needed.
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    # The text is already lowercase, so only a-z needs to be kept.
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text.strip()

# Normalize every document before it reaches the tokenizer
cleaned_texts = list(map(clean_text, texts))

# Fit the vocabulary on the cleaned corpus (num_words=10000, with an
# explicit out-of-vocabulary token)
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(cleaned_texts)

# Map texts to integer sequences, then pad/truncate at the end to length 200
sequences = tokenizer.texts_to_sequences(cleaned_texts)
padded_sequences = pad_sequences(
    sequences,
    maxlen=200,
    padding='post',
    truncating='post',
)

# Social context preprocessing (adjust columns based on your dataset)
numeric_social_cols = ['shares', 'likes', 'user_followers']
social_features = df[numeric_social_cols + ['user_verified']].copy()

# Handle missing data: numerical NaNs are filled with the column median.
# The filled column is assigned back rather than calling
# `social_features[col].fillna(..., inplace=True)`: that form operates on a
# temporary column selection, which pandas 2.x warns about and which has no
# effect on `social_features` once Copy-on-Write is enabled.
for col in numeric_social_cols:
    social_features[col] = social_features[col].fillna(social_features[col].median())
# Assume unverified if missing
social_features['user_verified'] = social_features['user_verified'].fillna(False)

# Preprocess numerical and categorical features: standardize the numeric
# columns and one-hot encode the verification flag.
# NOTE(review): the preprocessor is fitted on the full dataset, before the
# train/test split further down, so test rows influence the scaler
# statistics. Consider fitting on the training rows only.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_social_cols),
        ('cat', OneHotEncoder(), ['user_verified'])
    ])
processed_social = preprocessor.fit_transform(social_features)

# Hold out 20% of the rows for testing; the same shuffled row split
# (seeded with 42) is applied to the text, social and label arrays.
split_arrays = train_test_split(
    padded_sequences,
    processed_social,
    labels,
    test_size=0.2,
    random_state=42,
)
(X_text_train, X_text_test,
 X_social_train, X_social_test,
 y_train, y_test) = split_arrays

# Persist each split to its own .npy file
arrays_to_save = {
    'X_text_train.npy': X_text_train,
    'X_text_test.npy': X_text_test,
    'X_social_train.npy': X_social_train,
    'X_social_test.npy': X_social_test,
    'y_train.npy': y_train,
    'y_test.npy': y_test,
}
for npy_path, array in arrays_to_save.items():
    np.save(npy_path, array)

# Pickle the fitted tokenizer and preprocessor so they can be reloaded later
fitted_artifacts = (
    ('tokenizer.pkl', tokenizer),
    ('preprocessor.pkl', preprocessor),
)
for pkl_path, artifact in fitted_artifacts:
    with open(pkl_path, 'wb') as f:
        pickle.dump(artifact, f)