# 01 — Data Preprocessing
This notebook handles:
• Text cleaning
• Removing stop words
• TF-IDF vectorization

In [None]:
import pandas as pd

In [None]:
# Load the two source CSVs. on_bad_lines="skip" drops rows pandas cannot parse
# instead of raising, so the loaded row counts may be lower than the files' line counts.
fake = pd.read_csv("/content/Fake.csv", on_bad_lines="skip")
true = pd.read_csv("/content/True.csv", on_bad_lines="skip")

# A cell only auto-renders its last expression, so show both previews explicitly.
display(fake.head(), true.head())

In [None]:
# Fixed seed so the shuffled row order is identical on every run.
SEED = 42

# Keep only the article text and attach the class label (1 = fake, 0 = real).
# .assign returns a new frame, so re-running this cell never writes into a
# column-subset of the loaded data.
fake = fake[['text']].assign(label=1)
true = true[['text']].assign(label=0)

# combine fake and true into one frame with a fresh 0..n-1 index
df = pd.concat([fake, true], axis=0).reset_index(drop=True)

# shuffle all rows reproducibly, then renumber the index
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# check: preview is displayed explicitly because only the last expression auto-renders
display(df.head())
df.shape

In [None]:
# text cleaning and preprocessing
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

In [None]:
# defining text cleaning function
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared, build-once helpers for clean_text:
# a lemmatizer instance and the English stop-word list as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text) -> str:
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    tokenize with ``word_tokenize``, drop tokens found in ``stop_words``, and
    lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: Raw document text. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned text; an empty string when the input is not a string.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    # remove non-alphabetic characters (digits, punctuation, non-ASCII letters)
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # lowercase so stop-word matching is case-insensitive
    text = text.lower()

    # tokenize
    tokens = word_tokenize(text)

    # remove stopwords & lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # join tokens back into a string
    return " ".join(tokens)

In [None]:
# apply cleaning to dataset
# Missing text loads as NaN (a float, not a string); replace it with an empty
# string first so NaN is never passed to clean_text.
df['clean_text'] = df['text'].fillna('').apply(clean_text)
df.head()