# Insert Title Here

## Imports

In [None]:
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

import nltk
from nltk import download, classify, corpus
from nltk import ngrams
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Fetch every NLTK resource this notebook relies on, in a fixed order.
for resource_name in ('stopwords', 'punkt', 'wordnet', 'omw-1.4', 'vader_lexicon', 'words'):
    nltk.download(resource_name)

## Data Collection

The two fake news datasets are loaded and stored in separate DataFrames, df_fakenews1 and df_fakenews2. After reading the datasets, the 'article' column in df_fakenews1 is renamed to 'Content'. Similarly, in df_fakenews2, the 'text' column is renamed to 'Content'. The two DataFrames are then merged into a single DataFrame called df_FakeNews.

In [None]:
# Load both fake-news datasets and align the name of their text column to 'Content'.
df_fakenews1 = pd.read_csv('data/fake news dataset.csv').rename(columns={'article': 'Content'})

# Relative path, consistent with every other dataset read from the local data/ folder
# (a leading slash would point at the filesystem root instead).
df_fakenews2 = pd.read_csv('data/fake_or_real_news.csv').rename(columns={'text': 'Content'})

# Combine the two datasets into a single DataFrame
df_FakeNews = pd.concat([df_fakenews1, df_fakenews2], ignore_index=True)

# Encode the string labels numerically: 'REAL' -> 0 and 'FAKE' -> 1
df_FakeNews['label'] = df_FakeNews['label'].replace({'REAL': 0, 'FAKE': 1})

In [None]:
# Swap the numeric labels (0 -> 1 and 1 -> 0). Rows encoded as 0 ('REAL') by the
# previous cell carry 1 after this cell, and rows encoded as 1 ('FAKE') carry 0.
# NOTE(review): later cells disagree on which value means "fake" (some treat 1 as
# real, others treat 1 as fake) — confirm the intended encoding before modeling.
df_FakeNews['label'] = df_FakeNews['label'].replace({0: 1, 1: 0})
df_FakeNews

The news site datasets are loaded and stored in DataFrames.

In [None]:
# Read the Rappler dataset from the local data/ folder and preview it.
df_rappler = pd.read_csv('data/rap_dataframe.csv')
df_rappler

In [None]:
# Read the two GMA CSV files and stack them into one frame with a fresh index.
gma_sources = ('data/gma-10000.csv', 'data/gma_dataframe.csv')
df_gma1, df_gma2 = (pd.read_csv(csv_path) for csv_path in gma_sources)

df_GMA = pd.concat([df_gma1, df_gma2], ignore_index=True)
df_GMA

In [None]:
# Remove exact duplicate rows from each source frame before merging them.
df_FakeNews, df_rappler, df_GMA = (
    frame.drop_duplicates() for frame in (df_FakeNews, df_rappler, df_GMA)
)

In [None]:
# Show each frame's name followed by its first rows.
for _caption, _frame in (("df_FakeNews", df_FakeNews),
                         ("df_rappler", df_rappler),
                         ("df_GMA", df_GMA)):
    display(_caption, _frame.head())

All datasets are merged into one DataFrame, df_news. After that, any duplicate rows are removed from df_news.

In [None]:
# Stack the fake-news frame and the two news-site frames into one DataFrame.
all_sources = [df_FakeNews, df_rappler, df_GMA]
df_news = pd.concat(all_sources, ignore_index=True)

# Labels that are missing or not numeric are coerced to NaN and then set to 0.0.
# NOTE(review): this gives every unlabeled row the value 0 — confirm that 0 is
# the intended class for those rows.
numeric_labels = pd.to_numeric(df_news['label'], errors='coerce')
df_news['label'] = numeric_labels.fillna(0.0)

# Drop duplicates
df_news = df_news.drop_duplicates()
df_news

## Data Cleaning and Preprocessing

We drop all unnecessary columns: 'Unnamed: 0', 'Link', 'Author', and 'title'.

In [None]:
# Store the labels as integers, then discard the columns that are not analysed.
df_news = df_news.astype({'label': int})
unused_columns = ["Unnamed: 0", "Link", "Author", "title"]
df_news = df_news.drop(columns=unused_columns)
df_news

Data often contains unwanted characters or formatting that can make it challenging to work with. To strip these from the text, we define remove_newline_tab, which replaces \n, \t, \r, and \r\n\r sequences with spaces, and remove_backslashes, which removes backslash characters from the text.

In [None]:
def remove_newline_tab(text):
    """Replace newline, tab and carriage-return characters in ``text`` with spaces.

    The three-character sequence ``\\r\\n\\r`` collapses to a single space; every
    other ``\\n``, ``\\t`` or ``\\r`` becomes one space each.

    Args:
        text: Value to clean. Non-string values (e.g. NaN) are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # Handle the multi-character sequence first: once its individual characters
    # have been replaced it can no longer match.
    cleaned = text.replace('\r\n\r', ' ')
    for whitespace_char in ('\n', '\t', '\r'):
        cleaned = cleaned.replace(whitespace_char, ' ')
    return cleaned

def remove_backslashes(text):
    """Return ``text`` with every backslash removed; non-strings pass through as-is."""
    if not isinstance(text, str):
        return text
    # Splitting on the backslash and re-joining drops each occurrence.
    return ''.join(text.split('\\'))

# Apply both clean-up helpers to the 'Content' column of df_news (the merged
# frame is named df_news, all lower-case).
df_news['Content'] = df_news['Content'].apply(remove_newline_tab).apply(remove_backslashes)
print(df_news.loc[0, 'Content'])


We also remove any web links or URLs that might be present in the text of the articles. 

In [None]:
# Preprocessing function to remove links from text
def remove_links(text):
    """Delete web links from ``text``.

    A link is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.

    Args:
        text: Value to clean. Non-string values are returned unchanged, matching
            the other clean-up helpers in this notebook.

    Returns:
        ``text`` with every matched link replaced by an empty string.
    """
    if not isinstance(text, str):
        return text

    # Regular expression pattern to match URLs
    url_pattern = r'https?://\S+|www\.\S+'

    return re.sub(url_pattern, '', text)

# Apply preprocessing to 'Content' column in df_news
# The article text lives in 'Content' (the source 'text'/'article' columns were
# renamed when the data was loaded), so the links are stripped from that column.
df_news['Content'] = df_news['Content'].astype(str).apply(remove_links)
df_news

Any images that might be embedded in the text are also removed.

In [None]:
def remove_images(text):
    """Delete inline base64-encoded images (``data:image/...;base64,...``) from ``text``.

    Args:
        text: Value to clean. Non-string values are returned unchanged, matching
            the other clean-up helpers in this notebook.

    Returns:
        ``text`` with every matched data-URI image replaced by an empty string.
    """
    if not isinstance(text, str):
        return text

    # Matches a data URI for png/jpg/jpeg/gif/bmp followed by its base64 payload.
    base64_pattern = r"data:image\/(png|jpg|jpeg|gif|bmp);base64,[A-Za-z0-9+/=]+"

    return re.sub(base64_pattern, '', text)

# The article text lives in 'Content' (the source 'text'/'article' columns were
# renamed when the data was loaded), so embedded images are stripped from it.
df_news['Content'] = df_news['Content'].astype(str).apply(remove_images)
df_news

After cleaning the data, it is then saved into a CSV file.

In [None]:
# Save the cleaned data to a CSV file in the working directory (row index omitted)
df_news.to_csv('cleaned_data.csv', index=False)

The data contains articles in both English and Filipino. We therefore split the articles into two datasets: one for English articles and another for Filipino articles. We use the Natural Language Toolkit (NLTK) library for this.

In [None]:
# Load the English words corpus from NLTK through the already-imported `corpus`
# module (the bare name `words` is never imported in this notebook).
# Entries are lower-cased because detect_language compares lower-cased tokens.
english_words = {entry.lower() for entry in corpus.words.words()}

# Function to detect the language of a text using NLTK
def detect_language(text, threshold=0.75):
    """Classify ``text`` as English (``'en'``) or Filipino (``'fil'``).

    The text is lower-cased and tokenized with ``wordpunct_tokenize``; it is
    labelled English when the share of tokens found in ``english_words`` is at
    least ``threshold``.

    Args:
        text: Article text. Non-string values and texts with no tokens are
            labelled ``'fil'``.
        threshold: Minimum fraction of tokens that must be English words.

    Returns:
        ``'en'`` or ``'fil'``.
    """
    if not isinstance(text, str):
        return 'fil'

    # NOTE(review): punctuation tokens count toward the total and are unlikely
    # to be in the vocabulary, which lowers the rate — confirm this is intended.
    tokens = wordpunct_tokenize(text.lower())
    if not tokens:
        return 'fil'

    english_word_count = sum(1 for token in tokens if token in english_words)
    english_word_rate = english_word_count / len(tokens)

    return 'en' if english_word_rate >= threshold else 'fil'

# Apply the detect_language function to the 'Content' column to create a new
# column 'language' (the merged frame is named df_news, all lower-case).
df_news['language'] = df_news['Content'].apply(detect_language)

# Separate news articles into English and Filipino datasets
df_english_news = df_news[df_news['language'] == 'en']
df_filipino_news = df_news[df_news['language'] == 'fil']

The language column is dropped from the English and Filipino news datasets because it is no longer needed. After that, like the cleaned dataset, both are saved to CSV files.

In [None]:
# English News Dataset: keep the rows tagged 'en' and drop the helper column.
# drop() returns a new frame, so df_news itself is left untouched.
df_english_news = df_news[df_news['language'] == 'en'].drop(columns='language')
df_english_news

In [None]:
# Save the English news to a CSV file in the working directory (row index omitted)
df_english_news.to_csv('english_news.csv', index=False)

In [None]:
# Filipino News Dataset: keep the rows tagged 'fil' and drop the helper column.
# drop() returns a new frame, so df_news itself is left untouched.
df_filipino_news = df_news[df_news['language'] == 'fil'].drop(columns='language')
df_filipino_news

In [None]:
# Save the Filipino news to a CSV file in the working directory, next to
# english_news.csv (a leading slash would target the filesystem root).
df_filipino_news.to_csv('filipino_news.csv', index=False)

In [None]:
# Preview the merged frame
df_news

## Exploratory Data Analysis (EDA)

In [None]:
# Reload the cleaned data that was written to cleaned_data.csv and preview it
df_cleaneddata = pd.read_csv('cleaned_data.csv')
df_cleaneddata

In [None]:
# Descriptive statistics of the cleaned data
df_cleaneddata.describe()

The DataFrame contains 24,041 rows and 2 columns. There are 24,041 non-null entries in the label column, so it has no missing values. The Content column, however, has only 24,039 non-null entries, meaning it contains 2 missing (null) values.

In [None]:
# Column dtypes and non-null counts of the cleaned data
df_cleaneddata.info()

Upon analyzing the "label" column of the DataFrame "df_cleaneddata," which contains information about real and fake news, we can determine the ratio of real and fake news articles.

In [None]:
# Share of each label value, shown with readable names (1 -> 'real', 0 -> 'fake').
y = df_cleaneddata['label']
print('Ratio of real and fake news:')
label_names = {1: 'real', 0: 'fake'}
y.value_counts(normalize=True).rename(label_names)

In [None]:
# Horizontal bar chart of the number of null values in each column
null_counts = df_cleaneddata.isnull().sum()
null_counts.plot(kind="barh")
plt.show()

In [None]:
# Number of null values in each column
df_cleaneddata.isnull().sum()

In [None]:
# Number of distinct values in each column
df_cleaneddata.nunique()

We create a histogram to compare the typical lengths of fake and real news articles and identify any potential differences between the two categories.

In [None]:
# NOTE(review): the bins only cover lengths of 0-200 characters; longer articles
# fall outside every bin — confirm this range is intended.
bins = np.linspace(0, 200, 40)

# Character length of each article. The cleaned data stores the article text
# under 'Content'; rows with missing text have no length and are left out.
content_lengths = df_cleaneddata["Content"].str.len()
labels = df_cleaneddata["label"]

# NOTE(review): label 1 is plotted as "Fake" here, while the ratio cell names
# label 1 'real' — confirm which encoding is correct.
plt.figure(figsize=(12, 6))
plt.hist(content_lengths[labels == 1].dropna(), bins, alpha=0.5, label="Fake", color="#FF5733")
plt.hist(content_lengths[labels == 0].dropna(), bins, alpha=0.5, label="Real", color="#33FFB8")

plt.title('Distribution of Text Length for Fake/Real News')
plt.legend(loc='upper right')
plt.grid(True)
plt.show()

Apart from that, we also visualize the distribution of fake news and real news through a bar graph.

In [None]:
class_names = ['fake', 'real']

# Bar chart of the number of articles carrying each label value
label_count = df_cleaneddata['label'].value_counts()
plt.figure(figsize=(8, 6))
sns.barplot(x=label_count.index, y=label_count)
plt.title('Distribution of Fake/Real News', fontsize=14)

In [None]:
# First rows of the label series
y.head()

Two new columns are added: the length column, which holds the word count of each entry in the Content column, and the LoR column, which holds the length of each entry in characters. These provide additional information about the content's length (in words and characters) for each entry in the DataFrame, which could be useful for further analysis to better understand the characteristics of the text data.

In [None]:
# Approximate word count: number of spaces in the text plus one
df_cleaneddata['length'] = df_cleaneddata['Content'].str.count(' ') + 1
# Number of characters in the text
df_cleaneddata['LoR'] = df_cleaneddata['Content'].str.len()
display(df_cleaneddata)

# Only the last expression of a cell is echoed, so the averages are printed
# explicitly instead of being silently discarded.
print("Mean length:", df_cleaneddata["length"].mean())
print("Mean LoR:", df_cleaneddata["LoR"].mean())

We also visualize the words from the articles in a word cloud, where the size of each word corresponds to its frequency in the text: the larger the word, the more frequently it appears.

In [None]:
# Join all article texts into one string. The cleaned data stores the text under
# 'Content'; missing entries are skipped because str.join only accepts strings.
titles = ' '.join(df_cleaneddata['Content'].dropna().astype(str))
wordcloud = WordCloud(
    background_color='white',
    max_words=300,
    width=800,
    height=400,
).generate(titles)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

# The cleaned data stores the article text under 'Content'. The tokens go into a
# separate 'text' column so the raw 'Content' stays available; missing articles
# are treated as empty strings and therefore yield empty token lists.
df_cleaneddata['text'] = df_cleaneddata['Content'].fillna('').astype(str).apply(tokenizer.tokenize)
print(df_cleaneddata.head())

In [None]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a list with the lemmatized form of every token in ``tokens``."""
    return list(map(lemmatizer.lemmatize, tokens))


# Replace each token list in 'text' with its lemmatized counterpart
df_cleaneddata['text'] = df_cleaneddata['text'].apply(_lemmatize_tokens)
print(df_cleaneddata.head())

In [None]:
# Turn each token list back into one space-separated string
df_cleaneddata['text'] = df_cleaneddata['text'].apply(' '.join)

In [None]:
# Join every entry of the 'text' column into one string and draw its word cloud
all_words = ' '.join(df_cleaneddata['text'])
cloud_options = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_options).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def _draw_wordcloud(joined_text, heading):
    """Build a word cloud from ``joined_text``, plot it under ``heading``, and return it."""
    cloud = WordCloud(width=800, height=400, background_color='white', max_words=300).generate(joined_text)
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis('off')
    plt.show()
    return cloud


# Separate fake and real news DataFrames
# NOTE(review): label 1 is treated as fake here, while the ratio cell names
# label 1 'real' — confirm which encoding is correct.
df_fake_news = df_cleaneddata[df_cleaneddata['label'] == 1]
df_real_news = df_cleaneddata[df_cleaneddata['label'] == 0]

# Combine texts for word clouds; missing articles are skipped because str.join
# only accepts strings.
fake_news_text = ' '.join(df_fake_news['Content'].dropna().astype(str))
real_news_text = ' '.join(df_real_news['Content'].dropna().astype(str))

# Generate word cloud for fake news
wordcloud_fake = _draw_wordcloud(fake_news_text, 'Word Cloud for Fake News')

# Generate word cloud for real news
wordcloud_real = _draw_wordcloud(real_news_text, 'Word Cloud for Real News')

## Feature Engineering

## Modeling

## Conclusion

## References