# Fake News Detection: Data Collection, Cleaning, and Exploratory Analysis

## Imports

In [None]:
import numpy as np
import pandas as pd

In [None]:
import re
import nltk
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import wordpunct_tokenize
from nltk import download, classify, corpus
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk import ngrams

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')
nltk.download('words')

In [None]:
from collections import Counter
import string

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

## Data Collection

In [None]:
# Load the first fake-news CSV and rename its 'article' column to 'Content'
# so it shares a column name with the other sources when they are concatenated.
df_fakenews1 = pd.read_csv('data/fake news dataset.csv')
df_fakenews1.rename(columns={'article': 'Content'}, inplace=True) #renamed column

In [None]:
# Load the second fake-news CSV; its 'text' column is renamed to 'Content' as well.
df_fakenews2 = pd.read_csv('data/fake_or_real_news.csv')
df_fakenews2.rename(columns={'text': 'Content'}, inplace=True) #renamed column

In [None]:
# Stack the two datasets into a single DataFrame with a fresh 0..n-1 index.
df_FakeNews = pd.concat([df_fakenews1, df_fakenews2], ignore_index=True)

In [None]:
# Replace the string labels 'REAL' and 'FAKE' in the 'label' column with 0 and 1,
# respectively; any other label value is left as it is by replace().
df_FakeNews.loc[:, 'label'] = df_FakeNews['label'].replace({'REAL': 0, 'FAKE': 1})
df_FakeNews

In [None]:
# Load the news-site datasets into DataFrames (first: data/rap_dataframe.csv).
df_rappler = pd.read_csv('data/rap_dataframe.csv')
df_rappler

In [None]:
# The second site's data is split across two CSV files; read both, then stack
# them into one frame with a fresh index.
df_gma1 = pd.read_csv('data/gma-10000.csv')
df_gma2 = pd.read_csv('data/gma_dataframe.csv')

df_GMA = pd.concat([df_gma1,df_gma2], ignore_index=True)
df_GMA

In [None]:
# Drop fully identical rows from each source frame before they are merged.
# drop_duplicates() keeps the first occurrence and preserves the original index labels.
df_FakeNews = df_FakeNews.drop_duplicates()
df_FakeNews

In [None]:
df_rappler = df_rappler.drop_duplicates()
df_rappler

In [None]:
df_GMA = df_GMA.drop_duplicates()
df_GMA

In [None]:
# Preview the first rows of each source, each preceded by its variable name.
# display() is supplied by the IPython/Jupyter environment.
display("df_FakeNews",df_FakeNews.head(),
        "df_rappler", df_rappler.head(), 
        "df_GMA", df_GMA.head())

In [None]:
# Merge all three sources into one frame with a fresh index.
df_news = pd.concat([df_FakeNews,df_rappler, df_GMA], ignore_index=True)
# Coerce labels to numbers; missing or non-numeric labels become NaN and are
# then filled with 0.0.
# NOTE(review): presumably the news-site rows carry no label and are meant to
# count as real (0) — confirm.
df_news['label'] = pd.to_numeric(df_news['label'], errors='coerce').fillna(0.0)
df_news

In [None]:
# Drop duplicates
df_news = df_news.drop_duplicates()
df_news

In [None]:
# Labels no longer contain NaN (filled above), so the integer cast is safe.
df_news['label'] = df_news['label'].astype(int)
# Remove the columns that are not used downstream; all four names must exist
# in the merged frame for this drop to succeed.
df_news = df_news.drop(["Unnamed: 0", "Link", "Author", "title"], axis=1)
df_news

In [None]:
# Translation table mapping each line-break / tab character to a single space.
_LINE_BREAKS_TO_SPACE = str.maketrans({'\n': ' ', '\t': ' ', '\r': ' '})


def remove_newline_tab(text):
    """Replace every newline, tab and carriage return in *text* with a space.

    Each control character becomes exactly one space, so a sequence such as
    '\\r\\n' yields two spaces. Values that are not strings (for example NaN
    from a missing cell) are returned unchanged.

    Args:
        text: The value to clean; usually a str.

    Returns:
        The cleaned string, or *text* itself when it is not a str.
    """
    if not isinstance(text, str):
        return text
    # One pass over the string; the former extra replace of '\r\n\r' could
    # never match because '\r' and '\n' had already been substituted.
    return text.translate(_LINE_BREAKS_TO_SPACE)



In [None]:
def remove_backslashes(text):
    """Return *text* with every backslash removed; non-strings pass through untouched."""
    if not isinstance(text, str):
        return text
    # Splitting on the backslash and re-joining drops every occurrence.
    pieces = text.split('\\')
    return ''.join(pieces)

In [None]:
# Replace newlines, tabs and carriage returns in the 'Content' column of df_news.
# Non-string cells (e.g. NaN) pass through remove_newline_tab unchanged.
df_news['Content'] = df_news['Content'].apply(remove_newline_tab)
# Spot-check the row whose index label is 0.
print(df_news.loc[0, 'Content'])

In [None]:
# Preprocessing function to remove links from text
def remove_links(text):
    """Strip URLs from *text*.

    A URL is anything starting with 'http://', 'https://' or 'www.' and
    running up to the next whitespace character. The surrounding whitespace
    is left in place. Values that are not strings are returned unchanged,
    matching the behaviour of the other cleaning helpers in this file.

    Args:
        text: The value to clean; usually a str.

    Returns:
        The string without URLs, or *text* itself when it is not a str.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; pass such values through.
        return text

    # Regular expression pattern to match URLs
    url_pattern = r'https?://\S+|www\.\S+'

    # Replace URLs with an empty string
    return re.sub(url_pattern, '', text)

In [None]:
# Strip URLs from the 'Content' column of df_news and store the result in a
# new 'text' column.
# NOTE(review): astype(str) converts missing values (NaN) into the literal
# string 'nan', so empty articles survive as the text "nan" and are no longer
# detectable with isnull() — confirm this is intended.
df_news['text'] = df_news['Content'].astype(str).apply(remove_links)
# From here on the article body lives in 'text'; 'Content' no longer exists.
df_news = df_news.drop ('Content', axis = 1)
df_news

In [None]:
def remove_images(text):
    """Strip inline base64-encoded images (data URIs) from *text*.

    Matches 'data:image/<type>;base64,<payload>' where <type> is one of
    png, jpg, jpeg, gif or bmp and <payload> is a run of base64 characters.
    Values that are not strings are returned unchanged, matching the
    behaviour of the other cleaning helpers in this file.

    Args:
        text: The value to clean; usually a str.

    Returns:
        The string without embedded images, or *text* itself when it is not
        a str.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; pass such values through.
        return text

    # Define a regular expression pattern to match base64-encoded strings (images)
    base64_pattern = r"data:image\/(png|jpg|jpeg|gif|bmp);base64,[A-Za-z0-9+/=]+"

    # Replace every embedded image with an empty string
    return re.sub(base64_pattern, '', text)

In [None]:
# Remove inline base64 images from the 'text' column.
df_news['text'] = df_news['text'].astype(str).apply(remove_images)
df_news

In [None]:
# Show rows that became exact duplicates of an earlier row after cleaning.
df_news [df_news.duplicated ()]

In [None]:
# Drop those duplicates and renumber the index from 0.
df_news = df_news.drop_duplicates ()
df_news = df_news.reset_index (drop = True)
df_news

In [None]:
# True if any cell in the frame is still missing.
df_news.isnull().values.any()

In [None]:
# Build the English vocabulary once as a set so the membership tests in
# detect_language are O(1) per token.
english_words = set(words.words())

In [None]:
# Function to detect the language of a text using NLTK
def detect_language(text, threshold=0.5):
    """Classify *text* as English ('en') or Filipino ('fil').

    The text is lower-cased and tokenized with ``wordpunct_tokenize``; the
    share of tokens found in the module-level ``english_words`` set is then
    compared against *threshold*.

    Args:
        text: The article text (str).
        threshold: Minimum fraction of tokens that must be English words for
            the text to be labelled 'en'. Defaults to 0.5.

    Returns:
        'en' when the English-token rate is at least *threshold*, otherwise
        'fil'. Text that yields no tokens is labelled 'fil'.
    """
    # Named `tokens` rather than `words` so the imported nltk `words` corpus
    # is not shadowed inside this function.
    tokens = wordpunct_tokenize(text.lower())
    if not tokens:
        return 'fil'

    # NOTE(review): wordpunct_tokenize also emits punctuation tokens, which
    # count toward the denominator and lower the English rate — confirm this
    # is acceptable for the chosen threshold.
    english_word_count = sum(1 for token in tokens if token in english_words)
    english_word_rate = english_word_count / len(tokens)

    return 'en' if english_word_rate >= threshold else 'fil'

In [None]:
# Apply the detect_language function to the 'text' column to create a new column 'language'
df_news['language'] = df_news['text'].apply(detect_language)

In [None]:
# Separate news articles into English and Filipino datasets
df_english_news = df_news[df_news['language'] == 'en']
df_filipino_news = df_news[df_news['language'] == 'fil']

In [None]:
# English News Dataset: drop the helper 'language' column and renumber the index.
df_english_news = df_english_news.drop(columns='language')
df_english_news = df_english_news.reset_index (drop = True)
df_english_news

In [None]:
# Filipino News Dataset: drop the helper 'language' column and renumber the index.
df_filipino_news = df_filipino_news.drop(columns='language')
df_filipino_news = df_filipino_news.reset_index (drop = True)
df_filipino_news

In [None]:
# Save the Filipino dataset to a CSV file (without the index column).
df_filipino_news.to_csv('filipino_news.csv', index=False)

In [None]:
# Save the English dataset to a CSV file (without the index column).
df_english_news.to_csv('english_news.csv', index=False)

In [None]:
# Save the full cleaned dataset to a CSV file; df_news still includes the
# 'language' column at this point.
df_news.to_csv('cleaned_data.csv', index = False)

## Exploratory Data Analysis (EDA)

In [None]:
# Reload the cleaned dataset written at the end of the data-collection section.
# That file stores the article body under 'text' (the 'Content' column is
# dropped after link removal), while the EDA cells below index 'Content', so
# the column is renamed on load. rename() ignores keys that are absent, so a
# CSV that already uses 'Content' is left untouched.
df_cleaneddata = pd.read_csv('cleaned_data.csv').rename(columns={'text': 'Content'})
df_cleaneddata

In [None]:
# Summary statistics for the numeric columns.
df_cleaneddata.describe()

In [None]:
# Column dtypes and non-null counts.
df_cleaneddata.info()

In [None]:
# Absolute number of rows per label value.
df_cleaneddata['label'].value_counts()

In [None]:
# Relative class frequencies, shown with readable names (0 -> real, 1 -> fake).
y = df_cleaneddata.label
print(f'Ratio of real and fake news:')
y.value_counts(normalize=True).rename({0: 'real', 1: 'fake'})

In [None]:
# Horizontal bar chart of the number of missing values per column.
df_cleaneddata.isnull().sum().plot(kind="barh")
plt.show()

In [None]:
# The same missing-value counts as numbers.
df_cleaneddata.isnull().sum()

In [None]:
# Replace missing values in every column with an empty string so the string
# operations below do not fail on NaN.
df_cleaneddata = df_cleaneddata.fillna('')

In [None]:
# Number of distinct values per column.
df_cleaneddata.nunique()

In [None]:
# Overlay per-class histograms of article length, measured in characters.
bins = np.linspace(0, 200, 40)
content_length = df_cleaneddata["Content"].str.len()
label_column = df_cleaneddata["label"]

plt.figure(figsize=(12, 6))
# Fake (label 1) is drawn first, then Real (label 0); both are semi-transparent
# so the overlap stays visible.
for label_value, legend_name, bar_color in (
    (1, "Fake", "#FF5733"),
    (0, "Real", "#33FFB8"),
):
    plt.hist(
        content_length[label_column == label_value],
        bins,
        alpha=0.5,
        label=legend_name,
        color=bar_color,
    )

plt.title('Distribution of Text Length for Fake/Real News')
plt.legend(loc='upper right')
plt.grid(True)
plt.show()

In [None]:
# Bar chart of the number of articles per label value.
# NOTE(review): class_names is not referenced in this cell, and its order
# ('fake' at position 0) does not match the label encoding used elsewhere in
# this file (0 -> real, 1 -> fake) — confirm before using it for tick labels.
class_names = ['fake', 'real'] 
label_count = df_cleaneddata.label.value_counts()
plt.figure(figsize=(8, 6))
# x holds the raw label values (the value_counts index), y their counts.
sns.barplot(x=label_count.index, y=label_count)
plt.title('Distribution of Fake/Real News',fontsize =14)

In [None]:
# First few label values.
y.head()

In [None]:
# Word cloud of the raw article text: all 'Content' values are joined into one
# string (the variable is named `titles`, but it holds article bodies).
titles = ' '.join(title for title in df_cleaneddata['Content'])
wordcloud = WordCloud(
    background_color='white', 
    max_words=300,
    width=800, 
    height=400,
).generate(titles)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Tokenize each article into runs of word characters (\w+), which discards
# punctuation. 'Content' now holds a list of tokens per row.
tokenizer = RegexpTokenizer(r'\w+')
df_cleaneddata['Content'] = df_cleaneddata['Content'].apply(lambda x: tokenizer.tokenize(x))
print(df_cleaneddata.head())

In [None]:
# Lemmatize every token with WordNet (default part of speech).
lemmatizer = WordNetLemmatizer()
df_cleaneddata['Content'] = df_cleaneddata['Content'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
print(df_cleaneddata.head())

In [None]:
# Join the lemmatized tokens back into one space-separated string per article.
df_cleaneddata['Content'] = df_cleaneddata['Content'].apply(lambda x: ' '.join(x))

In [None]:
# Word cloud of the tokenized and lemmatized text; random_state fixes the layout.
all_words = ' '.join([text for text in df_cleaneddata['Content']])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def to_vector_Tfidf(df, col):
    """Return the TF-IDF matrix for one text column of a DataFrame.

    A ``TfidfVectorizer`` limited to the 2000 most frequent terms is fitted
    on ``df[col]`` and the same column is transformed.

    Args:
        df: DataFrame that contains the text column.
        col: Name of the column holding one document (str) per row.

    Returns:
        The sparse document-term matrix produced by the vectorizer, with one
        row per row of *df* and at most 2000 columns.
    """
    vectorizer = TfidfVectorizer(max_features=2000)
    # Fit and transform the frame that was passed in; the earlier version
    # transformed the global df_cleaneddata regardless of the argument.
    return vectorizer.fit_transform(df[col])

In [None]:
# Vectorize the lemmatized article text and report the matrix dimensions
# (rows = articles, columns = vocabulary terms). The shape is printed once;
# the second, unlabeled print of the same value was redundant.
text_vector_tfidf = to_vector_Tfidf(df_cleaneddata, 'Content')
print("Shape of the tfidf vector: ", text_vector_tfidf.shape)

In [None]:
# Keep only the 'Content' column (as an independent copy); every other column,
# including 'label', is discarded from df_cleaneddata from here on.
df_cleaneddata = df_cleaneddata[['Content']].copy(deep=True)
df_cleaneddata

In [None]:
# 'length': number of spaces + 1, i.e. an approximate word count per article.
df_cleaneddata['length'] = df_cleaneddata['Content'].str.count(' ') + 1
# 'LoR': number of characters per article.
# NOTE(review): the meaning of the abbreviation "LoR" is not stated in this file — confirm.
df_cleaneddata['LoR'] = df_cleaneddata['Content'].str.len()
df_cleaneddata

In [None]:
# Average approximate word count.
df_cleaneddata["length"].mean()

In [None]:
# Average character count.
df_cleaneddata["LoR"].mean()

## Feature Engineering

## Modeling

## Conclusion

## References