# Fake News Detection: Data Collection, Cleaning and Exploratory Analysis

## Imports

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk import ngrams

from collections import Counter
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

In [None]:
# Fetch the NLTK data packages needed by the NLTK tools imported above.
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('wordnet')  # WordNet database used by WordNetLemmatizer
nltk.download('omw-1.4')  # Open Multilingual WordNet data that accompanies WordNet
nltk.download('vader_lexicon')  # lexicon used by SentimentIntensityAnalyzer

## Data Collection

In [None]:
# Read both fake-news CSV files, renaming each article-body column to
# 'Content' so the two frames share a column when they are stacked.
df_fakenews1 = pd.read_csv('data/fake news dataset.csv').rename(columns={'article': 'Content'})
df_fakenews2 = pd.read_csv('data/fake_or_real_news.csv').rename(columns={'text': 'Content'})

# Stack the two datasets into a single DataFrame with a fresh 0..n-1 index.
df_FakeNews = pd.concat([df_fakenews1, df_fakenews2], ignore_index=True)

# Recode the textual labels as numbers: 'REAL' -> 0 and 'FAKE' -> 1.
# Any other value found in the 'label' column is left untouched.
df_FakeNews.loc[:, 'label'] = df_FakeNews['label'].replace({'REAL': 0, 'FAKE': 1})
df_FakeNews

In [None]:
# Load the Rappler articles into a DataFrame.
# NOTE(review): this file is read from the working directory while the
# fake-news CSVs are read from data/ — confirm the path is intended.
df_Rappler = pd.read_csv('rap_dataframe.csv')
df_Rappler

In [None]:
# Load the two GMA article files and stack them into one DataFrame with a
# fresh 0..n-1 index.
# NOTE(review): neither path has a file extension, unlike the other CSV
# inputs in this notebook — confirm the file names.
df_gma1 = pd.read_csv('data/gma-10000')

df_gma2 = pd.read_csv('gma_dataframe')

df_GMA = pd.concat([df_gma1,df_gma2], ignore_index=True)
df_GMA

In [None]:
# Remove exact duplicate rows from each source frame before they are merged.
df_FakeNews, df_Rappler, df_GMA = (
    frame.drop_duplicates() for frame in (df_FakeNews, df_Rappler, df_GMA)
)

In [None]:
# Show the first rows of each source frame, each preceded by its name.
display("df_FakeNews",df_FakeNews.head(),
        "df_Rappler", df_Rappler.head(), 
        "df_GMA", df_GMA.head())

In [None]:
# Stack the fake-news, Rappler and GMA frames into one dataset.
df_News = pd.concat([df_FakeNews,df_Rappler, df_GMA], ignore_index=True)
# Force 'label' to numeric. Values that cannot be parsed, and rows that have
# no label at all, become NaN and are then set to 0.0 — the code used for
# 'REAL' in the fake-news frame.
# NOTE(review): presumably the Rappler/GMA rows carry no label and are meant
# to count as real news — confirm, since unparseable labels also end up as 0.
df_News['label'] = pd.to_numeric(df_News['label'], errors='coerce').fillna(0.0)

# Drop rows that are exact duplicates across the merged dataset.
df_News = df_News.drop_duplicates()
df_News


In [None]:
# Preprocessing function to remove newlines and tabs
def remove_newline_tab(text):
    """Replace newline and tab markers in *text* with single spaces.

    Both forms are handled: the real control characters (newline, tab) and
    the two-character escaped sequences (a backslash followed by ``n`` or
    ``t``) that can appear in scraped or re-serialised text.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example NaN for a missing cell) is returned unchanged.

    Returns:
        The cleaned string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # Escaped sequences first, then the actual control characters.
    for marker in ('\\n', '\\t', '\n', '\t'):
        text = text.replace(marker, ' ')
    return text

# Clean the 'Content' column of df_News; non-string cells (e.g. NaN) are
# passed through remove_newline_tab unchanged.
df_News['Content'] = df_News['Content'].apply(remove_newline_tab)
df_News

In [None]:
# Preprocessing function to remove links from text
def remove_links(text):
    """Strip URLs from *text*.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example NaN for a missing cell) is returned unchanged, matching
            the behaviour of ``remove_newline_tab``.

    Returns:
        The string with every URL removed, or *text* itself when it is not
        a string.
    """
    if not isinstance(text, str):
        return text

    # Regular expression pattern to match URLs
    url_pattern = r'https?://\S+|www\.\S+'

    # Replace URLs with an empty string
    return re.sub(url_pattern, '', text)

# Apply preprocessing to 'Content' column in df_News
# astype(str) guarantees remove_links only receives strings; note that it
# also turns missing values (NaN) into the literal text 'nan'.
df_News['Content'] = df_News['Content'].astype(str).apply(remove_links)
df_News

In [None]:
def remove_images(text):
    """Strip inline base64-encoded images (``data:image/...`` URIs) from *text*.

    Any image subtype is matched (png, jpeg, gif, bmp, webp, svg+xml, ...),
    followed by ``;base64,`` and the run of base64 characters.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example NaN for a missing cell) is returned unchanged, matching
            the behaviour of ``remove_newline_tab``.

    Returns:
        The string with every embedded image removed, or *text* itself when
        it is not a string.
    """
    if not isinstance(text, str):
        return text

    # The subtype is left open instead of being a fixed list, so formats
    # such as webp or svg+xml are removed as well.
    base64_pattern = r"data:image/[A-Za-z0-9.+-]+;base64,[A-Za-z0-9+/=]+"

    return re.sub(base64_pattern, '', text)

# Strip embedded base64 images from every article body. astype(str) makes
# sure remove_images only receives strings (missing values become 'nan').
df_News['Content'] = df_News['Content'].astype(str).apply(remove_images)
df_News


In [None]:
# Save the cleaned data to a CSV file, without writing the DataFrame index.
df_News.to_csv('cleaned_data.csv', index=False)

## Exploratory Data Analysis (EDA)

In [None]:
# Reload the cleaned dataset saved at the end of the data-collection section.
# Note: by default read_csv parses empty fields and the text 'nan' as missing
# values (NaN), so text columns can contain NaN after this step.
df_cleaneddata = pd.read_csv('cleaned_data.csv')
df_cleaneddata

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the dataset.
df_cleaneddata.describe()

In [None]:
# Column names, dtypes, non-null counts and memory usage.
df_cleaneddata.info()

In [None]:
# Visualizing top 5 authors
# value_counts() already sorts by descending frequency, so head(5) yields the
# five most frequent authors. Both columns are named explicitly because
# reset_index() on a value_counts() result produces different column names on
# pandas < 2.0 ('index', 'Author') and pandas >= 2.0 ('Author', 'count').
d = (
    df_cleaneddata['Author']
    .value_counts()
    .head(5)
    .rename_axis('Author')
    .reset_index(name='Articles')
)

# NOTE(review): `sns` and `plt` are not imported in the notebook's import
# cell — confirm seaborn and matplotlib.pyplot are imported before this runs.
sns.set()
plt.figure(figsize=(15, 4))
sns.barplot(x='Author', y='Articles', data=d)
plt.xlabel("\nAuthors")
plt.ylabel("Number of Articles written")
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability
plt.show()

In [None]:
# Share of each class in the dataset. The labels were encoded as
# 0 = REAL and 1 = FAKE when the fake-news data was loaded, so the index is
# renamed accordingly.
y = df_cleaneddata.label
print('Ratio of real and fake news:')
y.value_counts(normalize=True).rename({0: 'real', 1: 'fake'})

In [None]:
# Discard the columns that are not needed for the rest of the analysis.
df_cleaneddata.drop(columns=["Unnamed: 0", "Link", "Author"], inplace=True)

In [None]:
# Horizontal bar chart of the number of missing values in each column.
df_cleaneddata.isna().sum().plot.barh()
plt.show()

In [None]:
# Number of missing values in each column.
df_cleaneddata.isnull().sum()

In [None]:
# Number of missing values in each column (this cell repeats the previous one).
df_cleaneddata.isnull().sum()

In [None]:
# Number of distinct values in each column.
df_cleaneddata.nunique()

In [None]:
# Combine title and body into one text field. Missing values are treated as
# empty strings so that a row lacking a title or a body does not turn into
# NaN (which would make len() below raise TypeError), and a space separator
# keeps the last word of the title from fusing with the first word of the body.
df_cleaneddata["title_Content"] = (
    df_cleaneddata["title"].fillna("").astype(str)
    + " "
    + df_cleaneddata["Content"].fillna("").astype(str)
)
# body_len is the number of non-space characters in the combined text.
df_cleaneddata["body_len"] = df_cleaneddata["title_Content"].apply(lambda x: len(x) - x.count(" "))
df_cleaneddata.head()

In [None]:
# Overlaid histograms of body_len for the two classes (label 1 drawn as
# "Fake", label 0 drawn as "Real").
# NOTE(review): the bins only span 0-200 characters, so longer texts fall
# outside the plotted range — confirm this range suits article-length text.
bins = np.linspace(0, 200, 40)

plt.figure(figsize=(12, 6))
for label_value, legend_name, colour in ((1, "Fake", "#FF5733"), (0, "Real", "#33FFB8")):
    lengths = df_cleaneddata.loc[df_cleaneddata["label"] == label_value, "body_len"]
    plt.hist(lengths, bins, alpha=0.5, label=legend_name, color=colour)
plt.legend(loc="upper left")
plt.show()

In [None]:
# Bar chart of how many articles carry each label value.
# NOTE(review): class_names is not used in this cell. If it is meant to map
# label values to names by position, its order ('fake', 'real') is the reverse
# of the 0 = REAL / 1 = FAKE encoding applied when the data was loaded — confirm.
class_names = ['fake', 'real'] 
label_count = df_cleaneddata.label.value_counts()
plt.figure(figsize=(8, 6))
sns.barplot(x=label_count.index, y=label_count)
plt.title('Distribution of Fake/Real News',fontsize =14)

In [None]:
# First few entries of the label Series taken from df_cleaneddata.label.
y.head()

In [None]:
# Word cloud of the article titles. Rows without a title are skipped and the
# remaining values are converted to str, because str.join() raises TypeError
# when it meets a non-string value such as NaN.
# NOTE(review): WordCloud is not imported in the notebook's import cell —
# confirm `from wordcloud import WordCloud` runs before this cell.
titles = ' '.join(df_cleaneddata['title'].dropna().astype(str))
wordcloud = WordCloud(
    background_color='white',
    max_words=300,
    width=800,
    height=400,
).generate(titles)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from nltk.tokenize import RegexpTokenizer
# Split each article body into word tokens (runs of letters, digits and
# underscores); punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')
# Missing bodies (NaN) are replaced by '' first so tokenize() always receives
# a string; such rows end up with an empty token list instead of raising.
df_cleaneddata['Content'] = (
    df_cleaneddata['Content'].fillna('').astype(str).apply(tokenizer.tokenize)
)
print(df_cleaneddata.head())

In [None]:
# Replace every token with its WordNet lemma (lemmatize() is called with its
# default part-of-speech setting).
lemmatizer = WordNetLemmatizer()
df_cleaneddata['Content'] = df_cleaneddata['Content'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)
print(df_cleaneddata.head())

In [None]:
# Glue each token list back into a single space-separated string.
df_cleaneddata['Content'] = df_cleaneddata['Content'].apply(' '.join)

In [None]:
# Word cloud built from the text of every article body.
all_words = ' '.join(df_cleaneddata['Content'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def to_vector_Tfidf(df, col):
    """Return the TF-IDF matrix for one text column of *df*.

    Args:
        df: DataFrame holding the documents.
        col: Name of the column in *df* that contains the text.

    Returns:
        The sparse document-term matrix produced by ``TfidfVectorizer``,
        with one row per row of *df* and at most 2000 feature columns.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(max_features=2000)
    # Fit and transform the frame that was passed in, so the result always
    # corresponds to the `df` argument rather than to a module-level frame.
    return vectorizer.fit_transform(df[col])

In [None]:
# Vectorize the cleaned article text and report the matrix dimensions
# (number of documents, number of TF-IDF features).
text_vector_tfidf = to_vector_Tfidf(df_cleaneddata, 'Content')
print("Shape of the tfidf vector: ", text_vector_tfidf.shape)

In [None]:
# Keep only the 'Content' column, as an independent (deep) copy; every other
# column, including 'label', is no longer part of df_cleaneddata after this.
df_cleaneddata = df_cleaneddata.loc[:, ['Content']].copy()
df_cleaneddata

In [None]:
# 'length' is the number of spaces plus one (a word-count estimate);
# 'LoR' is the number of characters in the text.
df_cleaneddata['LoR'] = df_cleaneddata['Content'].str.len()
df_cleaneddata.insert(
    df_cleaneddata.columns.get_loc('LoR'),
    'length',
    df_cleaneddata['Content'].str.count(' ').add(1),
    allow_duplicates=False,
) if 'length' not in df_cleaneddata.columns else df_cleaneddata.__setitem__(
    'length', df_cleaneddata['Content'].str.count(' ').add(1)
)
df_cleaneddata

In [None]:
# Mean of the 'length' column (spaces + 1 per article).
df_cleaneddata["length"].mean()

In [None]:
# Mean of the 'LoR' column (characters per article).
df_cleaneddata["LoR"].mean()

## Feature Engineering

## Modeling

## Conclusion

## References