IMPORT

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


PART 1

TASK 1

Cleaning the small dataset

In [32]:

# Load the small FakeNewsCorpus sample directly from its GitHub location.
small_data_set = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
)

def clean_text(text):
    """Normalise one piece of raw text ahead of tokenisation.

    The value is coerced to ``str`` and lowercased. URLs, e-mail addresses,
    dates (YYYY-MM-DD or DD/MM/YYYY) and standalone integers are then replaced,
    in that order, by the placeholders <URL>, <EMAIL>, <DATE> and <NUM>.
    Finally every character that is neither a word character nor whitespace
    is removed. That last step also strips the angle brackets, so the
    placeholders show up in the result as bare URL / EMAIL / DATE / NUM.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied strictly top to bottom: dates must
    # be handled before plain numbers, and punctuation must be stripped last.
    substitutions = (
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '<URL>'),
        (r'\S+@\S+', '<EMAIL>'),
        (r'(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})', '<DATE>'),
        (r'\b\d+\b', '<NUM>'),
        (r'[^\w\s]', ''),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Create a new column to store the cleaned text
small_data_set['content_clean'] = small_data_set['content'].apply(clean_text)

# Tokenization
small_data_set['content_tokens'] = small_data_set['content_clean'].apply(word_tokenize)

# Stemming
ps = PorterStemmer()
small_data_set['content_stemming'] = small_data_set['content_tokens'].apply(lambda x: [ps.stem(word) for word in x])

# Stop word removal.
# The tokens are already stemmed here, while NLTK's stop list holds unstemmed
# words. A stop word whose stem differs from its surface form (Porter stemming
# typically turns 'this' into 'thi' and 'was' into 'wa') would therefore never
# match and would survive the filter. Adding the stemmed form of every stop
# word to the set closes that gap without changing the order of the pipeline.
stop_words = set(stopwords.words('english'))
stop_words |= {ps.stem(word) for word in stop_words}
small_data_set['clean_content'] = small_data_set['content_stemming'].apply(lambda x: [word for word in x if word not in stop_words])


Counting the number of unique words before, during and after preprocessing

In [None]:
# Raw data: vocabulary of the lowercased, tokenised original text
unique_words_content = set()
for text in small_data_set['content']:
    # str() keeps a non-string cell (e.g. a missing value) from crashing .lower()
    tokens = word_tokenize(str(text).lower())
    unique_words_content.update(tokens)
num_unique_words_content = len(unique_words_content)
print("Number of unique words in 'content':", num_unique_words_content)

# After stemming
unique_words_stemmed_content = set()
for tokens in small_data_set['content_stemming']:
    unique_words_stemmed_content.update(tokens)
num_unique_words_stemmed_content = len(unique_words_stemmed_content)
print("Number of unique words after stemming:", num_unique_words_stemmed_content)

# Calculate number of unique words in 'clean_content' (after stop word removal)
unique_words_clean_content = set()
for tokens in small_data_set['clean_content']:
    unique_words_clean_content.update(tokens)
num_unique_words_clean_content = len(unique_words_clean_content)
print("Number of unique words in 'clean_content':", num_unique_words_clean_content)

# Reduction rate = share of the vocabulary removed by a step, i.e.
# (size before - size after) / size before. The plain difference alone is an
# absolute count, not a rate. An empty "before" vocabulary yields a rate of 0.
red_raw_to_stem = 0.0
if num_unique_words_content:
    red_raw_to_stem = (num_unique_words_content - num_unique_words_stemmed_content) / num_unique_words_content
print("Reduction rate from raw data to stemming:", f"{red_raw_to_stem:.2%}")

red_stem_to_stop = 0.0
if num_unique_words_stemmed_content:
    red_stem_to_stop = (num_unique_words_stemmed_content - num_unique_words_clean_content) / num_unique_words_stemmed_content
print("Reduction rate from stemming to stopwordremoval:", f"{red_stem_to_stop:.2%}")

TASK 2

Exploration of the big dataset

In [37]:

# Read the large CSV piece by piece (10,000 rows per piece) and then stitch
# the pieces back together into one DataFrame.
chunksize = 10000
chunks = list(pd.read_csv('995,000_rows.csv', chunksize=chunksize))

fake_news = pd.concat(chunks)



In [None]:
# Count the rows whose 'type' label is missing.
missing_values = fake_news['type'].isna().sum()
print('Number of missing values: ', missing_values)

In [None]:

# Bar chart of how often each 'type' label occurs.
type_counts = fake_news['type'].value_counts()
ax = type_counts.plot.bar()

# Axis labels and title in one call.
ax.set(xlabel='Type', ylabel='Percentage', title='Distribution of types')

# Show the y ticks as a share of all rows: xmax is the count that maps to 100%.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=len(fake_news)))

plt.show()


Classifying the news into categories and plotting the distribution of the categories

In [None]:
# Classify the news into reliable, fake and other
def classify_news(type):
    """Map a corpus ``type`` label onto 'reliable', 'fake' or 'other'.

    'reliable', 'political' and 'clickbait' count as reliable; 'fake', 'hate',
    'conspiracy', 'junksci', 'state' and 'bias' count as fake. Every other
    value, including a missing label, is returned as 'other'.
    """
    label_groups = (
        ('reliable', ('reliable', 'political', 'clickbait')),
        ('fake', ('fake', 'hate', 'conspiracy', 'junksci', 'state', 'bias')),
    )
    for category, members in label_groups:
        if type in members:
            return category
    return 'other'
    
fake_news['news_category'] = fake_news['type'].apply(classify_news)

# Absolute number of articles per category ...
category_counts = fake_news['news_category'].value_counts()
print(category_counts)

# ... and the same distribution as fractions of the whole dataset.
print(fake_news['news_category'].value_counts(normalize=True))

# Bar chart of the categories.
ax = category_counts.plot.bar()
ax.set(xlabel='Category', ylabel='Percentage', title='Distribution of news categories')

# Show the y ticks as a share of all rows: xmax is the count that maps to 100%.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=len(fake_news)))

plt.show()

In [None]:
# Which original 'type' labels ended up in the 'other' category?
other_category_fakenews = fake_news.loc[fake_news['news_category'] == 'other']
print(other_category_fakenews['type'].value_counts())

In [None]:
# Investigate the keywords: how many rows have no value in 'keywords'?
print(fake_news['keywords'].isna().sum())

In [None]:
# Patterns for the three kinds of token counted below.
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
email_pattern = re.compile(r'\S+@\S+')
num_pattern = re.compile(r'\b\d+\b')

# Count the number of URLs in the 'content' column
url_count = fake_news['content'].str.count(url_pattern).sum()
print("Number of URLs in the 'content' column:", url_count)

# Count the number of e-mail addresses in the 'content' column
email_count = fake_news['content'].str.count(email_pattern).sum()
print("Number of emails in the 'content' column:", email_count)

# Count the number of standalone numbers in the 'content' column
num_count = fake_news['content'].str.count(num_pattern).sum()
print("Number of numbers in the 'content' column:", num_count)

Finding the 100 most used words in the dataset before and after cleaning

In [None]:

def clean_text(text):
    """Return a lowercased, placeholder-substituted, punctuation-free copy of ``text``.

    Steps, in order: coerce to ``str`` and lowercase; replace URLs with <URL>,
    e-mail addresses with <EMAIL>, dates (YYYY-MM-DD or DD/MM/YYYY) with <DATE>
    and standalone integers with <NUM>; then delete every character that is
    neither a word character nor whitespace. The final step removes the angle
    brackets too, so the placeholders remain as bare URL / EMAIL / DATE / NUM.
    """
    lowered = str(text).lower()

    # Each stage feeds the next; dates are replaced before plain numbers so
    # that their digits are not turned into <NUM> first.
    without_urls = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '<URL>',
        lowered,
    )
    without_emails = re.sub(r'\S+@\S+', '<EMAIL>', without_urls)
    without_dates = re.sub(r'(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})', '<DATE>', without_emails)
    without_numbers = re.sub(r'\b\d+\b', '<NUM>', without_dates)

    # Remove punctuation and other non-word characters.
    return re.sub(r'[^\w\s]', '', without_numbers)

# Create a new column to store the cleaned text
fake_news['content_clean'] = fake_news['content'].apply(clean_text)

# Tokenization
fake_news['content_tokens'] = fake_news['content_clean'].apply(word_tokenize)

# Stemming
ps = PorterStemmer()
fake_news['content_stemming'] = fake_news['content_tokens'].apply(lambda x: [ps.stem(word) for word in x])

# Stop word removal.
# The tokens are already stemmed here, while NLTK's stop list holds unstemmed
# words. A stop word whose stem differs from its surface form (Porter stemming
# typically turns 'this' into 'thi' and 'was' into 'wa') would therefore never
# match and would survive the filter. Adding the stemmed form of every stop
# word to the set closes that gap without changing the order of the pipeline.
stop_words = set(stopwords.words('english'))
stop_words |= {ps.stem(word) for word in stop_words}
fake_news['clean_content'] = fake_news['content_stemming'].apply(lambda x: [word for word in x if word not in stop_words])

# Persist the columns needed later. 'clean_content' holds Python lists, which
# the CSV writer stores as their text form (e.g. "['foo', 'bar']").
fake_news[['id','domain', 'authors', 'type', 'clean_content']].to_csv('cleaned_fake_news.csv', index=False)

In [None]:
from collections import Counter
# Find the 100 most frequent whitespace-separated words in the original content.
word_counter = Counter()

for raw_text in fake_news['content']:
    # Strings are split directly; floats (e.g. NaN) are turned into text
    # first, so they contribute their string form as a word.
    if isinstance(raw_text, (str, float)):
        raw_text = str(raw_text).split()
    word_counter.update(raw_text)

# The 100 most common words, as (word, count) pairs.
most_common_words_100 = word_counter.most_common(100)

# One line per word.
for word, count in most_common_words_100:
    print(f"'{word}': {count}")

In [None]:
import ast
from collections import Counter
fake_news = pd.read_csv('cleaned_fake_news.csv')

word_counter = Counter()

# 'clean_content' was written to the CSV as the text form of a Python list
# (e.g. "['foo', 'bar']"). Splitting that text on whitespace would count
# fragments such as "['foo'," as words, so each cell is parsed back into a
# real list of tokens before counting.
for tokens in fake_news['clean_content']:
    if isinstance(tokens, str):
        try:
            parsed = ast.literal_eval(tokens)
        except (ValueError, SyntaxError):
            parsed = None
        # Fall back to plain whitespace splitting if the cell is not a list literal.
        tokens = parsed if isinstance(parsed, list) else tokens.split()
    elif not isinstance(tokens, (list, tuple)):
        # Anything else (e.g. a missing value) carries no tokens to count.
        continue
    word_counter.update(tokens)

# Get the 100 most common words
most_common_words_after_cleaning = word_counter.most_common(100)

# Print the 100 most common words
for word, count in most_common_words_after_cleaning:
    print(f"'{word}': {count}")

Plotting the 100 most used words from before and after cleaning

In [None]:
# Plot the 100 most common words of the ORIGINAL (uncleaned) content.
# `word_counter` has been re-used for the cleaned tokens by this point, so the
# counts are taken from `most_common_words_100`, which was computed on the raw
# 'content' column. Reading from `word_counter` here would plot the cleaned
# counts a second time.
x_bar = [word for word, _ in most_common_words_100]
y_bar = [count for _, count in most_common_words_100]
plt.figure(figsize=(10, 20))
plt.barh(x_bar, y_bar)

# Set the x-label, y-label, and title
plt.xlabel('Word count')
plt.ylabel('Word')
plt.title('Top 100 Most Common Words')

plt.show()

In [None]:
# Horizontal bar chart of the 100 most common words after cleaning.
top_cleaned = most_common_words_after_cleaning[:100]
x_bar = [word for word, _ in top_cleaned]
y_bar = [count for _, count in top_cleaned]

plt.figure(figsize=(10, 20))
plt.barh(x_bar, y_bar)

plt.xlabel('Word count')
plt.ylabel('Word')
plt.title('Top 100 Most Common Words')

plt.show()

PART 2

classification of data

In [None]:
# Load the preprocessed corpus written by the cleaning step.
data = pd.read_csv(filepath_or_buffer='cleaned_fake_news.csv')

# Working frame for the classification part.
df = pd.DataFrame(data=data)

# Classify the news into reliable, fake and other
def classify_news(type):
    """Collapse a corpus ``type`` label into 'reliable', 'fake' or 'other'.

    Labels 'reliable', 'political' and 'clickbait' give 'reliable'; labels
    'fake', 'hate', 'conspiracy', 'junksci', 'state' and 'bias' give 'fake';
    anything else (including a missing label) gives 'other'.
    """
    if type in ('reliable', 'political', 'clickbait'):
        return 'reliable'
    if type in ('fake', 'hate', 'conspiracy', 'junksci', 'state', 'bias'):
        return 'fake'
    return 'other'
    
df['news_category'] = df['type'].apply(classify_news)

# Category sizes before filtering.
print(df['news_category'].value_counts())

# Keep only the two classes the models are trained on.
df = df.loc[df['news_category'] != 'other']

# Category sizes after filtering.
print(df['news_category'].value_counts())

Transforming and splitting data

In [56]:
# Convert the text to TF-IDF features.
# The raw text is split FIRST and the vectorizer is fitted on the training
# portion only. Fitting it on the whole corpus before splitting would let
# vocabulary and document-frequency statistics from the validation and test
# articles leak into training. Only the per-split matrices are built.
vectorizer = TfidfVectorizer()

y = df['news_category']

# 1 = reliable, 0 = fake
y_binary = y.apply(lambda x: 1 if x == 'reliable' else 0)

# Split into training / validation / test = 80% / 10% / 10%
# (1/9 of the remaining 90% equals 10% of the whole).
text_train_val, text_test, y_train_val, y_test = train_test_split(df['clean_content'], y_binary, test_size=0.1, random_state=0)
text_train, text_val, y_train, y_val = train_test_split(text_train_val, y_train_val, test_size=1/9, random_state=0)

# Learn vocabulary and IDF weights from the training text, then apply the
# fitted transformation unchanged to the held-out splits.
X_train = vectorizer.fit_transform(text_train)
X_val = vectorizer.transform(text_val)
X_test = vectorizer.transform(text_test)

Simple model

In [None]:
# Baseline: logistic regression, trained on the training split.
model_log = LogisticRegression(max_iter=1000)
model_log.fit(X_train, y_train)

# Predictions for the validation split.
y_val_pred = model_log.predict(X_val)

# Validation accuracy.
acc_val = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print('Validation accuracy:', acc_val)

# Confusion matrix, printed and plotted.
con_matrix = confusion_matrix(y_true=y_val, y_pred=y_val_pred)
print(con_matrix)

ConfusionMatrixDisplay(confusion_matrix=con_matrix, display_labels=model_log.classes_).plot(values_format='d')

# Per-class precision / recall / F1.
class_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(class_report)

# Validation F1, kept for the results table.
f1_score_log = f1_score(y_true=y_val, y_pred=y_val_pred)

PART 3

Advanced model

In [None]:
# Multinomial naive Bayes, trained on the training split.
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

# Predictions for the validation split (this rebinds `y_val_pred`).
y_val_pred = model_nb.predict(X_val)

# Validation accuracy.
acc_val_nb = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print('Validation accuracy:', acc_val_nb)

# Confusion matrix, printed and plotted.
con_matrix_nb = confusion_matrix(y_true=y_val, y_pred=y_val_pred)
print(con_matrix_nb)
ConfusionMatrixDisplay(confusion_matrix=con_matrix_nb, display_labels=model_nb.classes_).plot(values_format='d')

# Per-class precision / recall / F1.
class_report_nb = classification_report(y_true=y_val, y_pred=y_val_pred)
print(class_report_nb)

# Validation F1, kept for the results table.
f1_score_nb = f1_score(y_true=y_val, y_pred=y_val_pred)

In [None]:
# We try a support vector machine model
model_svm = LinearSVC()

# Train the model
model_svm.fit(X_train, y_train)

# Predict the labels
y_val_pred_svm = model_svm.predict(X_val)

# Print the accuracy
acc_val_svm = accuracy_score(y_val, y_val_pred_svm)
print('Validation accuracy:', acc_val_svm)

# Print the classification report
class_report_svm = classification_report(y_val, y_val_pred_svm)
print(class_report_svm)

# Print and plot the confusion matrix
conf_matrix_svm = confusion_matrix(y_val, y_val_pred_svm)
print(conf_matrix_svm)

ConfusionMatrixDisplay(conf_matrix_svm, display_labels=model_svm.classes_).plot(values_format='d')

# The F1 score must come from the SVM's own predictions. `y_val_pred` is the
# prediction variable of the other model cells, so using it here would report
# another model's score under the SVM's name.
f1_score_svm = f1_score(y_val, y_val_pred_svm)

Evaluating on the test set 

In [69]:
# Evaluate the three fitted models on the held-out test split, one model at a
# time: predictions, accuracy, F1 score and confusion matrix.

# Logistic regression
y_test_pred_log = model_log.predict(X_test)
acc_test_log = accuracy_score(y_true=y_test, y_pred=y_test_pred_log)
f_1_test_log = f1_score(y_true=y_test, y_pred=y_test_pred_log)
conf_matrix_test_log = confusion_matrix(y_true=y_test, y_pred=y_test_pred_log)

# Naive Bayes
y_test_pred_nb = model_nb.predict(X_test)
acc_test_nb = accuracy_score(y_true=y_test, y_pred=y_test_pred_nb)
f_1_test_nb = f1_score(y_true=y_test, y_pred=y_test_pred_nb)
conf_matrix_test_nb = confusion_matrix(y_true=y_test, y_pred=y_test_pred_nb)

# Support vector machine
y_test_pred_svm = model_svm.predict(X_test)
acc_test_svm = accuracy_score(y_true=y_test, y_pred=y_test_pred_svm)
f_1_test_svm = f1_score(y_true=y_test, y_pred=y_test_pred_svm)
conf_matrix_test_svm = confusion_matrix(y_true=y_test, y_pred=y_test_pred_svm)

In [None]:
# Summary table: one row per model, validation and test metrics side by side.
results = pd.DataFrame(data={
    'Model': ['Logistic Regression', 'Naive Bayes', 'Support Vector Machine'],
    'Validation Accuracy': [acc_val, acc_val_nb, acc_val_svm],
    'Test Accuracy': [acc_test_log, acc_test_nb, acc_test_svm],
    'F1 Score': [f1_score_log, f1_score_nb, f1_score_svm],
    'F1 Score Test': [f_1_test_log, f_1_test_nb, f_1_test_svm],
})

# Render the table with the notebook's rich display.
from IPython.display import display
display(results)

# One confusion-matrix plot per model, each titled with the model's name.
for test_matrix, fitted_model, plot_title in (
    (conf_matrix_test_log, model_log, 'Logistic Regression'),
    (conf_matrix_test_nb, model_nb, 'Naive Bayes'),
    (conf_matrix_test_svm, model_svm, 'Support Vector Machine'),
):
    ConfusionMatrixDisplay(test_matrix, display_labels=fitted_model.classes_).plot(values_format='d')
    plt.title(plot_title)

PART 4

We start by cleaning the LIAR test set in the same way as the fake_news dataset, since we have observed that this gives better results.

In [None]:
# The TSV file has no header row. With the default header handling, pandas
# used the first record as column names (the statement text and the label
# 'true') and dropped that record from the data. Reading with header=None
# keeps every record. Columns are then addressed by position: column 1 is the
# label and column 2 the statement text (layout per the LIAR dataset README).
# The label column keeps the name 'true' because the cells that consume this
# frame and the exported CSV read it under that name.
Liar = pd.read_csv('test.tsv', sep='\t', header=None).rename(columns={1: 'true', 2: 'statement'})
print(Liar.columns)

def clean_text(text):
    """Normalise one piece of raw text ahead of tokenisation.

    The value is coerced to ``str`` and lowercased. URLs, e-mail addresses,
    dates (YYYY-MM-DD or DD/MM/YYYY) and standalone integers are replaced, in
    that order, by <URL>, <EMAIL>, <DATE> and <NUM>. Finally every character
    that is neither a word character nor whitespace is removed, which also
    strips the angle brackets of the placeholders.
    """
    # Convert to lowercase
    text = str(text)
    text = text.lower()

    # Replace URLs
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = url_pattern.sub('<URL>', text)

    # Replace emails
    email_pattern = re.compile(r'\S+@\S+')
    text = email_pattern.sub('<EMAIL>', text)

    # Replace dates (YYYY-MM-DD and DD/MM/YYYY formats)
    date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})')
    text = date_pattern.sub('<DATE>', text)

    # Replace numbers
    num_pattern = re.compile(r'\b\d+\b')
    text = num_pattern.sub('<NUM>', text)

    # Remove punctuation and non-word characters
    text = re.sub(r'[^\w\s]', '', text)

    return text

# Create a new column to store the cleaned text
Liar['content_clean'] = Liar['statement'].apply(clean_text)

# Tokenization
Liar['content_tokens'] = Liar['content_clean'].apply(word_tokenize)

# Stemming
ps = PorterStemmer()
Liar['content_stemming'] = Liar['content_tokens'].apply(lambda x: [ps.stem(word) for word in x])

# Stop word removal.
# The tokens are already stemmed here, while NLTK's stop list holds unstemmed
# words. A stop word whose stem differs from its surface form (Porter stemming
# typically turns 'this' into 'thi' and 'was' into 'wa') would never match, so
# the stemmed form of every stop word is added to the set.
stop_words = set(stopwords.words('english'))
stop_words |= {ps.stem(word) for word in stop_words}
Liar['clean_content'] = Liar['content_stemming'].apply(lambda x: [word for word in x if word not in stop_words])

# Export the label (under the column name 'true') together with the cleaned tokens.
Liar[[ 'true', 'clean_content']].to_csv('cleaned_Liar_test.csv', index=False)

In [None]:
from collections import Counter

# Frequency of every token in the cleaned LIAR statements.
word_counter = Counter()

for token_list in Liar['clean_content']:
    # A plain string is split on whitespace; anything else is counted as given.
    word_counter.update(token_list.split() if isinstance(token_list, str) else token_list)

# The 100 most common words, as (word, count) pairs.
most_common_words_after_cleaning = word_counter.most_common(100)

# One line per word.
for word, count in most_common_words_after_cleaning:
    print(f"'{word}': {count}")

In [None]:
# Read the TSV file.
# It has no header row. With the default header handling, pandas used the
# first record as column names and dropped it from the data, which is why the
# label column ended up being called 'true'. Reading with header=None keeps
# every record. The label column (position 1, per the LIAR dataset README) is
# renamed to 'true' so the code below addresses it exactly as before.
Liar = pd.read_csv('test.tsv', sep='\t', header=None).rename(columns={1: 'true'})

# Define the function to classify news.
# Note: this rebinds the name `classify_news` with a LIAR-specific label mapping.
def classify_news(type):
    """Map a LIAR label onto 'reliable', 'fake' or 'other'.

    'true' gives 'reliable'; 'false' and 'pants-fire' give 'fake'; every other
    label gives 'other'.
    """
    reliable_types = ['true']
    fake_news_types = ['false', 'pants-fire']

    if type in reliable_types:
       return 'reliable'
    elif type in fake_news_types:
       return 'fake'
    else:
       return 'other'

# Apply the classification function to create a new column 'news_category'
Liar['news_category'] = Liar['true'].apply(classify_news)

# Print the distribution of news categories
print(Liar['news_category'].value_counts())

# Drop the statements that are neither clearly reliable nor clearly fake
Liar = Liar[Liar.news_category != 'other']

print(Liar['news_category'].value_counts())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
# Load the cleaned LIAR test set produced by the preprocessing cell.
liar = pd.read_csv(filepath_or_buffer='cleaned_Liar_test.csv')

# Define the function to classify news
def classify_news_liar(type):
    """Map a LIAR label onto 'reliable', 'fake' or 'other'.

    'true' gives 'reliable'; 'false' and 'pants-fire' give 'fake'; every other
    value (including a missing label) gives 'other'.
    """
    if type in ('true',):
        return 'reliable'
    if type in ('false', 'pants-fire'):
        return 'fake'
    return 'other'
    
# Label every LIAR statement as reliable / fake / other.
liar['news_category'] = liar['true'].apply(classify_news_liar)

# Distribution of the categories before filtering.
print(liar['news_category'].value_counts())

# Keep only the two classes the models were trained on.
liar = liar.loc[liar['news_category'] != 'other']

# Features come from the vectorizer fitted earlier in the notebook;
# targets are encoded as 1 = reliable, 0 = fake.
X_liar = vectorizer.transform(liar['clean_content'])
y_liar = liar['news_category']
y_liar_binary = y_liar.apply(lambda label: 1 if label == 'reliable' else 0)

# Logistic regression: predictions, accuracy, F1 and confusion matrix.
y_liar_pred_log = model_log.predict(X_liar)
acc_liar_log = accuracy_score(y_true=y_liar_binary, y_pred=y_liar_pred_log)
f1_liar_log = f1_score(y_true=y_liar_binary, y_pred=y_liar_pred_log)
conf_matrix_liar_log = confusion_matrix(y_true=y_liar_binary, y_pred=y_liar_pred_log)

# Support vector machine: predictions, accuracy, F1 and confusion matrix.
y_liar_pred_svm = model_svm.predict(X_liar)
acc_liar_svm = accuracy_score(y_true=y_liar_binary, y_pred=y_liar_pred_svm)
f1_liar_svm = f1_score(y_true=y_liar_binary, y_pred=y_liar_pred_svm)
conf_matrix_liar_svm = confusion_matrix(y_true=y_liar_binary, y_pred=y_liar_pred_svm)

# Summary table, one row per model.
results_liar = pd.DataFrame(data={
    'Model': ['Logistic Regression', 'Support Vector Machine'],
    'Accuracy': [acc_liar_log, acc_liar_svm],
    'F1 Score': [f1_liar_log, f1_liar_svm],
})
display(results_liar)

# One confusion-matrix plot per model.
for liar_matrix, fitted_model, plot_title in (
    (conf_matrix_liar_log, model_log, 'LIAR Logistic Regression'),
    (conf_matrix_liar_svm, model_svm, 'LIAR Support Vector Machine'),
):
    ConfusionMatrixDisplay(liar_matrix, display_labels=fitted_model.classes_).plot(values_format='d')
    plt.title(plot_title)

