#### 1. Importing Libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

#### 2. Loading the Dataset

We load the dataset containing text data and labels. The dataset has two columns:

- `label`: Indicates whether the news article is real (1) or fake (0).
- `text`: Contains the news article text.


In [2]:
# Read the tab-separated file; column names are passed explicitly via `names`.
data = pd.read_csv(
    'training_data_lowercase.csv',
    sep='\t',
    names=['label', 'text'],
)

# Quick look at the dimensions and the first ten rows
print(data.shape)
print(data.head(10))

(34152, 2)
   label                                               text
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...
5      0  racist alabama cops brutalize black boy while ...
6      0                          fresh off the golf course
7      0  trump said some insanely racist stuff inside t...
8      0   former cia director slams trump over un bullying
9      0  brand-new pro-trump ad features so much a** ki...


#### 3. Divide the data into training and test


In [29]:
# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
# Each part is copied explicitly: the preprocessing cells below add new columns
# to both frames, and working on independent copies avoids assigning into
# objects that pandas may still treat as derived from `data`.
data_train, data_val = (
    part.copy()
    for part in train_test_split(data, test_size=0.2, random_state=44)
)

#### 4. Data Preprocessing

We apply several preprocessing steps to clean and prepare the text data for modeling. This includes converting text to lowercase, removing punctuation, digits, and stopwords, and normalizing accented characters.


In [None]:
from utils import remove_and_convert

# Run the project helper over the raw text of both splits and store the result
# in a new 'remove_and_convert' column
for split_df in (data_train, data_val):
    split_df['remove_and_convert'] = split_df['text'].apply(remove_and_convert)

# Preview the first rows to verify the cleaning
data_train.head(10)

4.1. Remove Stopwords

Common English stopwords are removed from the text to focus on important words that carry more meaning.


In [31]:
from utils import remove_stopwords

# Feed the cleaned text of both splits through the stopword helper and keep
# the output in a new 'remove_stopwords' column
for split_df in (data_train, data_val):
    split_df['remove_stopwords'] = split_df['remove_and_convert'].apply(remove_stopwords)

4.2. Tokenization

We tokenize the text by splitting each sentence into words. This step converts the raw text into a list of words (tokens), which will allow further text processing like stemming or lemmatization.


In [None]:
from utils import tokenizer

# Tokenize the stopword-filtered text of both splits into a new 'tokenize' column
for split_df in (data_train, data_val):
    split_df['tokenize'] = split_df['remove_stopwords'].apply(tokenizer)

# Preview the first rows of the training split
data_train.head(10)

4.3. Stemming

We apply stemming to reduce words to their root forms. This helps standardize the text.


In [None]:
from utils import stem_words

# Stem the tokens of both splits and store the result in a new 'stem_words' column
for split_df in (data_train, data_val):
    split_df['stem_words'] = split_df['tokenize'].apply(stem_words)

# Preview the first rows of the training split
data_train.head(20)

4.4. Lemmatization


In [None]:
from utils import lemmatize_words

# Lemmatize the tokens of both splits and store the result in a new
# 'lemmatize_words' column (computed from 'tokenize', not from 'stem_words')
for split_df in (data_train, data_val):
    split_df['lemmatize_words'] = split_df['tokenize'].apply(lemmatize_words)

# Preview the first rows of the training split
data_train.head(20)

#### 5. Exploratory Analysis

Word Frequency Analysis in Real and Fake News

In this section, we analyze the most common words in real and fake news articles. We first split the dataset into real and fake news based on their labels, then count the frequency of words in each category. Finally, we display the top 20 most common words for both real and fake news.


In [None]:
from collections import Counter

# Split the training rows by class. Per the dataset description in section 2,
# label 1 marks real news and label 0 marks fake news.
# NOTE(review): the joins below assume every 'lemmatize_words' entry is a
# string - confirm against utils.lemmatize_words.
real_news = data_train.loc[data_train['label'] == 1, 'lemmatize_words']
fake_news = data_train.loc[data_train['label'] == 0, 'lemmatize_words']

# Count the frequency of each word in the real news
real_words = Counter(' '.join(real_news).split())

# Count the frequency of each word in the fake news
fake_words = Counter(' '.join(fake_news).split())

# Get the top 20 words in the real news
top_real_words = real_words.most_common(20)

# Get the top 20 words in the fake news
top_fake_words = fake_words.most_common(20)

# Print the results
print("Top 20 words in real news:")
for word, count in top_real_words:
    print(f"{word}: {count}")

print("\nTop 20 words in fake news:")
for word, count in top_fake_words:
    print(f"{word}: {count}")

#### 6. Feature Engineering

6.1 Vectorization

Convert the preprocessed text into numeric vectors so that it can be fed to a classifier.


In [None]:
# CountVectorizer and TfidfVectorizer expect one string per document, so the
# chosen preprocessed column is converted to a plain list of strings.
# 'stem_words' is the second-to-last column once every preprocessing cell has
# run; naming it explicitly keeps the selection stable if columns are added or
# cells are re-run. Change TEXT_COLUMN to vectorize another column
# (e.g. 'lemmatize_words').
TEXT_COLUMN = 'stem_words'

# Training documents (str() keeps non-string entries usable by the vectorizers)
headlines = [str(doc) for doc in data_train[TEXT_COLUMN]]

# Validation documents, built the same way
test_transform = [str(doc) for doc in data_val[TEXT_COLUMN]]


print (headlines [1])
print (test_transform [1])

6.2 Bag of Words with CountVectorizer


In [None]:
# Bag of Words: count unigrams and bigrams (ngram_range=(1, 2))
bow_countvector = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary on the training documents only, then reuse it to
# encode the validation documents
bow_traindataset = bow_countvector.fit_transform(raw_documents=headlines)
bow_test_dataset = bow_countvector.transform(raw_documents=test_transform)

# Shape of the training matrix
print(bow_traindataset.shape)

6.3 TF-IDF


In [None]:
# TF-IDF weighting over unigrams and bigrams (ngram_range=(1, 2))
tfidfvector = TfidfVectorizer(ngram_range=(1, 2))

# Fit on the training documents only, then encode the validation documents
# with the same fitted vectorizer
tdidf_traindataset = tfidfvector.fit_transform(raw_documents=headlines)
tdidf_test_dataset = tfidfvector.transform(raw_documents=test_transform)

# Shape of the validation matrix
print(tdidf_test_dataset.shape)

#### 7. Implement Classifiers


7.1 Logistic Regression

- **Logistic Regression**: A linear model used for binary classification tasks.


In [None]:
# Logistic Regression on the bag-of-words features
logreg = LogisticRegression(
    C=6,
    penalty='l2',
    max_iter=150,
    tol=1e-5,
    solver='lbfgs',
)
logreg.fit(X=bow_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the bag-of-words features
bow_predictions = logreg.predict(X=bow_test_dataset)
print('Logistic Regression with BOW')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=bow_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=bow_predictions)
print(f'Accuracy score for Logistic Regression with BOW: {score}')
report = classification_report(y_true=data_val['label'], y_pred=bow_predictions)
print(report)

In [None]:
# Stratified 10-fold splitter: shuffled with a fixed seed, and stratified so
# each fold keeps the class distribution of the training labels
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Cross-validated accuracy of the logistic regression on the bag-of-words features
scores = cross_val_score(
    estimator=logreg,
    X=bow_traindataset,
    y=data_train['label'],
    cv=cv,
    scoring='accuracy',
)

# Report the per-fold scores and their average
print(f'Accuracy scores for each of the 10 cross-validation folds: {scores}')
print(f'Mean accuracy score: {scores.mean()}')

In [None]:
# Re-fit the same Logistic Regression estimator, this time on the TF-IDF features
logreg.fit(X=tdidf_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the TF-IDF features
tfidf_predictions = logreg.predict(X=tdidf_test_dataset)
print('Logistic Regression with TF-IDF')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=tfidf_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=tfidf_predictions)
print(f' Accuracy score for Logistic Regression with TF-IDF: {score}')
report = classification_report(y_true=data_val['label'], y_pred=tfidf_predictions)
print(report)

7.2. Naive Bayes Classifier

1. CountVectorizer
2. TF-IDF


In [None]:
# Multinomial Naive Bayes on the bag-of-words features
naive = MultinomialNB()
naive.fit(X=bow_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the bag-of-words features
bow_predictions = naive.predict(X=bow_test_dataset)
print('Naive Bayes with BOW')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=bow_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=bow_predictions)
print(f'Accuracy score for Naive Bayes with BOW: {score}')
report = classification_report(y_true=data_val['label'], y_pred=bow_predictions)
print(report)

In [None]:
# Re-fit the same Naive Bayes estimator on the TF-IDF features
naive.fit(X=tdidf_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the TF-IDF features
tfidf_predictions = naive.predict(X=tdidf_test_dataset)
print('Naive Bayes with TF-IDF')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=tfidf_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=tfidf_predictions)
print(f'Accuracy score for Naive Bayes with TF-IDF: {score}')
report = classification_report(y_true=data_val['label'], y_pred=tfidf_predictions)
print(report)

7.3. Random Forest Classifier

1. CountVectorizer
2. TF-IDF


In [48]:
# Fit the Random Forest model with bow
# random_state makes the forest reproducible between runs (the split and the
# cross-validation above are seeded too); n_jobs=-1 builds the trees on all
# available CPU cores, which matters on the wide unigram+bigram matrix.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf.fit(bow_traindataset,data_train['label'])

KeyboardInterrupt: 

In [None]:
# Predict the validation labels from the bag-of-words features
bow_predictions = rf.predict(X=bow_test_dataset)
print('Random Forest with BOW')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=bow_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=bow_predictions)
print(f' Accuracy score for Random Forest with BOW: {score}')
report = classification_report(y_true=data_val['label'], y_pred=bow_predictions)
print(report)

In [None]:
# Re-fit the same Random Forest estimator on the TF-IDF features
rf.fit(X=tdidf_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the TF-IDF features
tfidf_predictions = rf.predict(X=tdidf_test_dataset)
print('Random Forest with TF-IDF')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=tfidf_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=tfidf_predictions)
print(f' Accuracy score for Random Forest with TF-IDF: {score}')
report = classification_report(y_true=data_val['label'], y_pred=tfidf_predictions)
print(report)

7.4. Decision Tree Classifier

1. CountVectorizer
2. TF-IDF


In [None]:
# Fit the Decision Tree model with bow
# random_state pins the tree's internal randomness so the reported scores are
# reproducible, consistent with the seeded split and cross-validation.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(bow_traindataset,data_train['label'])

In [None]:
# Predict the validation labels from the bag-of-words features
bow_predictions = dt.predict(X=bow_test_dataset)
print('Decision Tree with BOW')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=bow_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=bow_predictions)
print(f'Accuracy score for Decision Tree with BOW: {score}')
report = classification_report(y_true=data_val['label'], y_pred=bow_predictions)
print(report)

In [None]:
# Re-fit the same Decision Tree estimator on the TF-IDF features
dt.fit(X=tdidf_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the TF-IDF features
tfidf_predictions = dt.predict(X=tdidf_test_dataset)
print('Decision Tree with TF-IDF')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=tfidf_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=tfidf_predictions)
print(f'Accuracy score for Decision Tree with TF-IDF: {score}')
report = classification_report(y_true=data_val['label'], y_pred=tfidf_predictions)
print(report)

7.5. KNN Classifier

1. CountVectorizer
2. TF-IDF


In [None]:
# 3-nearest-neighbours classifier on the bag-of-words features
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=bow_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the bag-of-words features
bow_predictions = knn.predict(X=bow_test_dataset)
print('KNN with BOW')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=bow_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=bow_predictions)
print(f' Accuracy score for KNN with BOW: {score}')
report = classification_report(y_true=data_val['label'], y_pred=bow_predictions)
print(report)

In [None]:
# Re-fit the same KNN estimator on the TF-IDF features
knn.fit(X=tdidf_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the TF-IDF features
tfidf_predictions = knn.predict(X=tdidf_test_dataset)
print('KNN with TF-IDF')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=tfidf_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=tfidf_predictions)
print(f' Accuracy score for KNN with TF-IDF: {score}')
report = classification_report(y_true=data_val['label'], y_pred=tfidf_predictions)
print(report)

7.6. SVM Classifier

1. CountVectorizer
2. TF-IDF


In [None]:
# Fit the SVM model with BOW
# The evaluation cell that follows predicts on bow_test_dataset and reports
# "SVM with BOW", so the model must be trained on the matching bag-of-words
# features rather than on the TF-IDF matrix.
svm = SVC()
svm.fit(bow_traindataset,data_train['label'])

In [None]:
# Predict the validation labels from the bag-of-words features
bow_predictions = svm.predict(X=bow_test_dataset)
print('SVM with BOW')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=bow_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=bow_predictions)
print(f'Accuracy score for SVM with BOW: {score}')
report = classification_report(y_true=data_val['label'], y_pred=bow_predictions)
print(report)

In [None]:
# Fresh SVC (default hyper-parameters) trained on the TF-IDF features
svm = SVC()
svm.fit(X=tdidf_traindataset, y=data_train['label'])

In [None]:
# Predict the validation labels from the TF-IDF features
tfidf_predictions = svm.predict(X=tdidf_test_dataset)
print('SVM with TF-IDF')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=tfidf_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=tfidf_predictions)
print(f' Accuracy score for SVM with TF-IDF: {score}')
report = classification_report(y_true=data_val['label'], y_pred=tfidf_predictions)
print(report)

7.7. Super Gradient Boost Classifier

1. CountVectorizer
2. TF-IDF


In [None]:
# Fit the SGBC Classifier with BOW
# random_state pins the estimator's internal randomness so the reported scores
# are reproducible, consistent with the seeded split and cross-validation.
sgbc = GradientBoostingClassifier(random_state=42)
sgbc.fit(bow_traindataset,data_train['label'])

In [None]:
# Predict the validation labels from the bag-of-words features
bow_predictions = sgbc.predict(X=bow_test_dataset)
print('SGBC with BOW')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=bow_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=bow_predictions)
print(f' Accuracy score for SGBC with BOW: {score}')
report = classification_report(y_true=data_val['label'], y_pred=bow_predictions)
print(report)

In [None]:
# Fit the SGBC Classifier with TF-IDF
# A fresh estimator is created for the TF-IDF run; random_state pins its
# internal randomness so the reported scores are reproducible.
sgbc = GradientBoostingClassifier(random_state=42)
sgbc.fit(tdidf_traindataset,data_train['label'])

In [None]:
# Predict the validation labels from the TF-IDF features
tfidf_predictions = sgbc.predict(X=tdidf_test_dataset)
print('SGBC with TF-IDF')

# Confusion matrix, overall accuracy and per-class report on the validation split
matrix = confusion_matrix(y_true=data_val['label'], y_pred=tfidf_predictions)
print(matrix)
score = accuracy_score(y_true=data_val['label'], y_pred=tfidf_predictions)
print(f' Accuracy score for SGBC with TF-IDF: {score}')
report = classification_report(y_true=data_val['label'], y_pred=tfidf_predictions)
print(report)