#Loading our datasets

In [None]:
from google.colab import drive
from os import chdir

drive.mount("/content/drive")
chdir("/content/drive/MyDrive/my_project1")

In [None]:
#Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import re

In [None]:
legit_news = pd.read_csv("True.csv")
fake_news = pd.read_csv("Fake.csv")

# keep a clean dataset for bonus part
legit_news_unproc = legit_news.copy()
fake_news_unproc = fake_news.copy()

#Getting familiar with the data

In [None]:
legit_news.head()

In [None]:
fake_news.head()

In [None]:
print(legit_news.subject.unique().tolist())
print(fake_news.subject.unique().tolist())

In [None]:
print(legit_news.info(),'\n\n')
print(fake_news.info())

##From the above information we can tell that we have no null values in our datasets

#1) Προεπεξεργασία/καθάρισμα

In [None]:
import nltk
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet

# Importing the WordNetLemmatizer module from nltk.stem
nltk.download('wordnet')

class WordNetLemmatizer(object):
    """Small lemmatizer built on top of the NLTK ``wordnet`` corpus reader.

    Candidate lemmas are obtained from ``wordnet._morphy``; the shortest
    candidate wins, and a word without candidates is returned as is.
    """

    def __init__(self):
        pass

    def lemmatize(self, word, pos=NOUN):
        """Return the shortest lemma found for ``word``.

        Args:
            word: The token to lemmatize.
            pos: Part-of-speech tag forwarded to ``wordnet._morphy``
                (defaults to ``NOUN``).

        Returns:
            The shortest candidate lemma, or ``word`` itself when the lookup
            yields nothing.
        """
        candidates = wordnet._morphy(word, pos)
        if not candidates:
            return word
        return min(candidates, key=len)

    def __repr__(self):
        return "<WordNetLemmatizer>"



# unload wordnet
def teardown_module(module=None):
    """Unload the WordNet corpus by calling its ``_unload`` hook.

    Args:
        module: Accepted for signature compatibility; not used.
    """
    from nltk.corpus import wordnet as wordnet_corpus

    wordnet_corpus._unload()

In [None]:
# Create WordNetLemmatizer object
lemmatizer = WordNetLemmatizer()

# Text pre-processing function using regular expressions and lemmatizer
def tidyText(text):
    """Clean ``text`` and return it as a space-separated string of lemmas.

    Args:
        text: The raw string to clean.

    Returns:
        The lemmatized tokens that are longer than two characters, joined
        with single spaces.
    """
    # anything that is not a word character, a hyphen or a space becomes a space
    cleaned = re.sub(r'[^\w\- ]', ' ', text)

    # lemmatize token by token and discard leftovers of one or two characters
    kept = []
    for token in cleaned.split():
        lemma = lemmatizer.lemmatize(token)
        if len(lemma) > 2:
            kept.append(lemma)

    return " ".join(kept)

In [None]:
title_idx = fake_news.columns.to_list().index('title')
text_idx = fake_news.columns.to_list().index('text')

print(fake_news.iloc[5, title_idx])
print(fake_news.iloc[5, text_idx])

In [None]:
# Convert all text to lowercase and apply pre-processing function
for column in ['title', 'text']:
    fake_news[column] = fake_news[column].str.lower().apply(tidyText)
    legit_news[column] = legit_news[column].str.lower().apply(tidyText)

In [None]:
print(fake_news.iloc[5, title_idx])
print(fake_news.iloc[5, text_idx])

In [None]:
def dropEmptyRows(df):
    """Remove, in place, every row whose 'title' or 'text' is an empty string.

    Args:
        df: DataFrame with 'title' and 'text' columns; it is modified in
            place and nothing is returned.
    """
    is_empty = df['title'].eq('') | df['text'].eq('')
    df.drop(index=df.index[is_empty.to_numpy()], inplace=True)

dropEmptyRows(legit_news)
dropEmptyRows(fake_news)

#2) Μελέτη των δεδομένων

##α.

In [None]:
from wordcloud import WordCloud, STOPWORDS
import requests
from PIL import Image, ImageOps

def showWordClouds(mask_path, df):
    """Draw a word cloud of all titles in ``df``, shaped by an image mask.

    Args:
        mask_path: Path of the image used as the word cloud mask.
        df: DataFrame whose 'title' column supplies the words.
    """
    # open the mask image, convert it to grayscale and turn it into an array
    mask = np.array(ImageOps.grayscale(Image.open(mask_path)))

    ignored_words = set(STOPWORDS).union({"say","may","see","will"})

    # one long string holding every title
    all_titles = " ".join(df['title'].to_list())

    fig = plt.figure(figsize=(35, 25), facecolor=None)
    ax = fig.subplots()

    cloud = WordCloud(
        background_color='black',
        colormap='autumn',
        stopwords=ignored_words,
        min_font_size=10,
        mask=mask,
    ).generate(all_titles)

    ax.imshow(cloud)
    ax.axis("off")

### Non-fake news wordcloud

In [None]:
showWordClouds('legit.png', legit_news)

### Fake news wordcloud

In [None]:
showWordClouds('fake.png', fake_news)

##β.

In [None]:
# list containing the means of the characters count of row 'title' and 'text'
mean_title = [legit_news['title'].str.len().mean(), fake_news['title'].str.len().mean()]
mean_text = [legit_news['text'].str.len().mean(), fake_news['text'].str.len().mean()]

# plotting the results
fig, ax = plt.subplots(1,2, figsize = (12,6))
categories = ['Legit News', 'Fake News']

ax[0].bar(categories, mean_title, color=(0.2, 0.4, 0.1, 0.6), edgecolor='black')
ax[0].grid(True)
ax[0].set_title('Title column character count mean')
ax[0].set_ylabel('Mean')

ax[1].bar(categories, mean_text, color=(0.5, 0.2, 0.3, 0.6), edgecolor='black')
ax[1].grid(True)
ax[1].set_title('Text column character count mean')
plt.show()

##γ.

In [None]:
df_legit = legit_news.copy()
df_fake = fake_news.copy()

def dropStopwords(df, column):
    """Strip stopwords from ``df[column]``, overwriting the column in place.

    Args:
        df: DataFrame that is modified in place.
        column: Name of the text column to filter.
    """
    ignored_words = set(STOPWORDS).union({"say","may","see","will"})

    # text -> list of tokens, per row
    token_lists = df[column].str.split()
    # keep only the tokens that are not stopwords
    kept = token_lists.apply(
        lambda row_tokens: [tok for tok in row_tokens if tok not in ignored_words]
    )
    # list of tokens -> text again
    df[column] = kept.str.join(' ')

def getWordCounts(df, rmvStopwrds=False):
    """Plot the word-count distributions of the 'title' and 'text' columns.

    Side effects on ``df``: the columns 'title_word_count' and
    'text_word_count' are added (or overwritten), and when ``rmvStopwrds`` is
    true the 'title' and 'text' columns are rewritten without stopwords.

    Args:
        df: DataFrame with 'title' and 'text' string columns.
        rmvStopwrds: When true, remove stopwords first and print the first
            title before and after the removal.
    """
    # remove the stopwords if needed
    if rmvStopwrds:
        first_title = df["title"].iloc[0]
        print('Before stopword removal:\n'
              f'\tTitle length: {len(first_title.split())}\n'
              f'\tcontent: {first_title}')
        dropStopwords(df, 'title')
        dropStopwords(df, 'text')
        first_title = df["title"].iloc[0]
        print('After stopword removal:\n'
              f'\tTitle length: {len(first_title.split())}\n'
              f'\tcontent: {first_title}')

    df['title_word_count'] = df['title'].str.split().str.len()
    df['text_word_count'] = df['text'].str.split().str.len()

    # A histogram needs at least one bin. The integer division below yields 0
    # when there are fewer than 16 distinct text lengths, so clamp both counts.
    title_bins = max(1, df['title_word_count'].unique().size)
    text_bins = max(1, df['text_word_count'].unique().size // 16)

    fig = plt.figure(figsize=(18,10))
    ax = fig.subplots(1,2)
    sns.set_theme(context='talk',
                  font_scale=0.8,
                  palette='Oranges_r' if rmvStopwrds else 'coolwarm')

    plt.sca(ax[0])
    sns.histplot(data=df,
                 x="title_word_count",
                 bins=title_bins,
                 edgecolor='black')
    ax[0].set_xlabel('Word count bins')
    ax[0].set_ylabel('Article count')
    ax[0].set_title('Title distribution plot')

    plt.sca(ax[1])
    sns.histplot(data=df,
                 x="text_word_count",
                 edgecolor='black',
                 bins=text_bins)
    ax[1].set_title('Text distribution plot')
    ax[1].set_xlabel('Word count bins')
    ax[1].set_ylabel('Article count')

###Non-fake news distributions

In [None]:
getWordCounts(df_legit)

###Fake news distributions

In [None]:
getWordCounts(df_fake)

##δ.

###Non-fake news distributions [Stopwords Removed]


In [None]:
getWordCounts(df_legit, rmvStopwrds=True)

###Fake news distributions [Stopwords Removed]


In [None]:
getWordCounts(df_fake, rmvStopwrds=True)

##ε.

In [None]:
from nltk.util import ngrams
from itertools import islice
from collections import Counter
import itertools

df_legit = legit_news.copy()
df_fake = fake_news.copy()

def plotBigrams(df, column, N, axes):
    """Plot the ``N`` most frequent bigrams of ``df[column]``.

    A word cloud is drawn on ``axes[0]`` and a bar plot on ``axes[1]``.
    ``df[column]`` is rewritten without stopwords as a side effect.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to analyse.
        N: Number of most common bigrams to show.
        axes: Pair of matplotlib axes (word cloud first, bar plot second).
    """
    # filter out stopwords (this rewrites df[column] in place)
    dropStopwords(df, column)

    # per row: text -> tokens -> list of consecutive token pairs
    token_lists = df[column].str.split()
    bigram_lists = token_lists.apply(lambda row_tokens: list(ngrams(row_tokens, 2)))

    # count every bigram over all rows and keep the N most common ones
    counts = Counter(itertools.chain.from_iterable(bigram_lists))
    top_bigrams = counts.most_common(N)

    bar_ax = axes[1]
    top_df = pd.DataFrame(data=top_bigrams, columns=['Bigrams','Count'])
    sns.barplot(data=top_df,
                y='Count',
                x='Bigrams',
                ax=bar_ax,
                edgecolor='black')
    bar_ax.set_title(column.capitalize() + ' column barplot', fontsize=18)
    bar_ax.tick_params(axis='x', labelrotation=45)

    # "first second" -> count, the form the word cloud expects
    frequencies = {first + ' ' + second: count
                   for (first, second), count in top_bigrams}

    # word cloud representation of the same bigrams
    cloud_ax = axes[0]
    cloud = WordCloud(width=1000,
                      height=600,
                      background_color='black',
                      colormap='autumn',
                      min_font_size=10).generate_from_frequencies(frequencies)
    cloud_ax.set_title(column.capitalize() + ' column wordcloud', fontsize=18)
    cloud_ax.imshow(cloud)
    cloud_ax.axis("off")

def topN_bigrams(df, N):
    """Show the top ``N`` bigrams of the 'title' and 'text' columns side by side.

    Args:
        df: DataFrame with 'title' and 'text' columns.
        N: Number of most common bigrams to plot per column.
    """
    fig = plt.figure(figsize=(28,18))
    grid = fig.subplots(2,2)
    sns.set_theme(context='talk',
                  font_scale=0.8,
                  palette='deep')

    # left grid column for titles, right grid column for article texts
    for position, column in enumerate(('title', 'text')):
        plotBigrams(df, column, N, grid[:, position])

### Non-fake news

In [None]:
topN_bigrams(df_legit, 20)

### Fake news


In [None]:
topN_bigrams(df_fake, 20)

#3) Δημιουργία συνόλου εκμάθησης και δοκιμής

In [None]:
df_legit = legit_news.copy()
df_fake = fake_news.copy()

##Add the labels

In [None]:
# make the new label columns
df_legit['label'] = 1
df_fake['label'] = 0

##Separate both datasets into train and test datasets

In [None]:
# function that splits dataset into two depending on percentage given as input
def splitTrainTest(df, train_percent):
    """Split ``df`` by position into a leading train part and a trailing test part.

    Args:
        df: DataFrame to split; rows keep their original order.
        train_percent: Fraction of rows that goes to the train part; the
            resulting row count is truncated to an integer.

    Returns:
        A ``(train, test)`` tuple of DataFrame slices.
    """
    cut = int(train_percent * len(df))
    return df.iloc[:cut], df.iloc[cut:]

In [None]:
lgt_train, lgt_test = splitTrainTest(df_legit, train_percent=0.9)
fk_train, fk_test = splitTrainTest(df_fake, train_percent=0.9)

In [None]:
def mergeDatasets(df_legit, df_fake, file_name):
    """Stack the legit rows on top of the fake rows and write them to a CSV file.

    The original index labels of both frames are kept and written out as the
    first CSV column.

    Args:
        df_legit: DataFrame placed first in the output.
        df_fake: DataFrame placed second in the output.
        file_name: Destination path of the CSV file.
    """
    stacked = pd.concat((df_legit, df_fake), ignore_index=False)
    stacked.to_csv(file_name, index=True)

In [None]:
mergeDatasets(lgt_train, fk_train, 'train.csv')
mergeDatasets(lgt_test, fk_test, 'test.csv')

In [None]:
train = pd.read_csv('train.csv', index_col=0)
train.label.value_counts()

In [None]:
test = pd.read_csv('test.csv', index_col=0)
test.label.value_counts()

#4) Classification

## Below we filter out any records with invalid date formats in our datasets as we will use this column as a feature

In [None]:
def findInvalidDates(date):
    """Tell whether ``date`` cannot be used as a date.

    Args:
        date: The raw value of a 'date' cell.

    Returns:
        ``True`` when ``pd.to_datetime`` rejects the value or yields a
        missing timestamp, ``False`` when a timestamp is obtained.
    """
    try:
        parsed = pd.to_datetime(date)
    except (ValueError, TypeError, OverflowError):
        # only parsing failures count as "invalid"; anything else
        # (e.g. KeyboardInterrupt) must propagate
        return True

    # Missing or empty values parse without error but carry no day, month or
    # year, so they are just as unusable as an unparsable string.
    return bool(pd.isna(parsed))


# Keep only the rows whose date is valid. A boolean mask is used instead of a
# label-based drop because the index labels are not unique here: the legit and
# fake rows were concatenated with ignore_index=False, so dropping by label
# would also remove valid rows that share a label with an invalid one.
test = test[~test['date'].apply(findInvalidDates).astype(bool)].copy()
train = train[~train['date'].apply(findInvalidDates).astype(bool)].copy()

## In order to engineer our features so that we can later feed them to our model we will use pre-trained embeddings.

In [None]:
import gensim.downloader as api

glove_vectors = api.load("glove-wiki-gigaword-100")


##Generating embeddings

In [None]:
def textToEmbedding(txt, embeddings, rmvStopwords=True):
    """Embed a token list as the average of its tokens' pre-trained vectors.

    Args:
        txt: Iterable of tokens.
        embeddings: Pre-trained vectors exposing ``vector_size``,
            ``key_to_index`` and lookup by token.
        rmvStopwords: When true, stopwords are dropped before averaging.

    Returns:
        Array of shape ``(1, embeddings.vector_size)``; all zeros when no
        token has a vector.
    """
    if rmvStopwords:
        ignored_words = set(STOPWORDS).union({"say","may","see","will"})
        txt = [tok for tok in txt if tok not in ignored_words]

    total = np.zeros((1, embeddings.vector_size), dtype=float)
    vocabulary = embeddings.key_to_index
    matched = 0
    # sum the vectors of the tokens that the pre-trained vocabulary knows
    for tok in txt:
        if tok in vocabulary:
            total += embeddings[tok]
            matched += 1

    # the embedding of the whole text is the mean of its token embeddings
    if matched == 0:
        return total
    return total / matched

In [None]:
def recordsToVecs(df, embeddings):
    """Turn every record of ``df`` into one feature row of embeddings.

    Each row is the title embedding followed by the text embedding.

    Args:
        df: DataFrame with 'title' and 'text' string columns.
        embeddings: Pre-trained vectors passed on to ``textToEmbedding``;
            ``vector_size`` is read to size the result for an empty frame.

    Returns:
        Array of shape ``(len(df), 2 * embeddings.vector_size)``.
    """
    record_vecs = []
    for _, record in df.iterrows():
        title_vec = textToEmbedding(record['title'].split(), embeddings)
        text_vec = textToEmbedding(record['text'].split(), embeddings)

        # lay the title and text embeddings side by side in a single row
        record_vecs.append(np.concatenate((title_vec, text_vec)).reshape(1, -1))

    if not record_vecs:
        # nothing to embed: return a well-shaped empty matrix
        return np.empty((0, 2 * embeddings.vector_size), dtype=float)

    # stack once at the end; concatenating inside the loop copies the whole
    # accumulated matrix on every iteration
    return np.concatenate(record_vecs)

In [None]:
# shuffle our dataset
train = train.sample(frac=1)
test = test.sample(frac=1)

In [None]:
X_train_w2v = recordsToVecs(train, glove_vectors)
X_test_w2v = recordsToVecs(test, glove_vectors)

print(f'Train data shape: {X_train_w2v.shape}')
print(f'Test data shape: {X_test_w2v.shape}')

## Creating the bag of words representations

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=200,
                                 stop_words='english', ngram_range=(1,2))

# Join title and text with a space: plain concatenation would fuse the last
# word of the title with the first word of the text into a single token.
X_train_bow = train['title'] + ' ' + train['text']
X_test_bow = test['title'] + ' ' + test['text']

# learn the vocabulary only on training data
X_train_bow = bow_vectorizer.fit_transform(X_train_bow.tolist())
X_test_bow = bow_vectorizer.transform(X_test_bow.tolist())

## Creating the tf-idf representations


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features=200,
                                   stop_words='english', ngram_range=(1,2))

# Join title and text with a space: plain concatenation would fuse the last
# word of the title with the first word of the text into a single token.
X_train_tfidf = train['title'] + ' ' + train['text']
X_test_tfidf = test['title'] + ' ' + test['text']

# learn the vocabulary only on training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_tfidf.tolist())
X_test_tfidf = tfidf_vectorizer.transform(X_test_tfidf.tolist())

In [None]:
y_train = train['label']
y_test = test['label']

In [None]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import normalize

def wrapper(X_train_w2v,
            X_test_w2v,
            X_train_tfidf,
            X_test_tfidf,
            X_train_bow,
            X_test_bow,
            y_train,
            y_test):
    """Bind the feature matrices and labels into a reusable classifier runner.

    Args:
        X_train_w2v, X_test_w2v: Train/test features used for 'w2v'.
        X_train_tfidf, X_test_tfidf: Train/test features used for 'tfidf'.
        X_train_bow, X_test_bow: Train/test features used for any other
            representation name.
        y_train, y_test: Train/test labels.

    Returns:
        A function ``doClassification(model, data_representation='w2v')``
        that fits ``model`` and prints its F1 score and accuracy.
    """
    def doClassification(model, data_representation='w2v'):
        """Fit ``model`` on the chosen representation and print its scores."""
        # the enclosing arguments are only read, so no nonlocal is required
        if data_representation == 'w2v':
            X_train, X_test = X_train_w2v, X_test_w2v
        elif data_representation == 'tfidf':
            X_train, X_test = X_train_tfidf, X_test_tfidf
        else:
            X_train, X_test = X_train_bow, X_test_bow

        # train the given model
        model.fit(X_train, y_train)

        # predict labels of test set
        pred = model.predict(X_test)

        # Built-in round() handles both NumPy scalars and plain Python floats;
        # the .round() method exists only on NumPy scalars, so it breaks if
        # the metric comes back as a built-in float.
        print(f"F1 score: {round(f1_score(y_test, pred), 2)}")
        print(f"Accuracy: {round(100 * accuracy_score(y_test, pred), 2)} %")

    return doClassification


classify = wrapper(X_train_w2v,
                    X_test_w2v,
                    X_train_tfidf,
                    X_test_tfidf,
                    X_train_bow,
                    X_test_bow,
                    y_train,
                    y_test)

##**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression(max_iter=300)

###Word2vec

In [None]:
classify(logReg, 'w2v')

###Tf-idf


In [None]:
classify(logReg, 'tfidf')

###Bow


In [None]:
classify(logReg, 'bow')

##**Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

gauss = GaussianNB()
mult = MultinomialNB()

###Word2vec

In [None]:
classify(gauss, 'w2v')

#### The bad result of the naive bayes algorithm in combination with the word embeddings is not a surprise. From theory we have learned that this algorithm makes the assumption that our features are independent, an assumption that is far from true in the vector space we represent our data in.

###The same is true about the other two representations as we can see below

###Tf-idf


In [None]:
classify(mult, 'tfidf')

###Bow


In [None]:
classify(mult, 'bow')

##**SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


def findBestParams(X, y):
    """Grid-search SVC hyperparameters with 3-fold cross-validation.

    Two grids are searched: a linear kernel over ``C``, and an rbf kernel
    over ``C`` and ``gamma``. The winning parameters and their accuracy are
    printed.

    Args:
        X: Training feature matrix.
        y: Training labels.

    Returns:
        The ``best_params_`` dictionary of the fitted grid search.
    """
    linear_grid = {
        'C': [1, 16],
        'kernel': ['linear'],
    }
    rbf_grid = {
        'C': [1, 0.5, 8],
        'kernel': ['rbf'],
        'gamma': [0.05, 0.1],
    }

    search = GridSearchCV(SVC(),
                          [linear_grid, rbf_grid],
                          cv=3,
                          scoring='accuracy',
                          return_train_score=True)
    search.fit(X, y)

    winner = search.best_params_
    print(f'Best parameters: {winner}')
    print(f'Best accuracy: {search.best_score_}')

    return winner

###Word2vec

In [None]:
best_params = findBestParams(X_train_w2v, y_train)

In [None]:
# Pass the selected parameters through as they are: 'gamma' is only part of
# the rbf grid, so indexing best_params['gamma'] fails when the linear kernel
# wins the search.
svm = SVC(**best_params)
classify(svm, 'w2v')

###Tf-idf


In [None]:
best_params = findBestParams(X_train_tfidf, y_train)

In [None]:
# Pass the selected parameters through as they are: 'gamma' is only part of
# the rbf grid, so indexing best_params['gamma'] fails when the linear kernel
# wins the search.
svm = SVC(**best_params)
classify(svm, 'tfidf')

###Bow


In [None]:
best_params = findBestParams(X_train_bow, y_train)

In [None]:
# Pass every selected parameter through: when the rbf kernel wins the search,
# best_params also holds the tuned 'gamma', which must not be dropped.
svm = SVC(**best_params)
classify(svm, 'bow')

###Since the best value for the kernel hyperparameter chosen by grid search is the radial basis function, we can assume that our classes (i.e. true and fake articles) are not linearly separable.

##**Random Forests**

In [None]:
from sklearn.ensemble import RandomForestClassifier

randForest = RandomForestClassifier(n_jobs=-1)

###Word2vec

In [None]:
classify(randForest, 'w2v')

###Tf-idf


In [None]:
classify(randForest, 'tfidf')

###Bow


In [None]:
classify(randForest, 'bow')

#Beat The Benchmark

## In order to achieve higher scores we are going to engineer new features. First of all, fake news articles tend to contain key words that we have detected in our wordclouds earlier. Secondly, fake news articles use a lot of punctuation to attract attention. Finally we will make use of the 'date' column. The intuition behind using the date is that possibly there is no need for fake articles to be published when there are legit articles to be published. In other words fake and legit articles are not published in the same time periods.

In [None]:
def generateNewFeatures(df, fake_news_df, legit_news_df):
    """Build standardised hand-crafted features for every record of ``df``.

    For each record the matching row is looked up by index label in
    ``legit_news_df`` (label 1) or ``fake_news_df`` (any other label), and
    seven features are extracted from it: number of '!'/'?' in the title,
    number of '!'/'?' in the text, day, month and year of the date, and two
    0/1 flags telling whether the title / the text contains a hotword.

    Args:
        df: DataFrame with a 'label' column; its index labels select the
            rows of the lookup frames.
        fake_news_df: Lookup frame with 'title', 'text' and 'date' columns.
        legit_news_df: Lookup frame with 'title', 'text' and 'date' columns.

    Returns:
        Float array of shape ``(len(df), 7)``; every column is standardised
        with the mean and standard deviation of the rows passed in, and a
        constant column becomes all zeros.
    """
    # NOTE(review): matching is case-sensitive and runs on the text as given;
    # confirm whether e.g. 'VIDEO' should count as a hotword as well.
    hotwords = ('video', 'tweet', 'image', 'shock', 'watch', 'news', 'fake')

    feature_rows = []
    for idx, lbl in zip(df.index.to_list(), df.label.to_list()):
        search_df = legit_news_df if lbl == 1 else fake_news_df
        row = search_df.loc[idx, :]

        # number of question marks and exclamation marks
        ttl_punc_count = row.title.count('!') + row.title.count('?')
        txt_punc_count = row.text.count('!') + row.text.count('?')

        # day, month and year of publication
        date = pd.to_datetime(row.date)

        # every hotword is searched in the title for the title flag and in the
        # text for the text flag
        title_has_hotword = any(word in row.title for word in hotwords)
        text_has_hotword = any(word in row.text for word in hotwords)

        feature_rows.append([ttl_punc_count,
                             txt_punc_count,
                             date.day,
                             date.month,
                             date.year,
                             1 if title_has_hotword else 0,
                             1 if text_has_hotword else 0])

    if not feature_rows:
        # no records: return a well-shaped empty matrix
        return np.empty((0, 7), dtype=float)

    # build the matrix once instead of concatenating inside the loop
    new_features = np.array(feature_rows, dtype=float)

    mean = new_features.mean(axis=0)
    std = new_features.std(axis=0)
    # a constant column has zero deviation; divide by 1 so it becomes zeros
    # instead of NaN
    std[std == 0] = 1.0

    return (new_features - mean) / std

In [None]:
from scipy.sparse import csr_matrix

train_new_features = generateNewFeatures(train, fake_news_unproc, legit_news_unproc)
test_new_features = generateNewFeatures(test, fake_news_unproc, legit_news_unproc)

add_features = (lambda old, new: np.concatenate((old, new), axis = 1))

classify_bonus = wrapper(add_features(X_train_w2v, train_new_features),
                    add_features(X_test_w2v, test_new_features),
                    csr_matrix(add_features(X_train_tfidf.toarray(), train_new_features)),
                    csr_matrix(add_features(X_test_tfidf.toarray(), test_new_features)),
                    csr_matrix(add_features(X_train_bow.toarray(), train_new_features)),
                    csr_matrix(add_features(X_test_bow.toarray(), test_new_features)),
                    y_train,
                    y_test)

## Logistic Regression

In [None]:
classify_bonus(logReg, 'w2v')

In [None]:
classify_bonus(logReg, 'tfidf')

In [None]:
classify_bonus(logReg, 'bow')

### The already good accuracy of the logistic regression models shows a slight increase by 1-2%

## Naive Bayes


In [None]:
classify_bonus(gauss, 'w2v')

### We can see that there is a 4% increase in accuracy of the naive bayes model using word embeddings.

##Random Forests

In [None]:
classify_bonus(randForest, 'w2v')

### The model that is trained with word embeddings shows an increase of accuracy by 6%

In [None]:
classify_bonus(randForest, 'tfidf')

In [None]:
classify_bonus(randForest, 'bow')

### Here we can see little to no increase in score