# Final Project: Fake News Detection

By Felix Daubner - Hochschule der Medien

Module 'Supervised and Unsupervised Learning' - Prof. Dr.-Ing. Johannes Maucher

## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

The dataset explored in [data understanding](03_data-understanding.ipynb) cannot be used to train a machine learning model in its current form. Several preprocessing steps are required first:
- Binarize and encode the `truth` column
- Convert text (string) to tokens
- Pad all sequences to the same length

In [5]:
# Load the scraped statements. The file is semicolon-delimited and its first
# column is used as the row index.
data = pd.read_csv(
    "data/scraped.csv",
    sep=";",
    index_col=0,
)

In [6]:
# Peek at the first rows to confirm the CSV was parsed into the expected columns.
data.head()

Unnamed: 0,statement,issue,truth
0,"Says Sen. Bob Casey, D-Pa., “is trying to chan...",2024-senate-elections,false
1,Says the election results are suspicious becau...,2024-senate-elections,false
2,A “ballot dump” around 4 a.m. in Milwaukee sho...,2024-senate-elections,pants-fire
3,“Kari Lake is threatening Social Security and ...,2024-senate-elections,half-true
4,Republican Senate candidate Sam Brown “wants t...,2024-senate-elections,half-true


#### Binarization of 'truth'

In [8]:
# List the distinct rating labels that occur in the raw `truth` column.
data["truth"].unique()

array(['false', 'pants-fire', 'half-true', 'barely-true', 'mostly-true',
       'true', 'half-flip', 'full-flop', 'no-flip'], dtype=object)

In [23]:
# Ratings that are encoded as the positive class (1).
true = [
    "true",
    "mostly-true",
    "half-true",
]
# Ratings that are encoded as the negative class (0).
false = [
    "barely-true",
    "false",
    "pants-fire",
]
# Ratings that are removed from the dataset instead of being mapped to a class.
flip = [
    "full-flop",
    "half-flip",
    "no-flip",
]

In [24]:
# Keep only the rows whose rating is not one of the `flip` labels; those
# rows are excluded rather than mapped to a binary class.
is_flip = data["truth"].isin(flip)
data_dropped = data[~is_flip]

In [25]:
# Verify that none of the `flip` ratings remain after filtering.
data_dropped["truth"].unique()

array(['false', 'pants-fire', 'half-true', 'barely-true', 'mostly-true',
       'true'], dtype=object)

In [29]:
# Binarize the rating: every label listed in `true` becomes 1 and every label
# listed in `false` becomes 0.
label_by_rating = {**dict.fromkeys(true, 1), **dict.fromkeys(false, 0)}

data_binary = data_dropped.copy()  # copy so the filtered frame stays untouched
# Use an explicit lookup plus cast instead of chained `replace` calls: the
# int64 result of `replace` on an object column relies on silent downcasting,
# which pandas has deprecated. A rating missing from both lists maps to NaN,
# so the cast fails loudly instead of leaving a stray string in the column.
data_binary["truth"] = data_binary["truth"].map(label_by_rating).astype("int64")

In [30]:
# Check how many statements fall into each binary class.
data_binary["truth"].value_counts()

0    10632
1     6134
Name: truth, dtype: int64

In [31]:
# Confirm the encoded labels are stored as integers rather than strings.
data_binary["truth"].dtype

dtype('int64')

#### Tokenization of 'statement'

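The statements are first lower-cased and stripped of every character that is not a letter, a digit or a space. The cleaned text is then converted into sequences of integer word indices with the Keras `Tokenizer`, whose vocabulary size is limited via `NUM_WORDS`.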

In [33]:
# Normalise the statement text: lower-case it, then delete every character
# that is not an ASCII letter, a digit or a space. Punctuation is removed
# without inserting a space, so "a.m." becomes "am".
# NOTE(review): the final cast turns a missing statement into the literal
# string "nan" - confirm the column contains no missing values.
lowered = data_binary["statement"].str.lower()
stripped = lowered.str.replace(r'[^a-zA-Z0-9 ]', "", regex=True)
data_binary["statement"] = stripped.astype("str")

In [34]:
# Inspect the frame after the statement text has been cleaned.
data_binary

Unnamed: 0,statement,issue,truth
0,says sen bob casey dpa is trying to change the...,2024-senate-elections,0
1,says the election results are suspicious becau...,2024-senate-elections,0
2,a ballot dump around 4 am in milwaukee shows t...,2024-senate-elections,0
3,kari lake is threatening social security and m...,2024-senate-elections,1
4,republican senate candidate sam brown wants to...,2024-senate-elections,1
...,...,...,...
16921,missouri is the state with the lowest paid wor...,workers,1
16922,in 2009 hillary clinton was at the state depa...,workers,1
16923,says bernie sanders fundamentally changed the ...,workers,1
16924,we work longer hours than any people in the in...,workers,0


In [35]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [51]:
# Vocabulary size limit passed to the tokenizer through `num_words`.
NUM_WORDS = 3000

In [52]:
# Collect the cleaned statements as a plain list, fit the tokenizer's
# vocabulary on them, and store each statement's sequence of word indices in
# a new `token` column.
statements = data_binary["statement"].to_list()
token = Tokenizer(num_words=NUM_WORDS)
token.fit_on_texts(statements)
sequences = token.texts_to_sequences(statements)
data_binary["token"] = sequences

In [53]:
# Inspect the frame with the new `token` column of word-index sequences.
data_binary

Unnamed: 0,statement,issue,truth,token
0,says sen bob casey dpa is trying to change the...,2024-senate-elections,0,"[9, 180, 1682, 2513, 8, 482, 2, 262, 1, 4, 1, ..."
1,says the election results are suspicious becau...,2024-senate-elections,0,"[9, 1, 189, 2132, 12, 66, 682, 27, 180, 702, 7..."
2,a ballot dump around 4 am in milwaukee shows t...,2024-senate-elections,0,"[5, 815, 2689, 436, 397, 1238, 3, 260, 48, 1, ..."
3,kari lake is threatening social security and m...,2024-senate-elections,1,"[2036, 1210, 8, 2690, 105, 97, 6, 114]"
4,republican senate candidate sam brown wants to...,2024-senate-elections,1,"[161, 173, 284, 526, 145, 2, 113, 253, 6, 105,..."
...,...,...,...,...
16921,missouri is the state with the lowest paid wor...,workers,1,"[838, 8, 1, 35, 17, 1, 404, 151, 225]"
16922,in 2009 hillary clinton was at the state depa...,workers,1,"[3, 840, 202, 150, 15, 29, 1, 35, 271, 420, 17..."
16923,says bernie sanders fundamentally changed the ...,workers,1,"[9, 635, 689, 872, 1, 486, 4, 809, 225, 3, 159..."
16924,we work longer hours than any people in the in...,workers,0,"[21, 256, 697, 667, 18, 109, 26, 3, 1, 146, 32..."


In [55]:
# Length of the longest token sequence in the dataset.
max(data_binary["token"].map(len))

57

#### Padding of sequences

In [56]:
# Pad all token sequences to a common length and write them back as Python
# lists; the rows displayed afterwards show the zeros added at the front.
token_lists = data_binary["token"].to_list()
padded = pad_sequences(token_lists)
data_binary["token"] = padded.tolist()

In [60]:
# Inspect the frame after padding; every `token` entry now has the same length.
data_binary

Unnamed: 0,statement,issue,truth,token
0,says sen bob casey dpa is trying to change the...,2024-senate-elections,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,says the election results are suspicious becau...,2024-senate-elections,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,a ballot dump around 4 am in milwaukee shows t...,2024-senate-elections,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,kari lake is threatening social security and m...,2024-senate-elections,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,republican senate candidate sam brown wants to...,2024-senate-elections,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
16921,missouri is the state with the lowest paid wor...,workers,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16922,in 2009 hillary clinton was at the state depa...,workers,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16923,says bernie sanders fundamentally changed the ...,workers,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16924,we work longer hours than any people in the in...,workers,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#### Prepare data for training