# Loading Packages

In [None]:
#!pip install simpletransformers
#!pip install torch torchvision
#!pip install nltk
#import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

In [1]:
# Load packages for data wrangling:
import numpy as np
import pandas as pd
import os

# Load packages for finetuning classification model and saving it:
from simpletransformers.classification import ClassificationModel

# Scikit-learn:
from sklearn.model_selection import train_test_split

# Classification metrics:
from sklearn.metrics import (accuracy_score, recall_score, precision_score, f1_score,
                            classification_report,confusion_matrix)

# For converting model outpus to raw probabilities:
from scipy.special import softmax

# For data cleaning:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk import pos_tag
stopword = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()
from nltk import sent_tokenize, word_tokenize

# Preprocessing

## Data wrangling

In [2]:
# Loading data with fake news:
fake_df = pd.read_csv(
    "Fake.csv")

# Loading data with true news:
true_df = pd.read_csv(
    "True.csv")

In [3]:
# Inspecting fake data:
fake_df.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017"
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017"
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017"
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017"
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017"


In [4]:
# Inspecting true data:
true_df.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [5]:
# Adding category labels to each dataset: 
fake_df["label"]="fake"
true_df["label"]="true"

In [6]:
# Merge fake- and true news into a single dataframe:
merged_df = pd.concat([true_df, fake_df])

In [7]:
# Assessing whether merge was succesful:
len(true_df) + len(fake_df) == len(merged_df)

True

In [8]:
merged_df = merged_df.reset_index()

In [9]:
# Selecting only relevant columns
merged_df = merged_df[["text", "label"]]

## Data cleaning

### Removing bad columns

In [10]:
merged_df.replace(" ", float("NaN"), inplace=True)

merged_df.dropna(subset = ["text"], inplace=True)

In [11]:
# Remove NA's:
merged_df = merged_df.dropna()

In [12]:
merged_df

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,true
1,WASHINGTON (Reuters) - Transgender people will...,true
2,WASHINGTON (Reuters) - The special counsel inv...,true
3,WASHINGTON (Reuters) - Trump campaign adviser ...,true
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,true
...,...,...
44893,21st Century Wire says As 21WIRE reported earl...,fake
44894,21st Century Wire says It s a familiar theme. ...,fake
44895,Patrick Henningsen 21st Century WireRemember ...,fake
44896,21st Century Wire says Al Jazeera America will...,fake


In [13]:
# Remove duplicates:
merged_df = merged_df.drop_duplicates(subset=['text'])

In [14]:
merged_df

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,true
1,WASHINGTON (Reuters) - Transgender people will...,true
2,WASHINGTON (Reuters) - The special counsel inv...,true
3,WASHINGTON (Reuters) - Trump campaign adviser ...,true
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,true
...,...,...
44115,21st Century Wire says All the world s a stage...,fake
44116,Randy Johnson 21st Century WireThe majority ...,fake
44117,Tune in to the Alternate Current Radio Network...,fake
44118,21st Century Wire says A new front has just op...,fake


In [15]:
merged_df = merged_df.reset_index()

In [16]:
# Selecting only relevant columns
merged_df = merged_df[["text", "label"]]

### Regex

#### Remove "[city name] reuters" from true articles

In [17]:
# Define regex patter
pattern = r".*\(Reuters\) - "

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

In [18]:
merged_df.head(10)

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,True
1,Transgender people will be allowed for the fir...,True
2,The special counsel investigation of links bet...,True
3,Trump campaign adviser George Papadopoulos tol...,True
4,President Donald Trump called on the U.S. Post...,True
5,The White House said on Friday it was set to k...,True
6,President Donald Trump said on Thursday he bel...,True
7,The following statements were posted to the ve...,True
8,The following statements were posted to the ve...,True
9,Alabama Secretary of State John Merrill said h...,True


#### Remove hashtags

In [19]:
# Define regex patter
pattern = r"#(\S+)"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

In [20]:
merged_df.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,True
1,Transgender people will be allowed for the fir...,True
2,The special counsel investigation of links bet...,True
3,Trump campaign adviser George Papadopoulos tol...,True
4,President Donald Trump called on the U.S. Post...,True


#### Remove twitter tags ("@[username]")

In [21]:
# Define regex patter
pattern = r"@(\S+)"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

In [22]:
merged_df.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,True
1,Transgender people will be allowed for the fir...,True
2,The special counsel investigation of links bet...,True
3,Trump campaign adviser George Papadopoulos tol...,True
4,President Donald Trump called on the U.S. Post...,True


#### Remove '(CAPSLOCK)'

In [23]:
# Define regex patter
pattern = r"\([A-Z]*\)"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

#### Remove systematic patterns:

In [24]:
# Define regex patter
pattern = r"The following statement.*accuracy[.]"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

In [25]:
# Define regex patter
pattern = r"pic\.twitter\.com\/.* "

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

#### Remove punctuation

In [26]:
def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

In [27]:
merged_df['text']=merged_df['text'].apply(lambda x: remove_punctuation(x))

In [28]:
merged_df.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,True
1,Transgender people will be allowed for the fir...,True
2,The special counsel investigation of links bet...,True
3,Trump campaign adviser George Papadopoulos tol...,True
4,President Donald Trump called on the US Postal...,True


### Tokenization + Lower

In [29]:
# Tokenize:
def tokenize(text):
    split=re.split("\W+",text) 
    return split

In [30]:
merged_df['tokenized']=merged_df['text'].apply(lambda x: tokenize(x.lower()))

In [31]:
merged_df.head()

Unnamed: 0,text,label,tokenized
0,The head of a conservative Republican faction ...,True,"[the, head, of, a, conservative, republican, f..."
1,Transgender people will be allowed for the fir...,True,"[transgender, people, will, be, allowed, for, ..."
2,The special counsel investigation of links bet...,True,"[the, special, counsel, investigation, of, lin..."
3,Trump campaign adviser George Papadopoulos tol...,True,"[trump, campaign, adviser, george, papadopoulo..."
4,President Donald Trump called on the US Postal...,True,"[president, donald, trump, called, on, the, us..."


### Remove stopwords

In [32]:
def remove_stopwords(text):
    text=[words for words in text if words not in stopword]
    #text=' '.join(text)
    return text

In [33]:
merged_df['tokenized'] = merged_df['tokenized'].apply(lambda x: remove_stopwords(x))

In [34]:
merged_df.head(10)

Unnamed: 0,text,label,tokenized
0,The head of a conservative Republican faction ...,True,"[head, conservative, republican, faction, us, ..."
1,Transgender people will be allowed for the fir...,True,"[transgender, people, allowed, first, time, en..."
2,The special counsel investigation of links bet...,True,"[special, counsel, investigation, links, russi..."
3,Trump campaign adviser George Papadopoulos tol...,True,"[trump, campaign, adviser, george, papadopoulo..."
4,President Donald Trump called on the US Postal...,True,"[president, donald, trump, called, us, postal,..."
5,The White House said on Friday it was set to k...,True,"[white, house, said, friday, set, kick, talks,..."
6,President Donald Trump said on Thursday he bel...,True,"[president, donald, trump, said, thursday, bel..."
7,While the Fake News loves to talk about m...,True,"[, fake, news, loves, talk, socalled, low, app..."
8,Together we are MAKING AMERICA GREAT AGAI...,True,"[, together, making, america, great, bitly2lnp..."
9,Alabama Secretary of State John Merrill said h...,True,"[alabama, secretary, state, john, merrill, sai..."


### Lemmatize

In [35]:
def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. Copied from kaggle post https://www.kaggle.com/alvations/basic-nlp-with-nltk"""
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

In [36]:
for i in range(len(merged_df['tokenized'])):
    tagged = pos_tag(merged_df['tokenized'][i])
    merged_df['tokenized'][i] = [lemmatizer.lemmatize(word, pos=penn2morphy(tag)) for word, tag in tagged]

In [37]:
merged_df.head(10)

Unnamed: 0,text,label,tokenized
0,The head of a conservative Republican faction ...,True,"[head, conservative, republican, faction, u, c..."
1,Transgender people will be allowed for the fir...,True,"[transgender, people, allow, first, time, enli..."
2,The special counsel investigation of links bet...,True,"[special, counsel, investigation, link, russia..."
3,Trump campaign adviser George Papadopoulos tol...,True,"[trump, campaign, adviser, george, papadopoulo..."
4,President Donald Trump called on the US Postal...,True,"[president, donald, trump, call, u, postal, se..."
5,The White House said on Friday it was set to k...,True,"[white, house, say, friday, set, kick, talk, n..."
6,President Donald Trump said on Thursday he bel...,True,"[president, donald, trump, say, thursday, beli..."
7,While the Fake News loves to talk about m...,True,"[, fake, news, love, talk, socalled, low, appr..."
8,Together we are MAKING AMERICA GREAT AGAI...,True,"[, together, make, america, great, bitly2lnpka..."
9,Alabama Secretary of State John Merrill said h...,True,"[alabama, secretary, state, john, merrill, say..."


### Concatenate

In [38]:
def concat(text):
    text=[words for words in text]
    text=' '.join(text)
    return text

In [39]:
merged_df['text'] = merged_df['tokenized'].apply(lambda x: concat(x))

In [40]:
merged_df.head(10)

Unnamed: 0,text,label,tokenized
0,head conservative republican faction u congres...,True,"[head, conservative, republican, faction, u, c..."
1,transgender people allow first time enlist u m...,True,"[transgender, people, allow, first, time, enli..."
2,special counsel investigation link russia pres...,True,"[special, counsel, investigation, link, russia..."
3,trump campaign adviser george papadopoulos tel...,True,"[trump, campaign, adviser, george, papadopoulo..."
4,president donald trump call u postal service f...,True,"[president, donald, trump, call, u, postal, se..."
5,white house say friday set kick talk next week...,True,"[white, house, say, friday, set, kick, talk, n..."
6,president donald trump say thursday believe fa...,True,"[president, donald, trump, say, thursday, beli..."
7,fake news love talk socalled low approval rat...,True,"[, fake, news, love, talk, socalled, low, appr..."
8,together make america great bitly2lnpkaq 1814...,True,"[, together, make, america, great, bitly2lnpka..."
9,alabama secretary state john merrill say certi...,True,"[alabama, secretary, state, john, merrill, say..."


## Saving and loading cleaned data

### Write dataframe to csv-file

In [41]:
# Selecting only relevant columns
merged_df = merged_df[["text", "label"]]

In [46]:
merged_df.to_csv('cleaned_data.csv', index=False)

### Load cleaned data

In [47]:
cleaned_data = pd.read_csv('cleaned_data.csv')

In [48]:
cleaned_data

Unnamed: 0,text,label
0,head conservative republican faction u congres...,true
1,transgender people allow first time enlist u m...,true
2,special counsel investigation link russia pres...,true
3,trump campaign adviser george papadopoulos tel...,true
4,president donald trump call u postal service f...,true
...,...,...
38640,21st century wire say world stage men woman me...,fake
38641,randy johnson 21st century wirethe majority m...,fake
38642,tune alternate current radio network another l...,fake
38643,21st century wire say new front open long batt...,fake


### Merge data and create training- and testing set:

In [None]:
# Create train/test split with 20% of all articles in testing data:
train, test = train_test_split(merged_df, test_size=0.2)

In [None]:
# Convert label column to binary integer (0 = true, 1 = fake):
train["label"] = np.where(train["label"] == "true", 0,1)
test["label"] = np.where(test["label"] == "true", 0,1) 

In [None]:
# Inspecting transformed training data:
train.head(10)

In [None]:
# Inspecting transformed training data:
test.head(10)

In [None]:
# Assess that data is balanced across categories:
train.groupby('label').count()

In [None]:
# Assess that data is balanced across categories:
test.groupby('label').count()

In [None]:
# Create subset (temporary)
#train_sub = train[0:100]

In [None]:
# Create subset (temporary)
#test_sub = test[0:100]

In [None]:
# Define number of unique labels:
n_labels = len(train_sub['label'].unique())

In [None]:
# Create list of texts to predict (change y to X):
y = test['text'].tolist()

# Analysis

## Training

In [None]:
# Initialize the model with the specified hyperparameters:
sent_model = ClassificationModel('bert',"bert-base-uncased",
                                 num_labels=n_labels, use_cuda=False,
                                 args={'reprocess_input_data': True, 'overwrite_output_dir': True,
                                       "num_train_epochs": 1, "max_seq_length": 128, "train_batch_size": 16,
                                       "learning_rate": 1e-5})

# Fine-tune the model:
sent_model.train_model(train_sub)

## Predictions

In [None]:
# Loading trained model, so we don't have to rerun the training each time we restart the kernel.
#sent_model = ClassificationModel("bert", "outputs/", num_labels=n_labels, use_cuda=False)

In [None]:
# Use the fine-tuned model to predict the testing labels and save the raw model outputs:
_, raw_pred = sent_model.predict(y)

In [None]:
# Convert raw model outputs to class probabilities:
probabilities = softmax(raw_pred, axis=1)

In [None]:
# Asssess probabilities:
probabilities

In [None]:
# Binarize probabilities to the most probable class:
binary_preds = [np.argmax(pred) for pred in probabilities] 

In [None]:
# Inspect length of predictions:
len(binary_preds)

# Results

In [None]:
# Print classification report:
print(classification_report(test_sub.label, binary_preds))

# Print confusion matrix:
confusion_matrix(test_sub.label, binary_preds)