# Importing Packages

In [1]:
# Load packages for data wrangling:
import os
import glob
import numpy as np
import pandas as pd

# Load packages for fine-tuning BERT model:
from simpletransformers.classification import ClassificationModel

# Load scikit-learn train_test_split:
from sklearn.model_selection import train_test_split

# Load classification metrics:
from sklearn.metrics import (accuracy_score, recall_score, precision_score, f1_score,
                            classification_report,confusion_matrix)

# Load softmax for converting raw model outpus to probabilities:
from scipy.special import softmax

# Load packages for data cleaning:
import string
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk import sent_tokenize, word_tokenize

# Set stopword corpus
stopword = nltk.corpus.stopwords.words('english')

# Set NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/ucloud/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/ucloud/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /home/ucloud/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ucloud/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Preprocessing of dataset 1

## Data loading and wrangling

In [2]:
# Loading data with fake news as pandas dataframe:
fake_df = pd.read_csv(os.path.join("data", "dataset_1", "Fake.csv"))

# Loading data with true news as pandas dataframe:
true_df = pd.read_csv(os.path.join("data", "dataset_1", "True.csv"))

In [3]:
# Inspecting fake data:
fake_df.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017"
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017"
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017"
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017"
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017"


In [4]:
# Inspecting true data:
true_df.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [5]:
# Adding category-labels to each dataset: 
fake_df["label"]="fake"
true_df["label"]="true"

In [6]:
# Merge fake- and true news into a single dataframe:
merged_df = pd.concat([true_df, fake_df])

In [7]:
# Assessing whether merge was succesful:
len(true_df) + len(fake_df) == len(merged_df)

True

## Data cleaning

### Removing bad rows

In [8]:
# Remove rows with only whitespace and replace it with NaN:
merged_df.replace(" ", float("NaN"), inplace=True)

# Remove NA's:
merged_df.dropna(subset = ["text"], inplace=True)

In [9]:
# Remove duplicate texts:
merged_df = merged_df.drop_duplicates(subset=['text'])

In [10]:
# Reset indices:
merged_df = merged_df.reset_index()

In [11]:
# Selecting only relevant columns:
merged_df = merged_df[["text", "label"]]

### Regex

#### Remove "[city name] Reuters - " from true articles

In [12]:
# Define regex pattern:
pattern = r".*\(Reuters\) - "

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

#### Remove hashtags

In [13]:
# Define regex pattern:
pattern = r"#(\S+)"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

#### Remove twitter tags ("@[username]")

In [14]:
# Define regex pattern:
pattern = r"@(\S+)"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

#### Remove '(CAPSLOCK)' e.g. from (VIDEO); something which was quite frequent in the fake news dataset

In [15]:
# Define regex pattern:
pattern = r"\([A-Z]*\)"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

#### Remove systematic patterns:

In [16]:
# Define regex pattern:
pattern = r"The following statement.*accuracy[.]"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

In [17]:
# Define regex pattern:
pattern = r"pic\.twitter\.com\/.* "

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

### Remove punctuation

In [18]:
# Define function:
def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

In [19]:
merged_df['text']=merged_df['text'].apply(lambda x: remove_punctuation(x))

### Tokenization + Lower

In [20]:
# Define function:
def tokenize(text):
    split=re.split("\W+",text) 
    return split

In [21]:
merged_df['tokenized']=merged_df['text'].apply(lambda x: tokenize(x.lower()))

### Remove stopwords

In [22]:
# Define function:
def remove_stopwords(text):
    text=[words for words in text if words not in stopword]
    #text=' '.join(text)
    return text

In [23]:
merged_df['tokenized'] = merged_df['tokenized'].apply(lambda x: remove_stopwords(x))

### Lemmatize

In [24]:
# Define function:
def penn2morphy(penntag):
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

In [25]:
for i in range(len(merged_df['tokenized'])):
    tagged = pos_tag(merged_df['tokenized'][i])
    merged_df['tokenized'][i] = [lemmatizer.lemmatize(word, pos=penn2morphy(tag)) for word, tag in tagged]

### Concatenate tokens into sentences

In [26]:
# Define function:
def concat(text):
    text=[words for words in text]
    text=' '.join(text)
    return text

In [27]:
merged_df['text'] = merged_df['tokenized'].apply(lambda x: concat(x))

### Remove newly induced empty columns

In [28]:
merged_df.replace(" ", float("NaN"), inplace=True)

merged_df.dropna(subset = ["text"], inplace=True)

In [29]:
merged_df = merged_df.reset_index()

### Assess whether we have missed anything

In [30]:
true_idx = merged_df[merged_df['label']=="true"].index.tolist()
fake_idx = merged_df[merged_df['label']=="fake"].index.tolist()

In [31]:
from collections import Counter
Counter(" ".join(merged_df['text'][true_idx]).split()).most_common(10)

[('say', 113426),
 ('trump', 53621),
 ('u', 40552),
 ('state', 36143),
 ('would', 31145),
 ('president', 26582),
 ('republican', 20154),
 ('government', 19171),
 ('year', 18520),
 ('house', 16787)]

In [32]:
from collections import Counter
Counter(" ".join(merged_df['text'][fake_idx]).split()).most_common(10)

[('trump', 58413),
 ('say', 36515),
 ('people', 19204),
 ('president', 18091),
 ('go', 17802),
 ('would', 17078),
 ('make', 16956),
 ('one', 16919),
 ('state', 16195),
 ('get', 14812)]

### Remove newly found systematic patterns

In [33]:
# Define regex pattern:
pattern = r"21st century wire say"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [34]:
# Define regex pattern:
pattern = r"21st century wire"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [35]:
# Define regex pattern:
pattern = r"filessupport.*"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [36]:
# Define regex pattern:
pattern = r"21wire"

for i in range(len(merged_df['text'])):
    merged_df['text'][i] = re.sub(pattern, '', merged_df['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [37]:
from collections import Counter
Counter(" ".join(merged_df['text'][fake_idx]).split()).most_common(10)

[('trump', 58413),
 ('say', 36151),
 ('people', 19204),
 ('president', 18091),
 ('go', 17802),
 ('would', 17078),
 ('make', 16956),
 ('one', 16919),
 ('state', 16195),
 ('get', 14812)]

## Saving and loading cleaned dataset 1

### Write dataframe to csv-file

In [38]:
# Selecting only relevant columns
merged_df = merged_df[["text", "label"]]

In [39]:
# Write to csv
merged_df.to_csv(os.path.join("data", "generated_data", "cleaned_dataset_1.csv"), index=False)

# BERT trained- and evaluated on dataset 1

### Load cleaned data and prepare for classification

In [40]:
cleaned_dataset_1 = pd.read_csv(os.path.join("data", "generated_data", "cleaned_dataset_1.csv"))

__One row is corrupted when loading CSV and is turned into blank space. This is removed__

In [41]:
cleaned_dataset_1.replace(" ", float("NaN"), inplace=True)

cleaned_dataset_1.dropna(subset = ["text"], inplace=True)

__Create training-, validiation and testing dataset:__

In [42]:
# Create train/test split with 20% of all articles in testing data:
train_1, test_1 = train_test_split(cleaned_dataset_1, test_size=0.2)

In [43]:
# Create train/val split with 10% of remaining articles in validation data:
train_1, val_1 = train_test_split(train_1, test_size=0.1)

In [44]:
# Assess that split was successful:
len(train_1) + len(val_1) + len(test_1) == len(cleaned_dataset_1)

True

In [45]:
# Convert label column to binary integer (0 = true, 1 = fake):
train_1["label"] = np.where(train_1["label"] == "true", 0,1)
val_1["label"] = np.where(val_1["label"] == "true", 0,1)
test_1["label"] = np.where(test_1["label"] == "true", 0,1) 

In [46]:
# Inspecting transformed training data:
train_1.head(10)

Unnamed: 0,text,label
16115,kenya president uhuru kenyatta 96 percent vote...,0
28437,hillary clinton step fight senate republican r...,1
12600,britain withdrawal agreement european union mu...,0
16940,british prime minister theresa may yet set dat...,0
25804,libertarian convince presidential ticket get s...,1
10152,proposal raise california minimum wage 15 hour...,0
11197,u president barack obama tuesday pledge undert...,0
9683,u senate thursday confirm president barack oba...,0
12964,philippine president rodrigo duterte order pol...,0
21575,employee work north carolina mcdonald capture ...,1


In [47]:
# Inspecting transformed validation data:
val_1.head(10)

Unnamed: 0,text,label
17223,protege outgo prorussian leader almazbek atamb...,0
32165,dc antifa leader move turkey man meet surprise...,1
13078,republican u senator lindsey graham sunday urg...,0
20803,u president donald trump agree principle scrap...,0
26117,democratic member congressional black caucus c...,1
35077,far left publication like democracy global res...,1
33992,harry reid disrespectful comment another examp...,1
20158,british lawmaker tuesday vote favor government...,0
33287,one presidentelect donald trump potential cabi...,1
26223,speech monday donald trump propose absurd idea...,1


In [49]:
# Assess that data is roughly balanced across categories:
train_1.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,15302
1,12520


In [50]:
# Assess that data is rpughly balanced across categories:
val_1.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,1702
1,1390


In [51]:
# Assess that data is roughly balanced across categories:
test_1.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,4187
1,3542


In [52]:
# Define number of unique labels:
n_labels = len(train_1['label'].unique())

In [53]:
# Create list of texts to predict:
X_dataset_1 = test_1['text'].tolist()

In [54]:
# Inspect length
len(X_dataset_1)

7729

## Training

In [19]:
# Initialize the model with the specified hyperparameters:
FN_model_1 = ClassificationModel('bert',"bert-base-uncased",
                                 num_labels=n_labels, use_cuda=False,
                                 args={'reprocess_input_data': True, 'overwrite_output_dir': True,
                                       "num_train_epochs": 3, "max_seq_length": 512, "train_batch_size": 128,
                                       "learning_rate": 1e-5})

# Fine-tune the model:
FN_model_1.train_model(train_1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/27822 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/218 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/218 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/218 [00:00<?, ?it/s]

(654, 0.06579963080759448)

## Predictions

In [55]:
# Loading trained model, so we don't have to rerun the training each time we restart the kernel:
FN_model_1 = ClassificationModel("bert", "outputs_dataset_1/", num_labels=n_labels, use_cuda=False)

In [56]:
# Use the fine-tuned model to predict the testing labels and save the raw model outputs:
_, raw_pred = FN_model_1.predict(X_dataset_1)

  0%|          | 0/7729 [00:00<?, ?it/s]

  0%|          | 0/967 [00:00<?, ?it/s]

In [57]:
# Convert raw model outputs to class probabilities:
probabilities = softmax(raw_pred, axis=1)

In [58]:
# Asssess probabilities:
probabilities

array([[4.79282272e-04, 9.99520718e-01],
       [9.99563380e-01, 4.36620433e-04],
       [4.79260344e-04, 9.99520740e-01],
       ...,
       [3.38563774e-03, 9.96614362e-01],
       [5.35239229e-04, 9.99464761e-01],
       [9.99326263e-01, 6.73736558e-04]])

In [59]:
# Binarize probabilities to the most probable class:
binary_preds = [np.argmax(pred) for pred in probabilities] 

In [60]:
# Inspect length of predictions:
len(binary_preds)

7729

## Results

In [61]:
# Print classification report:
print(classification_report(test_1.label, binary_preds))

# Print confusion matrix:
confusion_matrix(test_1.label, binary_preds)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4187
           1       1.00      1.00      1.00      3542

    accuracy                           1.00      7729
   macro avg       1.00      1.00      1.00      7729
weighted avg       1.00      1.00      1.00      7729



array([[4182,    5],
       [  13, 3529]])

# Preprocess dataset 2

## Data loading and wrangling

In [62]:
file_list = glob.glob(os.path.join(os.getcwd(), "data", "dataset_2", "fake", "*.txt"))

fake = []

for file_path in file_list:
    with open(file_path, encoding='windows-1252') as f_input:
        encoded_f = f_input.read().replace("\n", " ")
        fake.append(encoded_f)

In [63]:
file_list = glob.glob(os.path.join(os.getcwd(), "data", "dataset_2", "real", "*.txt"))

real = []

for file_path in file_list:
    with open(file_path, encoding='windows-1252') as f_input:
        encoded_f = f_input.read().replace("\n", " ")
        real.append(encoded_f)

## Data cleaning

In [64]:
# Remove \ from the data:
for i in range(len(fake)):
    fake[i] = fake[i].replace("\'", "")

In [65]:
# Remove \ from the data:
for i in range(len(real)):
    real[i] = real[i].replace("\'", "")

In [66]:
# Convert data to pandas dataframe:
fake_new = pd.DataFrame(fake)

In [67]:
# Rename column with texts to text:
fake_new = fake_new.rename({0: "text"},axis = 'columns')

In [68]:
# Add label-column with fake labels:
fake_new['label'] = 'fake'

In [69]:
# Convert data to pandas dataframe:
real_new = pd.DataFrame(real)

In [70]:
# Rename column with texts to text:
real_new = real_new.rename({0: "text"},axis = 'columns')

In [71]:
# Add label-column with fake labels:
real_new['label'] = 'true'

In [72]:
# Merge fake- and true news into a single dataframe:
merged_new = pd.concat([fake_new, real_new])

In [73]:
# Reset indeces:
merged_new = merged_new.reset_index()

In [74]:
# Selecting only relevant columns:
merged_new = merged_new[["text", "label"]]

In [75]:
# Inspecting:
merged_new 

Unnamed: 0,text,label
0,The warranty on ‘Make America Great Again’ bas...,fake
1,"Calling it a total disaster, president-elect D...",fake
2,"WASHINGTON, D.C. – Former presidential inter...",fake
3,President Barack Obama’s legacy might soon be ...,fake
4,"atican City – In a final speech to the synod, ...",fake
...,...,...
246,WASHINGTON — Republicans are united on repeali...,true
247,President-elect Donald Trump escalated his rhe...,true
248,Congress is preparing to do major battle next ...,true
249,"PALM BEACH, Fla. -- President-elect Donald Tru...",true


### Remove punctuation

In [76]:
merged_new['text']=merged_new['text'].apply(lambda x: remove_punctuation(x))

### Tokenize and lower

In [77]:
merged_new['tokenized']=merged_new['text'].apply(lambda x: tokenize(x.lower()))

### Remove stopwords

In [78]:
merged_new['tokenized'] = merged_new['tokenized'].apply(lambda x: remove_stopwords(x))

### Lemma

In [79]:
for i in range(len(merged_new['tokenized'])):
    tagged = pos_tag(merged_new['tokenized'][i])
    merged_new['tokenized'][i] = [lemmatizer.lemmatize(word, pos=penn2morphy(tag)) for word, tag in tagged]

### Concatenate tokens into sentences

In [80]:
merged_new['text'] = merged_new['tokenized'].apply(lambda x: concat(x))

### Write dataframe to csv-file

In [81]:
# Selecting only relevant columns:
merged_new = merged_new[["text", "label"]]

In [82]:
# Write to csv:
merged_new.to_csv(os.path.join("data", "generated_data", "cleaned_dataset_2.csv"), index=False)

# BERT trained on dataset 1, evaluated on dataset 2


### Load cleaned data

In [83]:
cleaned_dataset_2 = pd.read_csv(os.path.join("data", "generated_data", "cleaned_dataset_2.csv"))

In [84]:
# Change labels to binary integers:
cleaned_dataset_2["label"] = np.where(cleaned_dataset_2["label"] == "true", 0,1)

In [85]:
# Define number of unique labels:
n_labels = len(cleaned_dataset_2['label'].unique())

In [86]:
# Create list of texts to predict:
X_dataset_2 = cleaned_dataset_2['text'].tolist()

In [87]:
# Use the 1st fine-tuned model to predict dataset 2 save the raw model outputs:
_, raw_pred = FN_model_1.predict(X_dataset_2)

  0%|          | 0/251 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

In [88]:
# Convert raw model outputs to class probabilities:
probabilities = softmax(raw_pred, axis=1)

In [89]:
# Binarize probabilities to the most probable class:
binary_preds = [np.argmax(pred) for pred in probabilities] 

In [90]:
# Inspect length of predictions:
len(binary_preds)

251

In [91]:
# Print classification report:
print(classification_report(cleaned_dataset_2.label, binary_preds))

# Print confusion matrix:
confusion_matrix(cleaned_dataset_2.label, binary_preds)

              precision    recall  f1-score   support

           0       0.59      0.62      0.61       128
           1       0.58      0.56      0.57       123

    accuracy                           0.59       251
   macro avg       0.59      0.59      0.59       251
weighted avg       0.59      0.59      0.59       251



array([[79, 49],
       [54, 69]])

# BERT trained- and evaluated dataset 2:

In [92]:
# Create train/test split with 20% of all articles in testing data:
train_2, test_2 = train_test_split(cleaned_dataset_2, test_size=0.2)

In [93]:
# Create list of texts to predict:
X_dataset_2 = test_2['text'].tolist()

In [94]:
# Define number of unique labels:
n_labels = len(train_2['label'].unique())

In [115]:
# Initialize the model with the specified hyperparameters:
FN_model_2 = ClassificationModel('bert',"bert-base-uncased",
                                 num_labels=n_labels, use_cuda=False,
                                 args={'reprocess_input_data': True, 'overwrite_output_dir': True,
                                       "num_train_epochs": 3, "max_seq_length": 512, "train_batch_size": 16,
                                       "learning_rate": 1e-5})

# Fine-tune the model:
FN_model_2.train_model(train_2)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/13 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/13 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/13 [00:00<?, ?it/s]

(39, 0.6268747601753626)

In [95]:
# Loading trained model, so we don't have to rerun the training each time we restart the kernel:
FN_model_2 = ClassificationModel("bert", "outputs_dataset_2/", num_labels=n_labels, use_cuda=False)

In [96]:
# Use the fine-tuned model to predict the testing labels and save the raw model outputs:
_, raw_pred = FN_model_2.predict(X_dataset_2)

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [97]:
# Convert raw model outputs to class probabilities:
probabilities = softmax(raw_pred, axis=1)

In [98]:
# Binarize probabilities to the most probable class:
binary_preds = [np.argmax(pred) for pred in probabilities] 

In [99]:
# Inspect length of predictions:
len(binary_preds)

51

In [100]:
# Print classification report:
print(classification_report(test_2.label, binary_preds))

# Print confusion matrix:
confusion_matrix(test_2.label, binary_preds)

              precision    recall  f1-score   support

           0       0.86      0.72      0.78        25
           1       0.77      0.88      0.82        26

    accuracy                           0.80        51
   macro avg       0.81      0.80      0.80        51
weighted avg       0.81      0.80      0.80        51



array([[18,  7],
       [ 3, 23]])

# BERT trained on dataset 2, evaluated on dataset 1

In [101]:
# Use the fine-tuned model to predict the testing labels from dataset 1 and save the raw model outputs:
_, raw_pred = FN_model_2.predict(X_dataset_1)

  0%|          | 0/7729 [00:00<?, ?it/s]

  0%|          | 0/967 [00:00<?, ?it/s]

In [102]:
# Convert raw model outputs to class probabilities:
probabilities = softmax(raw_pred, axis=1)

In [103]:
# Binarize probabilities to the most probable class:
binary_preds = [np.argmax(pred) for pred in probabilities] 

In [104]:
# Inspect length of predictions:
len(binary_preds)

7729

In [105]:
# Print classification report:
print(classification_report(test_1.label, binary_preds))

# Print confusion matrix:
confusion_matrix(test_1.label, binary_preds)

              precision    recall  f1-score   support

           0       0.67      0.30      0.41      4187
           1       0.50      0.83      0.62      3542

    accuracy                           0.54      7729
   macro avg       0.58      0.56      0.52      7729
weighted avg       0.59      0.54      0.51      7729



array([[1237, 2950],
       [ 618, 2924]])

# Periods - for temporal word embedding analysis

## Data wrangling

In [106]:
import numpy as np
import regex as re
from datetime import *

In [107]:
# Load data:
fake = pd.read_csv(os.path.join("data", "dataset_1", "Fake.csv"))

# NA for wrong entries:
fake["date"] = [re.sub("^.*:.*|^.* .* .* .*|^\d.*", "NA", date) for date in fake["date"]] # All webpages, entries that start with a number and sequences of words upon words upon words should be NA

# Drop rows with NAs:
fake = fake[(fake!='NA').all(1)]

# Streamline dates:
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
for i in months:
    fake["date"] = [re.sub(f"^{i}", f"{i[0:3]}", date) for date in fake["date"]]

# Convert to date format:
fake["date"] = [datetime.strptime(date, "%b %d, %Y").date() for date in fake["date"]]

In [108]:
# Find date range:
date_range = max(fake["date"]) - min(fake["date"])

In [109]:
# Create categorical variable pertaining to split:
period = []
for date in fake["date"]:
    if date <= min(fake["date"]) + date_range/5:
        period.append(1)
    if date > min(fake["date"]) + date_range/5 and date <= min(fake["date"]) + date_range/5*2:
        period.append(2)
    if date > min(fake["date"]) + date_range/5*2 and date <= min(fake["date"]) + date_range/5*3:
        period.append(3)
    if date > min(fake["date"]) + date_range/5*3 and date <= min(fake["date"]) + date_range/5*4:
        period.append(4)
    if date > min(fake["date"]) + date_range/5*4:
        period.append(5)

In [110]:
# Create column with periods:
fake["period"] = period

In [111]:
# Ensure that the unique entries in the period-column is correct:
fake['period'].unique()

array([5, 4, 3, 2, 1])

In [112]:
# Write data to csv:
fake.to_csv(os.path.join("data", "generated_data", "fake_periods.csv"), index=False)

In [113]:
# Load data from csv:
fake = pd.read_csv(os.path.join("data", "generated_data", "fake_periods.csv"))

## Data cleaning

In [114]:
# Remove rows with only whitespace and replace it with NA's
fake.replace(" ", float("NaN"), inplace=True)

# Remove NA's
fake.dropna(subset = ["text"], inplace=True)

In [115]:
# Remove duplicate texts:
fake = fake.drop_duplicates(subset=['text'])

In [116]:
# Reset indeces:
fake = fake.reset_index()

### Remove reuters

In [117]:
# Define regex pattern:
pattern = r".*\(Reuters\) - "

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Remove hashtags

In [118]:
# Define regex pattern:
pattern = r"#(\S+)"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Remove tags

In [119]:
# Define regex pattern:
pattern = r"@(\S+)"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Remove (capslock)

In [120]:
# Define regex pattern:
pattern = r"\([A-Z]*\)"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Remove systematic patterns

In [122]:
# Define regex pattern:
pattern = r"The following statement.*accuracy[.]"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [123]:
# Define regex pattern:
pattern = r"pic\.twitter\.com\/.* "

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Remove punctuation:

In [124]:
fake['text']=fake['text'].apply(lambda x: remove_punctuation(x))

### Tokenize and lower

In [125]:
fake['tokenized']=fake['text'].apply(lambda x: tokenize(x.lower()))

### Remove stopwords

In [126]:
fake['tokenized'] = fake['tokenized'].apply(lambda x: remove_stopwords(x))

### Lemmatize

In [127]:
for i in range(len(fake['tokenized'])):
    tagged = pos_tag(fake['tokenized'][i])
    fake['tokenized'][i] = [lemmatizer.lemmatize(word, pos=penn2morphy(tag)) for word, tag in tagged]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Concatenate words

In [129]:
fake['text'] = fake['tokenized'].apply(lambda x: concat(x))

### Remove newly induced empty columns

In [130]:
fake.replace(" ", float("NaN"), inplace=True)

fake.dropna(subset = ["text"], inplace=True)

In [131]:
fake = fake.reset_index()

### Remove more systematic patterns

In [132]:
# Define regex pattern:
pattern = r"21st century wire say"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [133]:
# Define regex pattern:
pattern = r"21st century wire"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [134]:
# Define regex pattern:
pattern = r"filessupport.*"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [135]:
# Define regex pattern:
pattern = r"21wire"

for i in range(len(fake['text'])):
    fake['text'][i] = re.sub(pattern, '', fake['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


## Write data for word-embedding analysis

In [136]:
# Concatenate
period_texts = []
for i in range(1, 6):
    period_text = " ".join(fake.loc[fake['period'] == i]["text"])
    period_texts.append(period_text)
    
# Write as .txt files
for i, n in zip(period_texts, range(1,6)):
    text_file = open(os.path.join("word_embeddings", "output", "texts", f"00{n}0.txt"), "w")
    n = text_file.write(i)
    text_file.close()