# Data Filtering
In this part, we filter the data in order to obtain the quotes extracted from American news.

In [41]:
#Import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
!pip install tld
!pip install pandas pyarrow
from tld import get_tld
import json
import bz2
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from tqdm import tqdm
# Disable some pandas warnings we don't need
pd.options.mode.chained_assignment = None









The data is loaded here.

In [42]:
with bz2.open("data/quotes-cropped-with-country.json.bz2", 'rb') as s_file: 
    df_original = pd.read_json(s_file, lines=True)
df_original.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,country
0,2015-11-17-031545,"However, that had nothing to do with the blast...",Dawood Ibrahim,[Q553191],2015-11-17 13:32:58,3,"[[Dawood Ibrahim, 0.6072], [None, 0.338], [San...",[http://www.newindianexpress.com/nation/Part-o...,E,IN
1,2015-12-29-000719,91 percent of suspected terrorists who attempt...,Patrick Murphy,"[Q17418821, Q21170773, Q2224935, Q23418906, Q3...",2015-12-29 15:42:57,4,"[[Patrick Murphy, 0.8398], [None, 0.1579], [Je...",[http://politifact.com/florida/statements/2015...,E,US
2,2015-06-15-000921,a powerful and compelling motive to lie about ...,,[],2015-06-15 21:39:12,1,"[[None, 0.7388], [Joseph Thompson, 0.2313], [A...",[http://dailylocal.com/general-news/20150613/n...,E,
3,2015-12-08-003771,After the failure of the U.S. Congress to pass...,Roy Cooper,"[Q16106910, Q7372694, Q7372695]",2015-12-08 16:15:05,1,"[[Roy Cooper, 0.928], [None, 0.0574], [Frank P...",[http://digtriad.com/story/news/2015/12/08/sho...,E,
4,2015-01-09-024195,I got back inside and then they started shooti...,Herman Torres,[Q18285204],2015-01-09 03:39:35,1,"[[Herman Torres, 0.9218], [None, 0.0782]]",[http://www.wesh.com/news/orange-county-sherif...,E,US


Only US data is selected.

In [43]:
df_us=df_original[df_original['country']=='US']
df_us.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,country
1,2015-12-29-000719,91 percent of suspected terrorists who attempt...,Patrick Murphy,"[Q17418821, Q21170773, Q2224935, Q23418906, Q3...",2015-12-29 15:42:57,4,"[[Patrick Murphy, 0.8398], [None, 0.1579], [Je...",[http://politifact.com/florida/statements/2015...,E,US
4,2015-01-09-024195,I got back inside and then they started shooti...,Herman Torres,[Q18285204],2015-01-09 03:39:35,1,"[[Herman Torres, 0.9218], [None, 0.0782]]",[http://www.wesh.com/news/orange-county-sherif...,E,US
20,2015-11-26-034388,I'm letting him try to find it for himself. He...,Byron Scott,[Q538009],2015-11-26 18:51:56,3,"[[Byron Scott, 0.8244], [None, 0.1685], [Kobe ...",[http://www.latimes.com/sports/lakers/la-sp-la...,E,US
24,2015-07-30-026870,He described a shooting that looked nothing li...,Lawrence Middleton,[Q16091535],2015-07-30 05:00:00,12,"[[Lawrence Middleton, 0.8756], [None, 0.1233],...",[http://feeds.latimes.com/~r/features/books/~3...,E,US
26,2015-08-21-026312,"He was not armed, he did not have a gun, our i...",,[],2015-08-21 20:54:00,2,"[[None, 0.8635], [Sam Dotson, 0.1133], [Michae...",[http://www.latimes.com/nation/la-na-black-mis...,E,US


We select only quotes from US national news.

In [44]:
def not_international(xs):
    for x in xs:
        if not 'international' in x:
            return True
    return False
    
index=df_us.apply(lambda x : not_international(x['urls']),axis=1 )
df_us=df_us[index]
df_us.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,country
1,2015-12-29-000719,91 percent of suspected terrorists who attempt...,Patrick Murphy,"[Q17418821, Q21170773, Q2224935, Q23418906, Q3...",2015-12-29 15:42:57,4,"[[Patrick Murphy, 0.8398], [None, 0.1579], [Je...",[http://politifact.com/florida/statements/2015...,E,US
4,2015-01-09-024195,I got back inside and then they started shooti...,Herman Torres,[Q18285204],2015-01-09 03:39:35,1,"[[Herman Torres, 0.9218], [None, 0.0782]]",[http://www.wesh.com/news/orange-county-sherif...,E,US
20,2015-11-26-034388,I'm letting him try to find it for himself. He...,Byron Scott,[Q538009],2015-11-26 18:51:56,3,"[[Byron Scott, 0.8244], [None, 0.1685], [Kobe ...",[http://www.latimes.com/sports/lakers/la-sp-la...,E,US
24,2015-07-30-026870,He described a shooting that looked nothing li...,Lawrence Middleton,[Q16091535],2015-07-30 05:00:00,12,"[[Lawrence Middleton, 0.8756], [None, 0.1233],...",[http://feeds.latimes.com/~r/features/books/~3...,E,US
26,2015-08-21-026312,"He was not armed, he did not have a gun, our i...",,[],2015-08-21 20:54:00,2,"[[None, 0.8635], [Sam Dotson, 0.1133], [Michae...",[http://www.latimes.com/nation/la-na-black-mis...,E,US


The 'quotation' column is extracted, and we assign a default target -1 to each quote.
The target value depends on the correlation between the quotation and our context: the value 1 indicates that the quote is related to discussions about mass shooting or gun controls, 0 otherwise.

In [45]:
quotations=pd.DataFrame(df_us['quotation'])
quotations['target']=-1
quotations.head()

Unnamed: 0,quotation,target
1,91 percent of suspected terrorists who attempt...,-1
4,I got back inside and then they started shooti...,-1
20,I'm letting him try to find it for himself. He...,-1
24,He described a shooting that looked nothing li...,-1
26,"He was not armed, he did not have a gun, our i...",-1


The file is saved. It will be used in the prediction phase.

In [28]:
quotations.to_csv('data/us_quotations.csv',index=False)

# Natural Language Processing
In this phase, we train the Bert model to predict the quotations' target.

In [1]:
import random
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.utils import shuffle
from transformers import AdamW
import torch
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence

The DataLoader class is created to iteratively group the data into batches.

In [26]:
class DataLoader:
    def __init__(self, df, batch_size, device):
        labels = torch.tensor(df.label.to_list()).to(device)
        seq = pad_sequence(df.tensor.to_list(), batch_first=True).to(device)
        mask = torch.zeros_like(seq)  # Attention mask, so we don't infer on padding
        for i, j in enumerate(df.tensor):
            mask[i, 0:len(j)] = 1

        self.data = [(seq[b:b + batch_size],
                       mask[b:b + batch_size],
                       labels[b:b + batch_size]) for b in range(0, len(df), batch_size)]

    def __iter__(self):
        for b in self.data:
            yield b

    def __len__(self):
        return len(self.data)

We define the function to compute the accuracy rate.

In [33]:
def compute_accuracy(model, batch):
    model.eval()
    count = 0
    acc = 0
    for seq, mask, labels in tqdm(batch):
        pred = model(seq, attention_mask=mask)[0].argmax(axis=1)
        acc += (pred == labels).sum().item()
        count += len(labels)
    accuracy = acc / count
    print('accuracy:', accuracy)

In order to perform supervised learning, we manually classify and label 1500 quotes. Our training and validation sets are based on them.
As we can see from the summary, 1016 quotes are labelled with target 1 and 484 quotes have got target 0.

In [27]:
df = pd.read_csv('Data/us_quotations.csv')
df = df[df['label'] != -1]
print((df['label'] == 1).sum())
print((df['label'] == 0).sum())
df.head()

1016
484


Unnamed: 0,quotation,label
15,This is going to happen outside the gun indust...,1
97,It's all passion and desire while on location ...,0
167,NRA goes to court over local gun laws in Pa.,1
269,The facts of this tragedy remain under investi...,1
352,Was it your intention... to kill Mr. Torres wi...,1


Each quotation is tokenized with BertTokenizer.

In [28]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
df['tensor'] = df.quotation.apply(lambda x: torch.tensor(tokenizer.encode(x),
                                                             dtype=torch.long))
df['length'] = df.tensor.apply(len)
df = df[['label', 'tensor', 'length']]
df.head()

Unnamed: 0,label,tensor,length
15,1,"[tensor(101), tensor(1188), tensor(1110), tens...",29
97,0,"[tensor(101), tensor(1135), tensor(112), tenso...",56
167,1,"[tensor(101), tensor(151), tensor(9664), tenso...",14
269,1,"[tensor(101), tensor(1109), tensor(9193), tens...",65
352,1,"[tensor(101), tensor(3982), tensor(1122), tens...",18


The data is shuffled. Then, we take 1000 quotes into the training set, while the 500 remaining quotes compose the validation set.

In [29]:
df = shuffle(df)
train_data = df[:1000]
eval_data = df[1000:]
train_batch = DataLoader(train_data, 4, 'cuda')
eval_batch = DataLoader(eval_data, 4, 'cuda')

In [None]:
A pre-trained Bert model is loaded here. We choose to use AdamW optimizer with learning rate 1e-5.

In [30]:
model = BertForSequenceClassification.from_pretrained('bert-base-cased').to('cuda')
optimizer = AdamW(model.parameters(), lr=1e-5)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Now the model is trained and the accuracy rate is shown.

In [36]:
for epoch in range(1):
    acc_loss = 0
    print('Epoch:', epoch)
    model.train()
    for i, (seq, mask, labels) in enumerate(tqdm(train_batch)):
        optimizer.zero_grad()
        loss, pred = model(seq, attention_mask=mask, labels=labels)[:2]
        acc_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
    compute_accuracy(model, eval_batch)

Epoch: 0


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=125.0), HTML(value='')))


accuracy: 0.966


In [37]:
model.save_pretrained('bert_model')

The prediction is made on each quotation in our dataset, through the trained model.

In [39]:
df = pd.read_csv('Data/us_quotations.csv')
print('processing data')
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
df['tensor'] = df.quotation.apply(lambda x: torch.tensor(tokenizer.encode(x[:512]),
                                                         dtype=torch.long))

test_batch = DataLoader(df, 1, 'cuda')

model = BertForSequenceClassification.from_pretrained('bert_model').to('cuda')
predictions = []
model.eval()
print('start prediction')
for seq, mask, labels in tqdm(test_batch):
    pred = model(seq, attention_mask=mask)[0].argmax(axis=1)
    predictions.append(pred.item())

df['label'] = predictions
df[['quotation', 'label']].to_csv('data/us_predicted_quotations.csv', index=False)

processing data
start prediction


HBox(children=(FloatProgress(value=0.0, max=107457.0), HTML(value='')))




After the prediction phase, the assigned targets are merged to the original data.

In [49]:
df_us['label']=predictions
df_us.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,country,label
1,2015-12-29-000719,91 percent of suspected terrorists who attempt...,Patrick Murphy,"[Q17418821, Q21170773, Q2224935, Q23418906, Q3...",2015-12-29 15:42:57,4,"[[Patrick Murphy, 0.8398], [None, 0.1579], [Je...",[http://politifact.com/florida/statements/2015...,E,US,1
4,2015-01-09-024195,I got back inside and then they started shooti...,Herman Torres,[Q18285204],2015-01-09 03:39:35,1,"[[Herman Torres, 0.9218], [None, 0.0782]]",[http://www.wesh.com/news/orange-county-sherif...,E,US,1
20,2015-11-26-034388,I'm letting him try to find it for himself. He...,Byron Scott,[Q538009],2015-11-26 18:51:56,3,"[[Byron Scott, 0.8244], [None, 0.1685], [Kobe ...",[http://www.latimes.com/sports/lakers/la-sp-la...,E,US,0
24,2015-07-30-026870,He described a shooting that looked nothing li...,Lawrence Middleton,[Q16091535],2015-07-30 05:00:00,12,"[[Lawrence Middleton, 0.8756], [None, 0.1233],...",[http://feeds.latimes.com/~r/features/books/~3...,E,US,1
26,2015-08-21-026312,"He was not armed, he did not have a gun, our i...",,[],2015-08-21 20:54:00,2,"[[None, 0.8635], [Sam Dotson, 0.1133], [Michae...",[http://www.latimes.com/nation/la-na-black-mis...,E,US,1


We can notice that our dataset contains about 80 thousands of quotes related to discussions about shootings and gun controls, while the other ones are not related to our topics.

In [50]:
print((df_us['label']==1).sum())
print((df_us['label']==0).sum())

79158
28299


In [None]:
Some examples of related and unrelated quotes are printed.

In [55]:
df_us[df_us['label']==1]['quotation'].head()

1     91 percent of suspected terrorists who attempt...
4     I got back inside and then they started shooti...
24    He described a shooting that looked nothing li...
26    He was not armed, he did not have a gun, our i...
33    Law abiding Americans owning guns is not the p...
Name: quotation, dtype: object

In [57]:
df_us[df_us['label']==0]['quotation'].head()

20    I'm letting him try to find it for himself. He...
27    I didn't think of that at all. I should have s...
35    Maybe a lot of people don't know this, but Sea...
36    Measles vaccine in modified form also effectiv...
69    Studio licensing practices means it often take...
Name: quotation, dtype: object


The unrelated quotes are removed from the dataset.

In [58]:
df_us=df_us[df_us['label']==1]
df_us.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,country,label
1,2015-12-29-000719,91 percent of suspected terrorists who attempt...,Patrick Murphy,"[Q17418821, Q21170773, Q2224935, Q23418906, Q3...",2015-12-29 15:42:57,4,"[[Patrick Murphy, 0.8398], [None, 0.1579], [Je...",[http://politifact.com/florida/statements/2015...,E,US,1
4,2015-01-09-024195,I got back inside and then they started shooti...,Herman Torres,[Q18285204],2015-01-09 03:39:35,1,"[[Herman Torres, 0.9218], [None, 0.0782]]",[http://www.wesh.com/news/orange-county-sherif...,E,US,1
24,2015-07-30-026870,He described a shooting that looked nothing li...,Lawrence Middleton,[Q16091535],2015-07-30 05:00:00,12,"[[Lawrence Middleton, 0.8756], [None, 0.1233],...",[http://feeds.latimes.com/~r/features/books/~3...,E,US,1
26,2015-08-21-026312,"He was not armed, he did not have a gun, our i...",,[],2015-08-21 20:54:00,2,"[[None, 0.8635], [Sam Dotson, 0.1133], [Michae...",[http://www.latimes.com/nation/la-na-black-mis...,E,US,1
33,2015-12-07-058906,Law abiding Americans owning guns is not the p...,Ralph Peters,[Q7287957],2015-12-07 20:31:06,1,"[[Ralph Peters, 0.8094], [Stuart Varney, 0.142...",[http://www.ibtimes.co.uk/obama-such-total-py-...,E,US,1


In [60]:
df_us.drop('label',1).to_csv('data/us_quotes.csv',index=False)

  df_us.drop('label',1).to_csv('data/us_quotes.csv',index=False)
