In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score,precision_recall_fscore_support,classification_report
from sklearn.decomposition import PCA

stopwords = stopwords.words('english')

In [2]:
# getting the extracted news as df
data = pd.read_csv('extracted_data.csv').drop('Unnamed: 0', axis=1)
data.columns = ['url','section','location','headline','text','label']


# getting the extracted test news as df
data_test = pd.read_csv('extracted_test_data.csv').drop('Unnamed: 0', axis=1)
data_test.columns = ['url','section','location','headline','text','label']

In [3]:
# merging headline and content as another column and create another dataframe with jusst text and label.
df = pd.DataFrame([data.headline+". "+data.text,data.label]).transpose()
df.columns = ['text','label']


# merging headline and content as another column and create another dataframe with jusst text and label.
df_test = pd.DataFrame([data_test.headline+". "+data_test.text,data_test.label]).transpose()
df_test.columns = ['text','label']

In [4]:
# preprocessing function 
    # split an article into sentences
        # go to each sentence and split it to words
            # if this word  is not in stopwords or other common words I've decided
                #AND
                   # if its alphabetic (getting rid of puctuation and numbers)
                        #AND
                            # if len of the word is greater than 2
                            
                            # lemmatize and lowercase the the word
                            
                            # return the cleaned article
def preprocess(news):
    l = WordNetLemmatizer()
    sentences = news.split(".")
    return " ".join([l.lemmatize(word.lower()) for sentence in sentences for word in sentence.split() if word not in stopwords if word.isalpha() if len(word)> 2 if word.lower() not in ["said","the","first","also","would","one","two","they"]])

In [5]:
# apply preprocess() function to each article
df['text'] = df['text'].map(preprocess)



# apply preprocess() function to each article
df_test['text'] = df_test['text'].map(preprocess)

In [8]:
# getting protest and not_protest news
protest_news = df[df.label == 1].text
not_protest_news = df[df.label == 0].text

In [9]:
# bayes theorem. 
    # finding the probability for being protest or not protest for an article which includes given word
def word_affect(word):
    number_of_occurance_in_protest_list = 0
    for article in protest_news:
        for w in article.split():
            if w == word:
                number_of_occurance_in_protest_list +=1
                break 
                
    number_of_occurance_in_not_protest_list = 0
    for article in not_protest_news:
        for w in article.split():
            if w == word:
                number_of_occurance_in_not_protest_list +=1
                break       

    
    pi1 = len(protest_news)/(len(protest_news)+len(not_protest_news))
    pi2 = len(not_protest_news)/(len(protest_news)+len(not_protest_news))
    fkx = number_of_occurance_in_protest_list/len(protest_news)
    fkx_ = number_of_occurance_in_not_protest_list/len(not_protest_news)
    prob = (pi1*fkx)/((pi1*fkx)+(pi2*fkx_))
    
    
    #returns (probability of protest, probability of not protest)
        # this is might seem counterintutitive with word freqs but we must remember the ratio of news
    return(prob,1-prob)

In [10]:
word_affect("protest")

(0.7532467532467533, 0.24675324675324672)

In [11]:
articles = df.text
labels = df.label


test_articles =  df_test.text
test_labels = df_test.label

In [90]:
vectorizer = CountVectorizer(min_df= 10, ngram_range=(1, 1))

In [91]:
tfidf_articles = vectorizer.fit_transform(articles).toarray().astype('float64')

In [92]:
tfidf_articles.shape

(581, 1227)

In [112]:
protest_target_words = []
non_protest_target_words = []
for word in list(vectorizer.vocabulary_.keys()):
    if word_affect(word)[0]-word_affect(word)[1] > 0.2:
        protest_target_words.append((word,(word_affect(word)[0],word_affect(word)[1])))
        
    elif word_affect(word)[1]-word_affect(word)[0] > 0.3 :
        non_protest_target_words.append((word,(word_affect(word)[0],word_affect(word)[1])))
        
        
len(protest_target_words),len(non_protest_target_words)

(41, 943)

In [113]:
protest_target_words

[('gathering', (0.7272727272727273, 0.2727272727272727)),
 ('protest', (0.7532467532467533, 0.24675324675324672)),
 ('arrest', (0.6956521739130435, 0.30434782608695654)),
 ('deployed', (0.7999999999999999, 0.20000000000000007)),
 ('killed', (0.7272727272727273, 0.2727272727272727)),
 ('rally', (0.7999999999999999, 0.20000000000000007)),
 ('incident', (0.6785714285714286, 0.3214285714285714)),
 ('tdp', (0.6875, 0.3125)),
 ('demanding', (0.7659574468085106, 0.23404255319148937)),
 ('staged', (0.8333333333333334, 0.16666666666666663)),
 ('tried', (0.6470588235294118, 0.3529411764705882)),
 ('protester', (0.9333333333333332, 0.06666666666666676)),
 ('opposing', (0.7, 0.30000000000000004)),
 ('clash', (0.75, 0.25)),
 ('allegation', (0.7999999999999999, 0.20000000000000007)),
 ('injury', (0.7, 0.30000000000000004)),
 ('reached', (0.7142857142857143, 0.2857142857142857)),
 ('magistrate', (0.6153846153846154, 0.3846153846153846)),
 ('seeking', (0.611111111111111, 0.38888888888888895)),
 ('acti

In [126]:
results = []
for article in articles:
    if len(set(article.split()))-len(set(article.split())-set([i[0] for i in protest_target_words]))>1:
        results.append(1)
    else:
        results.append(0)

In [127]:
print(classification_report(results,list(labels)))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       399
           1       0.92      0.74      0.82       182

    accuracy                           0.90       581
   macro avg       0.91      0.86      0.88       581
weighted avg       0.90      0.90      0.90       581



In [128]:
confusion_matrix(results,list(labels))

array([[388,  11],
       [ 47, 135]], dtype=int64)

In [129]:
results = []
for article in test_articles:
    if len(set(article.split()))-len(set(article.split())-set([i[0] for i in protest_target_words]))>1:
        results.append(1)
    else:
        results.append(0)

In [130]:
print(classification_report(results,list(test_labels)))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       139
           1       0.88      0.80      0.83        54

    accuracy                           0.91       193
   macro avg       0.90      0.88      0.89       193
weighted avg       0.91      0.91      0.91       193



In [131]:
confusion_matrix(results,list(test_labels))

array([[133,   6],
       [ 11,  43]], dtype=int64)

In [132]:
WORDS = [i[0] for i in protest_target_words]

In [135]:
vectors=[]
for article in articles:
    article_words = list(set(article.split()))
    
    article_vector = []
    for word in WORDS:
        if word in article_words:
            article_vector.append(1)
        else:
            article_vector.append(0)
    vectors.append(article_vector)
    

In [None]:
t_vectors=[]
for article in test_articles:
    article_words = list(set(article.split()))
    
    article_vector = []
    for word in WORDS:
        if word in article_words:
            article_vector.append(1)
        else:
            article_vector.append(0)
    t_vectors.append(article_vector)

In [155]:
print(classification_report(SVC().fit(pd.DataFrame(vectors),list(labels)).predict(pd.DataFrame(t_vectors)),list(test_labels)))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       148
           1       0.76      0.82      0.79        45

    accuracy                           0.90       193
   macro avg       0.85      0.87      0.86       193
weighted avg       0.90      0.90      0.90       193



In [157]:
svc_model = SVC()

hyperparameters= dict(kernel=["linear", "poly","sigmoid"],
                      C=np.logspace(0, 4, 5), 
                      class_weight=[{1:1,0:1},{1:2,0:1},{1:3,0:1},{1:3,0:0.5},{1:4,0:0.5},{1:4,0:1}])

svc_grid = GridSearchCV(svc_model, hyperparameters, cv=5,scoring='f1_macro')

best_svc = svc_grid.fit(pd.DataFrame(vectors),list(labels))

best_svc.best_score_

0.8552187194886715

In [158]:
best_svc.best_params_

{'C': 100.0, 'class_weight': {1: 2, 0: 1}, 'kernel': 'linear'}

In [159]:
print(classification_report(best_svc.predict(pd.DataFrame(t_vectors)),list(test_labels)))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       144
           1       0.82      0.82      0.82        49

    accuracy                           0.91       193
   macro avg       0.88      0.88      0.88       193
weighted avg       0.91      0.91      0.91       193



In [161]:
logistic_model = LogisticRegression()


hyperparameters = dict(C= np.logspace(0, 4, 10),
                       class_weight = [{1:1,0:1},{1:2,0:1},{1:3,0:1},{1:3,0:0.5},{1:4,0:0.5},{1:4,0:1}])

logistic_grid = GridSearchCV(logistic_model, hyperparameters, cv=5,scoring='f1_macro')

best_logistic = logistic_grid.fit(pd.DataFrame(vectors),list(labels))

best_logistic.best_score_

0.8513533597414323

In [162]:
best_logistic.best_params_

{'C': 1.0, 'class_weight': {1: 4, 0: 1}}

In [163]:
print(classification_report(best_logistic.predict(pd.DataFrame(t_vectors)),list(test_labels)))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94       142
           1       0.86      0.82      0.84        51

    accuracy                           0.92       193
   macro avg       0.90      0.89      0.89       193
weighted avg       0.92      0.92      0.92       193



# NN Experiments

In [164]:
import torch
import torch.tensor as tensor
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

In [171]:
x_train = torch.from_numpy(pd.DataFrame(vectors).to_numpy()).float()
y_train = torch.from_numpy(np.array(list(labels))).float().view(len(labels),1)

x_test = torch.from_numpy(pd.DataFrame(t_vectors).to_numpy()).float()
y_test = torch.from_numpy(np.array(list(test_labels))).float().view(len(test_labels),1)

In [174]:
x_train.shape,y_train.shape,x_train.dtype,y_train.dtype

(torch.Size([581, 41]), torch.Size([581, 1]), torch.float32, torch.float32)

In [209]:
import random
shuffled_idx = [i for i in range(len(y_train))]
random.shuffle(shuffled_idx)

batch_size = 581

batches = []
for i in range(0,len(y_train),batch_size):

    indices= [shuffled_idx[i:i+batch_size]]

    batches.append([x_train[indices],y_train[indices]])

In [210]:
batches[0][0].shape,batches[0][1].shape

(torch.Size([581, 41]), torch.Size([581, 1]))

In [252]:
class ProtestClassifier(nn.Module):
    def __init__(self):
        super(ProtestClassifier,self).__init__()
        
        self.layer1 = nn.Linear(x_train.shape[1],64)
        self.drop1 = nn.Dropout(p=0.2)
        self.layer2 = nn.Linear(64,1)
        self.drop2 = nn.Dropout(p=0.2)
        self.layer3= nn.Linear(64,32)
        self.drop3 = nn.Dropout(p=0.2)
        self.layer4= nn.Linear(32,1)
        
    def forward(self,x):
        
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.drop1(x)
        x = self.layer2(x)
        x = torch.sigmoid(x)

        return x
        
    def predict(self,x):
        pred = self.forward(x)
        ans = []
        for t in pred:
            if t[0]>0.500001:
                ans.append(1)
            else:
                ans.append(0)
        return torch.tensor(ans)

In [253]:
#Initialize the model
model = ProtestClassifier()
#Define loss criterion
criterion = nn.BCELoss()
#Define the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [267]:
#Number of epochs
epochs = 70
#List to store losses
trn_losses = []
eval_losses = []
for i in range(1,epochs):
    for batch in batches:
        x = batch[0]
        y = batch[1]
        
        y_pred = model.forward(x)
        loss = criterion(y_pred,y)    

      
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if i % 10 == 0:
        evalloss = criterion(model.forward(x_test),y_test)
        print(i,'\t',"Training Loss:",round(loss.item(),4),"Eval Loss:",round(evalloss.item(),4))
        trn_losses.append(round(loss.item(),4))
        eval_losses.append(round(evalloss.item(),4))

evalloss = criterion(model.forward(x_test),y_test)
print("\n\nTraining Loss:",round(loss.item(),4),"Eval Loss:",round(evalloss.item(),4))

10 	 Training Loss: 0.214 Eval Loss: 0.2444
20 	 Training Loss: 0.2051 Eval Loss: 0.2511
30 	 Training Loss: 0.2124 Eval Loss: 0.2539
40 	 Training Loss: 0.2057 Eval Loss: 0.2472
50 	 Training Loss: 0.2166 Eval Loss: 0.2335
60 	 Training Loss: 0.2049 Eval Loss: 0.2412


Training Loss: 0.2139 Eval Loss: 0.2572


In [265]:
print(classification_report(model.predict(x_test),y_test))

              precision    recall  f1-score   support

           0       0.96      0.91      0.94       151
           1       0.73      0.86      0.79        42

    accuracy                           0.90       193
   macro avg       0.85      0.89      0.86       193
weighted avg       0.91      0.90      0.90       193

