In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score,precision_recall_fscore_support,classification_report
from sklearn.decomposition import PCA

stopwords = stopwords.words('english')

In [2]:
test = pd.read_csv('../data/test.csv')

In [11]:
test = test.iloc[:,-1:]

test.head(4)

Unnamed: 0,text
0,The US Supreme Court Monday refused to hear an...
1,Police are still searching for an East Europea...
2,Two sheep thieves in NWest court Molaole Monts...
3,A teenager and her younger brother were killed...


In [64]:
# getting the extracted news as df
data = pd.read_csv('../data/extracted_data.csv').drop('Unnamed: 0', axis=1)
data.columns = ['url','section','location','headline','text','label']


# getting the extracted test news as df
data_test = pd.read_csv('../data/extracted_test_data.csv').drop('Unnamed: 0', axis=1)
data_test.columns = ['url','section','location','headline','text','label']

In [65]:
# merging headline and content as another column and create another dataframe with jusst text and label.
df = pd.DataFrame([data.headline+". "+data.text,data.label]).transpose()
df.columns = ['text','label']


# merging headline and content as another column and create another dataframe with jusst text and label.
df_test = pd.DataFrame([data_test.headline+". "+data_test.text,data_test.label]).transpose()
df_test.columns = ['text','label']

In [66]:
# preprocessing function 
    # split an article into sentences
        # go to each sentence and split it to words
            # if this word  is not in stopwords or other common words I've decided
                #AND
                   # if its alphabetic (getting rid of puctuation and numbers)
                        #AND
                            # if len of the word is greater than 2
                            
                            # lemmatize and lowercase the the word
                            
                            # return the cleaned article
def preprocess(news):
    l = WordNetLemmatizer()
    sentences = news.split(".")
    return " ".join([l.lemmatize(word.lower()) for sentence in sentences for word in sentence.split() if word not in stopwords if word.isalpha() if len(word)> 2 if word.lower() not in ["said","the","first","also","would","one","two","they"]])

In [67]:
test['text'] =  test['text'].map(preprocess) 

# apply preprocess() function to each article
df['text'] = df['text'].map(preprocess)



# apply preprocess() function to each article
df_test['text'] = df_test['text'].map(preprocess)

df = pd.concat([df,df_test]).reset_index().drop('index',axis=1)

df.head(4)

Unnamed: 0,text,label
0,reader mail ruling alimony supreme court exten...,0
1,niqaab elephant say judge person may give evid...,0
2,grandparent honoured staff student global publ...,0
3,fisherman call strike country boat fisherman l...,1


In [68]:
# getting protest and not_protest news
protest_news = df[df.label == 1].text
not_protest_news = df[df.label == 0].text

In [69]:
# bayes theorem. 
    # finding the probability for being protest or not protest for an article which includes given word
def word_affect(word):
    number_of_occurance_in_protest_list = 0
    for article in protest_news:
        for w in article.split():
            if w == word:
                number_of_occurance_in_protest_list +=1
                break 
                
    number_of_occurance_in_not_protest_list = 0
    for article in not_protest_news:
        for w in article.split():
            if w == word:
                number_of_occurance_in_not_protest_list +=1
                break       

    
    pi1 = len(protest_news)/(len(protest_news)+len(not_protest_news))
    pi2 = len(not_protest_news)/(len(protest_news)+len(not_protest_news))
    fkx = number_of_occurance_in_protest_list/len(protest_news)
    fkx_ = number_of_occurance_in_not_protest_list/len(not_protest_news)
    prob = (pi1*fkx)/((pi1*fkx)+(pi2*fkx_))
    
    
    #returns (probability of protest, probability of not protest)
        # this is might seem counterintutitive with word freqs but we must remember the ratio of news
    return(prob,1-prob)

In [70]:
word_affect("protest")

(0.801980198019802, 0.19801980198019797)

In [98]:
articles = df.text
labels = df.label

train_articles,test_articles,train_label,test_label = train_test_split(articles,labels,test_size = 0.3, shuffle=True)
test_art =  test.text
#test_labels = df_test.label

In [100]:
vectorizer = CountVectorizer(min_df= 5, ngram_range=(1, 1))

In [101]:
tfidf_articles = vectorizer.fit_transform(train_articles).toarray().astype('float64')

In [102]:
tfidf_articles.shape

(541, 2249)

In [103]:
protest_target_words = []
non_protest_target_words = []
for word in list(vectorizer.vocabulary_.keys()):
    if word_affect(word)[0]-word_affect(word)[1] > 0.4:
        protest_target_words.append((word,(word_affect(word)[0],word_affect(word)[1])))
        
    elif word_affect(word)[1]-word_affect(word)[0] > 0.3 :
        non_protest_target_words.append((word,(word_affect(word)[0],word_affect(word)[1])))
        
        
len(protest_target_words),len(non_protest_target_words)

(56, 1742)

In [104]:
protest_target_words

[('policeman', (0.7333333333333334, 0.2666666666666666)),
 ('staged', (0.8275862068965517, 0.1724137931034483)),
 ('protest', (0.801980198019802, 0.19801980198019797)),
 ('demonstration', (0.9259259259259258, 0.07407407407407418)),
 ('protester', (0.9375, 0.0625)),
 ('mlas', (0.7999999999999999, 0.20000000000000007)),
 ('dharna', (0.9, 0.09999999999999998)),
 ('collectorate', (0.7692307692307693, 0.23076923076923073)),
 ('agitation', (0.7272727272727272, 0.2727272727272728)),
 ('activist', (0.7954545454545454, 0.20454545454545459)),
 ('mob', (0.75, 0.25)),
 ('rally', (0.8095238095238095, 0.19047619047619047)),
 ('demanding', (0.7580645161290324, 0.24193548387096764)),
 ('tense', (0.875, 0.125)),
 ('banner', (0.7777777777777777, 0.22222222222222232)),
 ('custody', (0.7272727272727273, 0.2727272727272727)),
 ('attack', (0.7222222222222223, 0.2777777777777777)),
 ('cbi', (0.7777777777777777, 0.22222222222222232)),
 ('suspect', (0.9, 0.09999999999999998)),
 ('communal', (0.7999999999999999

In [105]:
results = []
for article in test_articles:
    if len(set(article.split()))-len(set(article.split())-set([i[0] for i in protest_target_words]))>1:
        results.append(1)
    else:
        results.append(0)

In [107]:
print(classification_report(results,list(test_label)))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       169
           1       0.80      0.83      0.82        64

    accuracy                           0.90       233
   macro avg       0.87      0.88      0.87       233
weighted avg       0.90      0.90      0.90       233



In [108]:
confusion_matrix(results,list(test_label))

array([[156,  13],
       [ 11,  53]], dtype=int64)

In [109]:
WORDS = [i[0] for i in protest_target_words]

In [110]:
vectors=[]
for article in train_articles:
    article_words = list(set(article.split()))
    
    article_vector = []
    for word in WORDS:
        if word in article_words:
            article_vector.append(1)
        else:
            article_vector.append(0)
    vectors.append(article_vector)
    

In [111]:
t_vectors=[]
for article in test_articles:
    article_words = list(set(article.split()))
    
    article_vector = []
    for word in WORDS:
        if word in article_words:
            article_vector.append(1)
        else:
            article_vector.append(0)
    t_vectors.append(article_vector)

In [116]:
print(classification_report(SVC().fit(pd.DataFrame(vectors),list(train_label)).predict(pd.DataFrame(t_vectors)),list(test_label)))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       177
           1       0.70      0.82      0.75        56

    accuracy                           0.87       233
   macro avg       0.82      0.85      0.83       233
weighted avg       0.88      0.87      0.87       233



In [121]:
svc_model = SVC()

hyperparameters= dict(kernel=["linear", "poly","sigmoid"],
                      C=np.logspace(0, 4, 5),
                      gamma=('scale', 'auto'))
                      #class_weight=[{1:1,0:1},{1:2,0:1},{1:3,0:1},{1:3,0:0.5},{1:4,0:0.5},{1:4,0:1}])

svc_grid = GridSearchCV(svc_model, hyperparameters, cv=5,scoring='f1_macro')

best_svc = svc_grid.fit(pd.DataFrame(vectors),list(train_label))

best_svc.best_score_

0.8466181382732735

In [122]:
best_svc.best_params_

{'C': 10.0, 'gamma': 'scale', 'kernel': 'linear'}

In [123]:
print(classification_report(best_svc.predict(pd.DataFrame(t_vectors)),list(test_label)))

              precision    recall  f1-score   support

           0       0.95      0.83      0.89       190
           1       0.52      0.79      0.62        43

    accuracy                           0.82       233
   macro avg       0.73      0.81      0.75       233
weighted avg       0.87      0.82      0.84       233



In [126]:
logistic_model = LogisticRegression()


hyperparameters = dict(C= np.logspace(0, 4, 5))
                       #class_weight = [{1:1,0:1},{1:2,0:1},{1:3,0:1},{1:3,0:0.5},{1:4,0:0.5},{1:4,0:1}])

logistic_grid = GridSearchCV(logistic_model, hyperparameters, cv=3,scoring='f1_macro')

best_logistic = logistic_grid.fit(pd.DataFrame(vectors),list(train_label))

best_logistic.best_score_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.8542121887500106

In [127]:
best_logistic.best_params_

{'C': 10.0}

In [128]:
print(classification_report(best_logistic.predict(pd.DataFrame(t_vectors)),list(test_label)))

              precision    recall  f1-score   support

           0       0.96      0.84      0.90       190
           1       0.55      0.84      0.66        43

    accuracy                           0.84       233
   macro avg       0.75      0.84      0.78       233
weighted avg       0.88      0.84      0.85       233



# NN Experiments

In [129]:
import torch
import torch.tensor as tensor
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

In [131]:
x_train = torch.from_numpy(pd.DataFrame(vectors).to_numpy()).float()
y_train = torch.from_numpy(np.array(list(train_label))).float().view(len(train_label),1)

x_test = torch.from_numpy(pd.DataFrame(t_vectors).to_numpy()).float()
y_test = torch.from_numpy(np.array(list(test_label))).float().view(len(test_label),1)

In [132]:
x_train.shape,y_train.shape,x_train.dtype,y_train.dtype

(torch.Size([541, 56]), torch.Size([541, 1]), torch.float32, torch.float32)

In [133]:
import random
shuffled_idx = [i for i in range(len(y_train))]
random.shuffle(shuffled_idx)

batch_size = 581

batches = []
for i in range(0,len(y_train),batch_size):

    indices= [shuffled_idx[i:i+batch_size]]

    batches.append([x_train[indices],y_train[indices]])

In [134]:
batches[0][0].shape,batches[0][1].shape

(torch.Size([541, 56]), torch.Size([541, 1]))

In [135]:
class ProtestClassifier(nn.Module):
    def __init__(self):
        super(ProtestClassifier,self).__init__()
        
        self.layer1 = nn.Linear(x_train.shape[1],64)
        self.drop1 = nn.Dropout(p=0.2)
        self.layer2 = nn.Linear(64,1)
        self.drop2 = nn.Dropout(p=0.2)
        self.layer3= nn.Linear(64,32)
        self.drop3 = nn.Dropout(p=0.2)
        self.layer4= nn.Linear(32,1)
        
    def forward(self,x):
        
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.drop1(x)
        x = self.layer2(x)
        x = torch.sigmoid(x)

        return x
        
    def predict(self,x):
        pred = self.forward(x)
        ans = []
        for t in pred:
            if t[0]>0.500001:
                ans.append(1)
            else:
                ans.append(0)
        return torch.tensor(ans)

In [165]:
#Initialize the model
model = ProtestClassifier()
#Define loss criterion
criterion = nn.BCELoss()
#Define the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [166]:
#Number of epochs
epochs = 630
#List to store losses
trn_losses = []
eval_losses = []
for i in range(1,epochs):
    for batch in batches:
        x = batch[0]
        y = batch[1]
        
        y_pred = model.forward(x)
        loss = criterion(y_pred,y)    

      
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if i % 70 == 0:
        evalloss = criterion(model.forward(x_test),y_test)
        print(i,'\t',"Training Loss:",round(loss.item(),4),"Eval Loss:",round(evalloss.item(),4))
        trn_losses.append(round(loss.item(),4))
        eval_losses.append(round(evalloss.item(),4))

evalloss = criterion(model.forward(x_test),y_test)
print("\n\nTraining Loss:",round(loss.item(),4),"Eval Loss:",round(evalloss.item(),4))

70 	 Training Loss: 0.5047 Eval Loss: 0.5563
140 	 Training Loss: 0.3978 Eval Loss: 0.475
210 	 Training Loss: 0.3053 Eval Loss: 0.4061
280 	 Training Loss: 0.2635 Eval Loss: 0.3672
350 	 Training Loss: 0.2427 Eval Loss: 0.3573
420 	 Training Loss: 0.2286 Eval Loss: 0.3553
490 	 Training Loss: 0.2258 Eval Loss: 0.3411
560 	 Training Loss: 0.2134 Eval Loss: 0.3482


Training Loss: 0.2079 Eval Loss: 0.3468


In [167]:
print(classification_report(model.predict(x_test),y_test))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91       193
           1       0.56      0.93      0.70        40

    accuracy                           0.86       233
   macro avg       0.77      0.89      0.80       233
weighted avg       0.91      0.86      0.87       233

