In [86]:
import json
import random
import pickle
import statistics as stat
import pandas as pd
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score,f1_score,precision_score,recall_score,precision_recall_fscore_support,classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import seaborn as sns
%matplotlib inline
stopwords = stopwords.words('english')
import timeit

In [87]:
DATAPATH = "../data/ProtestNews2019/"
stopwords = open("../data/stopwords.txt",'r').read().split('\n\n')

In [88]:
def create_dataframe(list_of_articles):
    data = [[article['id'],article['url'],' '.join(article['text'].split('\n')),article['label']] for article in list_of_articles]
    return pd.DataFrame(data,columns=['id','url','text','label'])

In [89]:
train_data = []
with open(DATAPATH+'all_train_with_org.json', 'rb') as f:
        for line in f:
            d = json.loads(line)
            train_data.append(d)
            
dev_data = []
with open(DATAPATH+'all_dev_with_org.json', 'rb') as f:
        for line in f:
            d = json.loads(line)
            dev_data.append(d)
            
test_data = []
with open(DATAPATH+'all_test_with_org.json', 'rb') as f:
        for line in f:
            d = json.loads(line)
            test_data.append(d)

china_data = []
with open(DATAPATH+'china_test_with_org.json', 'rb') as f:
        for line in f:
            d = json.loads(line)
            china_data.append(d)

## Random Sampling: 30% of Each Articles' Words

In [90]:
def preprocess(news):
    l = WordNetLemmatizer()
    sentences = news.split(".")
    return " ".join([l.lemmatize(word.lower()) for sentence in sentences for word in sentence.split() \
                     if word not in stopwords if word.isalpha() if len(word)> 2 if word.lower() not in\
                     ["said","the","first","also","would","one","two","they"]])

In [91]:
def preprocess_30(news):
    l = WordNetLemmatizer()
    sentences = news.split(".")
    words = " ".join([l.lemmatize(word.lower()) for sentence in sentences for word in sentence.split() \
                     if word not in stopwords if word.isalpha() if len(word)> 2 if word.lower() not in \
                     ["said","the","first","also","would","one","two","they"]])
    
    return " ".join(random.sample(words.split(),round(len(words.split())*0.3)))

In [92]:
df_train = create_dataframe(train_data)
df_dev = create_dataframe(dev_data)
df_test = create_dataframe(test_data)

In [93]:
train = df_train.iloc[:,-2:]
train['text_30'] = train['text'].map(preprocess_30)
train['text'] = train['text'].map(preprocess)

dev = df_dev.iloc[:,-2:]
dev['text_30'] = dev['text'].map(preprocess_30)
dev['text'] = dev['text'].map(preprocess)

test = df_test.iloc[:,-2:]
test['text_30'] = test['text'].map(preprocess_30)
test['text'] = test['text'].map(preprocess)

## GNB

In [94]:
# TfIdf Vectorizer returns a sparse matrix and GaussianNB() takes only dense matrices, so I am using an transformer.
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [95]:
%%time

gnb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('to_dense', DenseTransformer()),
    ('clf', GaussianNB()),
])

hyperparameters = dict(
    tfidf__min_df      = (25,50,100),
    tfidf__ngram_range = ((1, 1), (1, 2), (1, 3))
)

gnb_grid_search = GridSearchCV(gnb_pipeline, hyperparameters,cv=3,scoring='f1_macro')

gnb_grid_search.fit(train.text, list(train.label))

gnb_grid_search.best_score_

Wall time: 1min 22s


0.7361981315157461

In [96]:
print("CLASSIFICATION REPORT OF DEV SET:")
print(classification_report(gnb_grid_search.predict(dev.text),dev.label))

CLASSIFICATION REPORT OF DEV SET:
              precision    recall  f1-score   support

         0.0       0.79      0.95      0.87       296
         1.0       0.86      0.55      0.67       161

    accuracy                           0.81       457
   macro avg       0.83      0.75      0.77       457
weighted avg       0.82      0.81      0.80       457



In [97]:
print("CLASSIFICATION REPORT OF TEST SET:")
print(classification_report(gnb_grid_search.predict(test.text),test.label))

CLASSIFICATION REPORT OF TEST SET:
              precision    recall  f1-score   support

         0.0       0.78      0.93      0.85       444
         1.0       0.81      0.51      0.62       243

    accuracy                           0.78       687
   macro avg       0.79      0.72      0.74       687
weighted avg       0.79      0.78      0.77       687



In [98]:
%%time

gnb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('to_dense', DenseTransformer()),
    ('clf', GaussianNB()),
])

hyperparameters = dict(
    tfidf__min_df      = (25,50,100),
    tfidf__ngram_range = ((1, 1), (1, 2), (1, 3))
)

gnb_grid_search = GridSearchCV(gnb_pipeline, hyperparameters,cv=3,scoring='f1_macro')

gnb_grid_search.fit(train.text_30, list(train.label))

gnb_grid_search.best_score_

Wall time: 23 s


0.6338851828895108

In [99]:
print("CLASSIFICATION REPORT OF DEV SET:")
print(classification_report(gnb_grid_search.predict(dev.text_30),dev.label))

CLASSIFICATION REPORT OF DEV SET:
              precision    recall  f1-score   support

         0.0       0.62      0.94      0.75       235
         1.0       0.85      0.39      0.54       222

    accuracy                           0.67       457
   macro avg       0.74      0.66      0.64       457
weighted avg       0.73      0.67      0.64       457



In [100]:
print("CLASSIFICATION REPORT OF TEST SET:")
print(classification_report(gnb_grid_search.predict(test.text_30),test.label))

CLASSIFICATION REPORT OF TEST SET:
              precision    recall  f1-score   support

         0.0       0.65      0.93      0.77       373
         1.0       0.83      0.41      0.55       314

    accuracy                           0.69       687
   macro avg       0.74      0.67      0.66       687
weighted avg       0.73      0.69      0.67       687



## SVM

In [107]:
%%time 

svc_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC()),
])

hyperparameters = dict(
    tfidf__min_df      = (4, 10, 16),
    tfidf__ngram_range = ((1, 1), (1, 2), (1, 3)),
    clf__kernel        = ["linear","sigmoid"],
    clf__C             = np.logspace(1,3,3)

)

svc_grid_search = GridSearchCV(svc_pipeline, hyperparameters,cv=2,scoring='f1_macro')

svc_grid_search.fit(train.text, list(train.label))

svc_grid_search.best_score_

Wall time: 10min 1s


0.8356848145705001

In [108]:
%%time 

svc_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC()),
])

hyperparameters = dict(
    tfidf__min_df      = (4, 10, 16),
    tfidf__ngram_range = ((1, 1), (1, 2), (1, 3)),
    clf__kernel        = ["linear","sigmoid"],
    clf__C             = np.logspace(1,3,3)

)

svc_grid_search = GridSearchCV(svc_pipeline, hyperparameters,cv=2,scoring='f1_macro')

svc_grid_search.fit(train.text_30, list(train.label))

svc_grid_search.best_score_

Wall time: 3min 3s


0.755141174356247