### SemEval2019 Hyperpartisan News Detection
#### Extract features derived from sentiment and bias lexicons, and writing style

In [2]:
from lxml.etree import iterparse
import xml

import os
import numpy as np
import nltk
import pickle
import collections

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

from utils import *
from text_featurizer import *
from readability import Readability

In [3]:
def readFiles(textFile, labelFile):
    '''
    Load the cleaned article texts and the ground-truth labels.

    textFile: path of the articles XML file. Each <article> element
        contributes one entry: its 'title' attribute and its concatenated
        inner text are passed to textCleaning and the result is collected.
    labelFile: path of the ground-truth XML file. It is parsed with
        GroundTruthHandler, which receives the list that is returned as y
        (presumably the handler appends one label per article - confirm
        against utils).

    Returns a tuple (X, y) of numpy arrays.
    '''
    # A bare `import xml` does not load the xml.sax submodule, so import it
    # explicitly instead of relying on some other module having done so.
    import xml.sax

    X, y = [], []

    # Use a distinct name for the file handle so the path argument is not
    # shadowed inside the function.
    with open(labelFile) as labelStream:
        xml.sax.parse(labelStream, GroundTruthHandler(y))

    for event, elem in iterparse(textFile):
        if elem.tag == "article":
            title = elem.attrib['title']
            text = "".join(elem.itertext())
            text = textCleaning(title, text)
            X.append(text)
            # The element has been consumed; clear it before moving on.
            elem.clear()

    return np.asarray(X), np.asarray(y)

In [4]:
# Directory holding the by-article training data and its ground truth
dataPath = 'data/'
textFile = os.path.join(dataPath, 'articles-training-byarticle.xml')
labelFile = os.path.join(dataPath, "ground-truth-training-byarticle.xml")

# Read the cleaned article texts together with their labels
texts, labels = readFiles(textFile, labelFile)

# Use the shared fixed split so results stay comparable with other methods
id1, id2 = fixedTestSplit(labels)

In [5]:
def switcher(pos):
    '''
    Return the coarse category ('noun', 'verb', 'adj', 'pron', 'adverb')
    for a POS tag, or 'others' when the tag is not in the table.
    '''
    families = (
        ('noun', ('NN', 'NNS', 'NNP', 'NNPS')),
        ('verb', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
        ('adj', ('JJ', 'JJR', 'JJS')),
        ('pron', ('PRP', 'PRP$', 'WP', 'WP$')),
        ('adverb', ('RB', 'RBR', 'RBS')),
    )
    # Flatten the grouped table into a tag -> family lookup
    tag_to_family = {tag: family for family, tags in families for tag in tags}
    return tag_to_family.get(pos, 'others')
    
def extractPOS(pos_tags):
    '''
    Extract normalized POS counts

    pos_tags: sequence of (token, tag) pairs; the tag at index 1 of each
        pair is mapped to a coarse category with switcher.

    Returns a list with the share of tokens in each of the categories
    noun, verb, adj, pron and adverb (in that order). The 'others' share is
    left out: the original filter compared against the misspelled literal
    'other' by identity and therefore never excluded it. An empty input
    yields all zeros instead of raising ZeroDivisionError.
    '''
    categories = ('noun', 'verb', 'adj', 'pron', 'adverb')

    total = len(pos_tags)
    if total == 0:
        # Nothing was tagged, so every share is zero
        return [0.0] * len(categories)

    # 'others' is still counted here; it is part of the denominator (total)
    # but is not reported as a feature.
    counts = collections.Counter(switcher(t[1]) for t in pos_tags)

    return [counts[cat] / total for cat in categories]

In [6]:
def extractReadabilityScores(text):
    '''
    Return the readability scores of text as a list, in the order
    ARI, FleschReadingEase, ColemanLiauIndex, FleschKincaidGradeLevel,
    LIX, RIX, all taken from a single Readability instance.
    '''
    scorer = Readability(text)
    ari = scorer.ARI()
    reading_ease = scorer.FleschReadingEase()
    coleman_liau = scorer.ColemanLiauIndex()
    grade_level = scorer.FleschKincaidGradeLevel()
    lix = scorer.LIX()
    rix = scorer.RIX()
    return [ari, reading_ease, coleman_liau, grade_level, lix, rix]

In [7]:
def textVectorize(texts):
    '''
    Extract one flat feature vector per text.

    For every text the vector concatenates, in this order: the output of
    extract_text_features (writing style), the POS features from extractPOS,
    the bias_lexicon, subjective_lexicon and mpqa_sentiment features, the
    number of double-quote characters, and the readability scores.

    texts: iterable of strings.

    Returns np.asarray of the list of per-text feature vectors.
    '''
    features = []
    for text in texts:
        sentences = nltk.sent_tokenize(text)
        # Tokenize once and reuse the tokens for tagging and the word list
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
        words = [t.lower() for t in tokens if t.isalpha()]

        # Count '"' characters directly; same result as the regex search,
        # without needing `re`, which this file never imports itself.
        quotation = text.count('"')
        text_feats = extract_text_features(sentences, words)
        pos_feats = extractPOS(pos_tags)
        bias_feat = bias_lexicon(words)
        sub_feat = subjective_lexicon(pos_tags)
        sent_feats = mpqa_sentiment(words)
        read = extractReadabilityScores(text)

        # Each group is a sequence of values; flatten them in a fixed order
        feat = [text_feats, pos_feats, bias_feat, sub_feat, sent_feats, [quotation], read]
        flattened = [val for sublist in feat for val in sublist]

        features.append(flattened)
    return np.asarray(features)

In [8]:
features = textVectorize(texts)
# Delete features that are not represented, i.e. columns that are zero for
# every sample. Each entry is checked (instead of the column sum) so that a
# column whose positive and negative values cancel out is not dropped.
delete = np.where(~features.any(axis=0))[0]
feat_select = np.delete(features, delete, 1)

In [9]:
# Persist the selected features so they can be appended to other features
np.save(file="features", arr=feat_select)