# CS4248 Project - Labelled Unreliable News (LUN)

## Imports

In [1]:
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import string
import gensim

from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from readability import Readability

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point
# at real problems (chained assignment, non-convergence) — confirm it is intended.
warnings.filterwarnings('ignore')

In [None]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Maps a lower-case contraction to its expanded form. Keys keep the straight
# apostrophe ('), so lookups must happen before punctuation is stripped.
# Extended with a few common forms that were missing ("you've", "would've", ...).
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

# Hyper-parameters shared by the cells below.
NGRAM_RANGE = (1, 1)
SMOOTHING = 1.0
TEST_SIZE = 0.1

# English stop words as a set, for constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Token normalisation: lemmatise, do not stem.
# To stem instead, use `stemmer = PorterStemmer()` and `lemmatizer = None`.
stemmer = None
lemmatizer = WordNetLemmatizer()

## Helper Functions

In [None]:
def _expand_contraction(word, mapping):
    """Return `word` with its contraction expanded, or unchanged if it has none."""
    if word in mapping:
        return mapping[word]
    # Tolerate attached punctuation ("don't," / "(it's") by looking up the bare core.
    core = word.strip(string.punctuation)
    if core in mapping:
        return word.replace(core, mapping[core], 1)
    return word


def preprocess(sentence, lower_case=True, remove_punctuation=True, replace_contractions=True,
               contraction_map=None):
    """Normalise raw text before tokenisation.

    Args:
        sentence: Text to clean.
        lower_case: Lower-case the text first.
        remove_punctuation: Delete every character in `string.punctuation`.
        replace_contractions: Expand contractions and collapse runs of
            whitespace to single spaces. Lookups are exact, so with the
            lower-case keys of the module-level table this is only effective
            on lower-cased text.
        contraction_map: Optional mapping of contraction -> expansion.
            Defaults to the module-level `contractions` table.

    Returns:
        The cleaned text.
    """
    if lower_case:
        sentence = sentence.lower()
    if replace_contractions:
        mapping = contractions if contraction_map is None else contraction_map
        # Must run before punctuation removal: stripping the apostrophe first
        # turns "don't" into "dont", which no longer matches any key.
        sentence = " ".join(_expand_contraction(word, mapping) for word in sentence.split())
    if remove_punctuation:
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        if replace_contractions:
            # Removing stand-alone punctuation tokens leaves double spaces;
            # collapse them so the output stays single-space separated.
            sentence = " ".join(sentence.split())
    return sentence

In [None]:
def tokenize(sentence, stemmer=stemmer, lemmatizer=lemmatizer, remove_stop_words=False):
    """Split `sentence` into word tokens and optionally normalise them.

    Stop-word filtering, stemming and lemmatisation are applied in that order,
    each only when enabled. `stemmer` and `lemmatizer` default to the
    module-level objects as they were when this function was defined.
    """
    words = word_tokenize(sentence)

    if remove_stop_words:
        words = list(filter(lambda word: word not in stop_words, words))
    if stemmer:
        words = list(map(stemmer.stem, words))
    if lemmatizer:
        words = list(map(lemmatizer.lemmatize, words))

    return words

## Load Data

In [None]:
# Read the header-less training CSV, naming its columns on load
# (same form as the test-set load further down), then preview it.
full_train_df = pd.read_csv(
    'raw_data/fulltrain.csv',
    header=None,
    names=['label', 'text'],
)
full_train_df.head()

In [None]:
# Drop repeated articles. The index is rebuilt so it is contiguous (0..n-1):
# later cells line this frame up with a freshly built TF-IDF frame by index,
# and gaps left by the removed rows would misalign or drop samples.
train_df = full_train_df.drop_duplicates(subset=['text']).reset_index(drop=True)
print(f"No. training samples (all classes): {len(train_df)}")

## Feature Engineering

In [None]:
def contraction_count(text, contraction_map=None):
    """Count the contractions that occur in `text`.

    Matching is case-insensitive and anchored on word boundaries, so "Don't"
    is counted and "she's" is counted once (not again as the embedded "he's").

    Args:
        text: Raw text to scan.
        contraction_map: Optional mapping whose keys are the contractions to
            look for. Defaults to the module-level `contractions` table.

    Returns:
        Number of contraction occurrences (0 for empty text or an empty table).
    """
    mapping = contractions if contraction_map is None else contraction_map
    if not mapping or not text:
        return 0
    # Longest alternatives first so "can't've" is preferred over "can't".
    alternatives = "|".join(re.escape(key) for key in sorted(mapping, key=len, reverse=True))
    pattern = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)
    return sum(1 for _ in pattern.finditer(text))

def count_stopwords(text):
    """Return how many tokens of `text` are English stop words.

    Uses the module-level `stop_words` set instead of reloading the corpus
    list on every call and scanning it linearly per token.
    """
    # NOTE(review): matching is case-sensitive, so capitalised stop words
    # ("The") are not counted — confirm whether that is intended.
    return sum(1 for token in word_tokenize(text) if token in stop_words)

def count_special_characters(text):
    """Return the number of characters that are neither letters nor digits.

    Whitespace falls into this category and is therefore counted as well.
    """
    return sum(1 for ch in text if not (ch.isalpha() or ch.isdigit()))

def count_uppercase(text):
    """Return the number of upper-case characters in `text`."""
    return sum(1 for ch in text if ch.isupper())

def count_lowercase(text):
    """Return the number of lower-case characters in `text`."""
    return sum(1 for ch in text if ch.islower())

def count_noun_verb_adverb_adjective(text):
    """Count nouns, verbs, adverbs and adjectives in `text`.

    Tokens are tagged with the universal tagset; tags other than
    NOUN / VERB / ADV / ADJ are ignored.

    Returns:
        Tuple `(noun_count, verb_count, adv_count, adj_count)`.
    """
    tallies = {'NOUN': 0, 'VERB': 0, 'ADV': 0, 'ADJ': 0}
    for _, tag in pos_tag(word_tokenize(text), tagset='universal'):
        if tag in tallies:
            tallies[tag] += 1
    return (tallies['NOUN'], tallies['VERB'], tallies['ADV'], tallies['ADJ'])

def count_syllable(text):
    """Return the estimated total number of syllables in `text`.

    Each word token is scored by `count_syllable_in_word`. Tokens without any
    letter (punctuation, quote marks, bare numbers) are skipped; otherwise
    each of them would be scored as one syllable and inflate the total.
    """
    vowels = "aeiouy"
    return sum(
        count_syllable_in_word(token, vowels)
        for token in word_tokenize(text)
        if any(ch.isalpha() for ch in token)
    )

def count_syllable_in_word(word, vowels):
    """Estimate the number of syllables in `word`.

    Heuristic: count the starts of vowel groups, discount a trailing "e",
    and never report fewer than one syllable for a non-empty word.

    Args:
        word: A single token; case is ignored.
        vowels: Lower-case characters to treat as vowels (e.g. "aeiouy").

    Returns:
        The estimated syllable count, or 0 for an empty string.
    """
    if not word:
        # Guard: indexing word[0] below would raise IndexError.
        return 0
    word = word.lower()
    count = 1 if word[0] in vowels else 0
    # A vowel that follows a non-vowel opens a new vowel group.
    count += sum(
        1
        for previous, current in zip(word, word[1:])
        if current in vowels and previous not in vowels
    )
    if word.endswith("e"):
        count -= 1
    # Floor at one; this also keeps the result from going negative when
    # `vowels` does not contain "e".
    return max(count, 1)

def get_gunning_fog_grade_index(text):
    """Return the Gunning fog index of `text`.

    Formula: 0.4 * [(words / sentences) + 100 * (complex words / words)],
    where complex words are those with three or more syllables as estimated
    by `count_syllable_in_word`.

    Returns:
        The index as a float, or 0.0 when the text has no sentence or no word.
    """
    sentences = sent_tokenize(text)
    # Keep only tokens that contain a letter or digit, so punctuation tokens
    # are not counted as words.
    words = [token for token in word_tokenize(text) if any(ch.isalnum() for ch in token)]
    if not sentences or not words:
        # Avoid ZeroDivisionError on empty or punctuation-only text.
        return 0.0
    num_complex_words = sum(
        1 for word in words if count_syllable_in_word(word, "aeiouy") >= 3
    )
    # The 0.4 factor scales the whole bracket, not just the first term.
    return 0.4 * (len(words) / len(sentences) + 100 * (num_complex_words / len(words)))
    
def get_dale_chall_readability_coleman_liau_index(text):
    """Return `(dale_chall_score, coleman_liau_score)` for `text`.

    The metric methods return result objects; the numeric `.score` is
    extracted so the values can be used directly as model features.
    Falls back to `(0, 0)` when the metrics cannot be computed.
    """
    try:
        r = Readability(text)
        return (r.dale_chall().score, r.coleman_liau().score)
    except Exception:
        # Best-effort feature: e.g. the library rejects very short texts.
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        return (0, 0)

In [None]:
# Complexity Features
train_df['num_stopwords'] = train_df['text'].apply(lambda x: count_stopwords(x))
train_df['num_sentences'] = train_df['text'].apply(lambda x: len(str(x).split('.')))
train_df['num_contractions'] = train_df['text'].apply(lambda x: contraction_count(x))
train_df['num_special_characters'] = train_df['text'].apply(lambda x: count_special_characters(x))
train_df['num_uppercase'] = train_df['text'].apply(lambda x: count_uppercase(x))
train_df['num_lowercase'] = train_df['text'].apply(lambda x: count_lowercase(x))

In [None]:
# Stylometric Features
train_df['pos_tags'] = train_df['text'].apply(lambda x: count_noun_verb_adverb_adjective(x))
train_df['num_noun'] = train_df['pos_tags'].apply(lambda x: x[0])
train_df['num_verb'] = train_df['pos_tags'].apply(lambda x: x[1])
train_df['num_adverb'] = train_df['pos_tags'].apply(lambda x: x[2])
train_df['num_adjective'] = train_df['pos_tags'].apply(lambda x: x[3])
train_df['num_syllables'] = train_df['text'].apply(lambda x: count_syllable(x))

In [None]:
# Readability based evidence
train_df['gunning_fog_grade_index'] = train_df['text'].apply(lambda x: get_gunning_fog_grade_index(x))
train_df['readability_indices'] = train_df['text'].apply(lambda x: get_dale_chall_readability_coleman_liau_index(x))
train_df['dale_chall_readability'] = train_df['readability_indices'].apply(lambda x: x[0])
train_df['coleman_liau_index'] = train_df['readability_indices'].apply(lambda x: x[1])

In [None]:
# Psycho-linguistic features
train_df['psycho-linguistic'] = train_df['text'].apply(lambda x: TextBlob(x).sentiment)
train_df['polarity'] = train_df['psycho-linguistic'].apply(lambda x: x[0])
train_df['subjectivity'] = train_df['psycho-linguistic'].apply(lambda x: x[1])

In [None]:
# TF-IDF over tokens produced by the notebook's own preprocess/tokenize helpers.
# token_pattern=None because a custom tokenizer is supplied.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=NGRAM_RANGE,
    smooth_idf=True,
    preprocessor=preprocess,
    tokenizer=tokenize,
    token_pattern=None,
)

In [None]:
# Hand-crafted feature columns that are fed to the classifier alongside TF-IDF.
features = [
    # complexity
    'num_stopwords', 'num_sentences', 'num_contractions',
    'num_special_characters', 'num_uppercase', 'num_lowercase',
    # stylometric
    'num_noun', 'num_verb', 'num_adverb', 'num_adjective', 'num_syllables',
    # readability
    'gunning_fog_grade_index', 'dale_chall_readability', 'coleman_liau_index',
    # psycho-linguistic
    'polarity', 'subjectivity',
]
y = train_df['label'].values

# Fit the vectorizer on the training text and hold the result as a dense frame.
# NOTE(review): densifying the whole TF-IDF matrix may need a lot of memory
# on a large vocabulary — confirm it fits.
train_tfidf = pd.DataFrame(
    tfidf_vectorizer.fit_transform(train_df['text'].values).toarray()
)

In [None]:
# Put the TF-IDF columns and the hand-crafted features side by side, row by row.
# Both frames are re-indexed 0..n-1 first: train_df may carry index gaps from
# de-duplication, and joining on mismatched indexes would drop or misalign
# samples (and leave fewer rows than there are labels in `y`).
handcrafted = train_df[features].reset_index(drop=True)
X_train_tfidf = pd.concat([train_tfidf.reset_index(drop=True), handcrafted], axis=1)
# TF-IDF columns are named by integer position while the feature columns are
# strings; use string names throughout, since scikit-learn estimators reject
# frames whose column names mix types.
X_train_tfidf.columns = X_train_tfidf.columns.astype(str)

In [None]:
# Hold out TEST_SIZE of the samples, keeping the class proportions (stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=42,
)

## Logistic Regression

In [None]:
# Fit on the training split only. Fitting on the full X_train_tfidf would both
# include the held-out rows and disagree in length with y_train.
clf = LogisticRegression(random_state=0, max_iter=200).fit(X_train, y_train)

# Training-set metrics.
y_pred = clf.predict(X_train)
print(f"Accuracy: {accuracy_score(y_train, y_pred)}")
print(f"F1 score: {f1_score(y_train, y_pred, average='macro')}")

### Testing

In [None]:
# X_test is a slice of the already-built feature matrix (TF-IDF plus
# hand-crafted columns), not raw text, so it must not be passed through the
# text vectorizer again. The old variable name is kept as an alias.
X_test_tfidf = X_test
y_pred = clf.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

### Validation

In [None]:
# Read the header-less balanced test CSV, report its size, and show 5 random rows.
test_df = pd.read_csv(
    'raw_data/balancedtest.csv',
    header=None,
    names=['label', 'text'],
)
print(f"No. test samples (all classes): {len(test_df)}")
test_df.sample(5)

In [None]:
X_val = test_df['text'].values
y_val = test_df['label'].values

# The training matrix is the TF-IDF columns followed by the hand-crafted
# `features` columns, so the validation matrix needs the same layout;
# TF-IDF alone has too few columns for the fitted classifier.
val_text = test_df['text']
val_pos = val_text.apply(count_noun_verb_adverb_adjective)
val_readability = val_text.apply(get_dale_chall_readability_coleman_liau_index)
val_sentiment = val_text.apply(lambda x: TextBlob(x).sentiment)
val_handcrafted = pd.DataFrame({
    'num_stopwords': val_text.apply(count_stopwords),
    'num_sentences': val_text.apply(lambda x: len(str(x).split('.'))),
    'num_contractions': val_text.apply(contraction_count),
    'num_special_characters': val_text.apply(count_special_characters),
    'num_uppercase': val_text.apply(count_uppercase),
    'num_lowercase': val_text.apply(count_lowercase),
    'num_noun': val_pos.apply(lambda x: x[0]),
    'num_verb': val_pos.apply(lambda x: x[1]),
    'num_adverb': val_pos.apply(lambda x: x[2]),
    'num_adjective': val_pos.apply(lambda x: x[3]),
    'num_syllables': val_text.apply(count_syllable),
    'gunning_fog_grade_index': val_text.apply(get_gunning_fog_grade_index),
    'dale_chall_readability': val_readability.apply(lambda x: x[0]),
    'coleman_liau_index': val_readability.apply(lambda x: x[1]),
    'polarity': val_sentiment.apply(lambda x: x[0]),
    'subjectivity': val_sentiment.apply(lambda x: x[1]),
})[features]  # same column order as in training

# Reuse the vectorizer fitted on the training text (transform only, no refit).
X_val_tfidf = np.hstack([
    tfidf_vectorizer.transform(X_val).toarray(),
    val_handcrafted.to_numpy(),
])
X_val_tfidf.shape

In [None]:
# Score the classifier on the balanced test set.
y_pred_val = clf.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_macro_f1 = f1_score(y_val, y_pred_val, average='macro')
print(f"Accuracy: {val_accuracy}")
print(f"F1 score: {val_macro_f1}")

In [None]:
# Per-class precision / recall / F1; names are given in sorted-label order.
class_names = ['satire', 'hoax', 'propaganda', 'reliable']
print(classification_report(y_val, y_pred_val, target_names=class_names))

In [None]:
# Confusion matrix of the validation predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_val, y_pred_val)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_title("Confusion matrix")
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()