# CS4248 Project (Labelled Unreliable News)

In [61]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import re

class NewsClassifier:
    """Collect descriptive statistics for a labelled news corpus.

    The CSV is read without a header row. Column 0 holds the integer label
    (1 = satire, 2 = hoax, 3 = propaganda, 4 = reliable) and column 1 holds
    the article text.

    Attributes:
        df: The raw DataFrame loaded from ``path``.
        remove_punctuation: When true, tokens made up solely of non-word
            characters/underscores are dropped before tagging and counting.
        stopwords_set: NLTK English stopwords used for filtering.
        <name>_vocabulary: Set of unique non-stopword tokens per label.
        <name>_vocab_dictionary: Token -> occurrence count per label.
        <name>_pronouns_dictionary: Running pronoun totals per label with
            the keys 'Personal', 'WH-Pronoun' and 'Total'.
    where <name> is one of satire, hoax, propaganda, reliable.
    """

    # Numeric label -> attribute-name prefix of that label's statistics.
    _LABEL_NAMES = {1: 'satire', 2: 'hoax', 3: 'propaganda', 4: 'reliable'}

    # Penn Treebank tags: PRP (I, he, she), PRP$ (my, his, hers),
    # WP (who, what), WP$ (whose).
    _PERSONAL_PRONOUN_TAGS = frozenset(('PRP', 'PRP$'))
    _WH_PRONOUN_TAGS = frozenset(('WP', 'WP$'))

    # A token that contains only non-word characters and/or underscores.
    # Compiled once instead of once per token; '|' is already part of \W so
    # it does not need to be listed inside the character class.
    _PUNCTUATION_ONLY = re.compile(r'^[\W_]*$')

    # How many of the most frequent words are shown per label in the report.
    _TOP_WORD_COUNT = 50

    def __init__(self, path='./raw_data/fulltrain.csv', remove_punctuation=False):
        """Load the corpus and initialise empty per-label statistics.

        Args:
            path: Location of the header-less CSV (label, text).
            remove_punctuation: Whether punctuation-only tokens are removed
                before POS tagging and vocabulary counting.
        """
        # Read CSV file in
        self.df = pd.read_csv(path, header=None)
        self.remove_punctuation = remove_punctuation

        # Stopwords
        self.stopwords_set = set(stopwords.words('english'))

        # Vocabulary sets - just the unique tokens seen per label
        self.satire_vocabulary = set()
        self.hoax_vocabulary = set()
        self.propaganda_vocabulary = set()
        self.reliable_vocabulary = set()

        # Token frequencies per label
        self.satire_vocab_dictionary = defaultdict(int)
        self.hoax_vocab_dictionary = defaultdict(int)
        self.propaganda_vocab_dictionary = defaultdict(int)
        self.reliable_vocab_dictionary = defaultdict(int)

        # Pronoun totals per label
        self.satire_pronouns_dictionary = self._new_pronoun_counts()
        self.hoax_pronouns_dictionary = self._new_pronoun_counts()
        self.propaganda_pronouns_dictionary = self._new_pronoun_counts()
        self.reliable_pronouns_dictionary = self._new_pronoun_counts()

    @staticmethod
    def _new_pronoun_counts():
        """Return a fresh, zeroed pronoun counter dictionary."""
        return {'Personal': 0, 'WH-Pronoun': 0, 'Total': 0}

    @staticmethod
    def _average(total, count):
        """Return ``total / count``, or 0.0 when the label has no rows."""
        return total / count if count else 0.0

    def get_data(self):
        """Process every row of the corpus and print a statistics report.

        For each row the text is tokenised, optionally stripped of
        punctuation-only tokens, POS tagged for pronoun counting, and then
        filtered for stopwords before updating the label's vocabulary.
        Totals, per-document averages, vocabulary sizes, the most common
        words and pronoun averages are printed to stdout.
        """
        df = self.df
        label_counts = self.count_labels()

        # Total number of words
        raw_word_count_dict = {1: 0, 2: 0, 3: 0, 4: 0}
        # Total number of stopwords
        stopword_count_dict = {1: 0, 2: 0, 3: 0, 4: 0}

        # Iterate the two needed columns directly; iterrows() would build a
        # Series object for every row.
        for i, raw_label, text in zip(df.index, df[0], df[1]):
            print('Parsing row number: ' + str(i), end="\r")
            label = int(raw_label)
            # List of tokens for the document
            tokens = self.tokenise_text(text)
            raw_word_count_dict[label] += len(tokens)

            # Punctuation can carry sentiment (!, ?), so removal is optional.
            if self.remove_punctuation:
                tokens = [
                    token for token in tokens
                    if not self._PUNCTUATION_ONLY.match(token)
                ]

            # POS tagging must see the stopwords, so it runs before they
            # are removed.
            self.count_pronouns(nltk.pos_tag(tokens), label)

            # Count and remove stopwords
            stopword_count, tokens_no_stopwords = self.count_and_remove_stopwords(tokens)
            stopword_count_dict[label] += stopword_count
            self.update_vocabulary(label, tokens_no_stopwords)

        # Print out all the acquired data
        print()
        print('Raw total count')
        print(raw_word_count_dict)
        print('Raw stopword count')
        print(stopword_count_dict)
        print()

        # Averages per document; a label without rows averages to 0.0
        # instead of raising ZeroDivisionError.
        average_word_counts = {
            label: self._average(raw_word_count_dict[label], count)
            for label, count in label_counts.items()
        }
        average_stopword_counts = {
            label: self._average(stopword_count_dict[label], count)
            for label, count in label_counts.items()
        }
        print('Average number of words')
        print(average_word_counts)
        print('Average number of stop words')
        print(average_stopword_counts)

        print()
        print('=== Unique vocabulary count: ===')
        for name in self._LABEL_NAMES.values():
            vocabulary = getattr(self, name + '_vocabulary')
            print(name.capitalize() + ':' + str(len(vocabulary)))
        print()

        print('Sorting dictionaries...')
        top_words = {}
        for name in self._LABEL_NAMES.values():
            frequencies = getattr(self, name + '_vocab_dictionary')
            ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
            top_words[name] = ranked[:self._TOP_WORD_COUNT]
        print('Sorting done!')

        print(f'=== Top {self._TOP_WORD_COUNT} most common words: ===')
        for name, words in top_words.items():
            print(name.capitalize() + ':')
            print(words)
        print()

        print('=== Pronoun Counts (Averages): ===')
        for label, name in self._LABEL_NAMES.items():
            pronouns = getattr(self, name + '_pronouns_dictionary')
            document_count = label_counts[label]
            print(name.capitalize() + ':')
            print(pronouns)
            print(self._average(pronouns['Personal'], document_count))
            print(self._average(pronouns['WH-Pronoun'], document_count))
            print(self._average(pronouns['Total'], document_count))

    # Counts the number of each label in the dataset
    def count_labels(self):
        """Count and print how many rows carry each label.

        Returns:
            dict mapping each label 1-4 to its number of rows (0 if absent).

        Raises:
            KeyError: If column 0 contains a label outside 1-4.
        """
        df = self.df
        print('Number of rows: ' + str(df.shape[0]))
        label_count_dict = {1: 0, 2: 0, 3: 0, 4: 0}
        # Only the label column is needed, so iterate it directly.
        for raw_label in df[0]:
            label_count_dict[int(raw_label)] += 1
        print('SATIRE, HOAX, PROPAGANDA, RELIABLE NEWS')
        print(label_count_dict)
        return label_count_dict

    # Adds the pronoun counts of one document to its label's running totals
    def count_pronouns(self, tagged_tokens, label):
        """Accumulate personal and wh-pronoun counts for ``label``.

        Args:
            tagged_tokens: Iterable of (token, POS tag) tuples.
            label: Numeric label of the document; labels outside 1-4 are
                ignored.
        """
        name = self._LABEL_NAMES.get(label)
        if name is None:
            return

        personal_pronoun_count = 0
        wh_pronoun_count = 0
        for token in tagged_tokens:
            tag = token[1]
            if tag in self._PERSONAL_PRONOUN_TAGS:
                personal_pronoun_count += 1
            elif tag in self._WH_PRONOUN_TAGS:
                wh_pronoun_count += 1

        pronouns = getattr(self, name + '_pronouns_dictionary')
        pronouns['Personal'] += personal_pronoun_count
        pronouns['WH-Pronoun'] += wh_pronoun_count
        pronouns['Total'] += personal_pronoun_count + wh_pronoun_count

    # Updates the unique vocabulary and the token frequencies of a label
    def update_vocabulary(self, label, tokens):
        """Record ``tokens`` in the vocabulary and frequency table of ``label``.

        Args:
            label: Numeric label of the document; labels outside 1-4 are
                ignored.
            tokens: List of tokens (stopwords already removed by the caller).
        """
        name = self._LABEL_NAMES.get(label)
        if name is None:
            return

        getattr(self, name + '_vocabulary').update(tokens)
        frequencies = getattr(self, name + '_vocab_dictionary')
        for token in tokens:
            frequencies[token] += 1

    # Returns number of stopwords in a document and the list of tokens without stopwords
    def count_and_remove_stopwords(self, tokens):
        """Split ``tokens`` into a stopword count and the remaining tokens.

        Args:
            tokens: List of tokens.

        Returns:
            Tuple ``(stopword_count, tokens_without_stopwords)``; the second
            element preserves the original token order.
        """
        stopwords_set = self.stopwords_set
        new_token_list = [token for token in tokens if token not in stopwords_set]
        return (len(tokens) - len(new_token_list), new_token_list)

    # Returns a list of tokens - utilises the NLTK tokenizer. Set to lowercase
    def tokenise_text(self, text):
        """Lowercase ``text`` and tokenise it with ``nltk.word_tokenize``.

        Args:
            text: The document text. ``None`` or NaN (what pandas yields
                for an empty CSV cell) is treated as an empty document.

        Returns:
            List of lowercase tokens; empty for a missing document.
        """
        # NaN is the only value that is not equal to itself.
        if text is None or (not isinstance(text, str) and text != text):
            return []
        return nltk.word_tokenize(text.lower())




In [62]:
# Run the full corpus report on the default training CSV, dropping
# punctuation-only tokens before tagging and vocabulary counting.
news_classifier = NewsClassifier(
    path='./raw_data/fulltrain.csv',
    remove_punctuation=True,
)
news_classifier.get_data()

Number of rows: 48854
SATIRE, HOAX, PROPAGANDA, RELIABLE NEWS
{1: 14047, 2: 6942, 3: 17870, 4: 9995}
Parsing row number: 48853
Raw total count
{1: 4756605, 2: 1503599, 3: 18189450, 4: 5052139}
Raw stopword count
{1: 1722674, 2: 605984, 3: 7131322, 4: 1786363}

Average number of words
{1: 338.62070192923755, 2: 216.5944972630366, 3: 1017.876329043089, 4: 505.4666333166583}
Average number of stop words
{1: 122.63643482594148, 2: 87.29242293287237, 3: 399.06670397313934, 4: 178.72566283141572}

=== Unique vocabulary count: ===
Satire:120202
Hoax:36659
Propaganda:217713
Reliable:118406

Sorting dictionaries...
Sorting done!
=== Top 50 most common words: ===
Satire:
[('said', 34913), ("'s", 34890), ("n't", 11049), ('one', 10526), ('would', 10226), ('time', 10110), ('like', 9006), ('new', 7337), ('could', 6669), ('added', 6300), ('even', 6260), ('get', 6092), ('people', 5546), ('amp', 5240), ('monday', 5064), ('also', 5043), ('years', 4801), ('know', 4771), ('really', 4748), ('first', 4745),