# CS4248 Project (Labelled Unreliable News)

In [49]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import re

class NewsClassifier:
    """Collect exploratory word statistics from a labelled news CSV.

    The CSV is read without a header row: column 0 holds the integer label
    (1 = satire, 2 = hoax, 3 = propaganda, 4 = reliable news) and column 1
    holds the article text.

    For every label the instance keeps a set of unique non-stopword tokens
    (``<name>_vocabulary``) and a token -> occurrence-count mapping
    (``<name>_vocab_dictionary``), both filled in by ``get_data``.
    """

    # Label code -> prefix shared by that label's ``<prefix>_vocabulary`` and
    # ``<prefix>_vocab_dictionary`` attributes.
    _LABEL_PREFIXES = {1: 'satire', 2: 'hoax', 3: 'propaganda', 4: 'reliable'}

    # Matches tokens made up solely of non-word characters and underscores
    # (and the empty token). Tokens such as 'mrs.' or 'u.s.a' contain word
    # characters and are therefore kept.
    _PUNCTUATION_ONLY = re.compile(r'^[\W_]*$')

    def __init__(self, path='./raw_data/fulltrain.csv', remove_punctuation=False):
        """Load the dataset and initialise empty per-label statistics.

        Args:
            path: Location of the header-less CSV file to read.
            remove_punctuation: When true, ``get_data`` discards
                punctuation-only tokens before counting stopwords and
                updating the vocabularies.
        """
        self.df = pd.read_csv(path, header=None)
        self.remove_punctuation = remove_punctuation

        # NLTK's English stopword list, stored as a set for O(1) membership.
        self.stopwords_set = set(stopwords.words('english'))

        # Unique non-stopword tokens seen for each label.
        self.satire_vocabulary = set()
        self.hoax_vocabulary = set()
        self.propaganda_vocabulary = set()
        self.reliable_vocabulary = set()

        # Token -> number of occurrences for each label.
        self.satire_vocab_dictionary = defaultdict(int)
        self.hoax_vocab_dictionary = defaultdict(int)
        self.propaganda_vocab_dictionary = defaultdict(int)
        self.reliable_vocab_dictionary = defaultdict(int)

    def get_data(self, top_n=50):
        """Process every row of the dataset and print summary statistics.

        Prints, per label: total and average token counts, total and average
        stopword counts, the unique vocabulary size and the ``top_n`` most
        frequent non-stopword tokens. As a side effect the per-label
        vocabulary sets and frequency dictionaries are updated.

        Args:
            top_n: How many of the most frequent tokens to print per label.

        Raises:
            KeyError: If a row carries a label other than 1, 2, 3 or 4.
        """
        df = self.df
        label_counts = self.count_labels()

        # Total number of tokens per label (counted before any filtering).
        raw_word_totals = {1: 0, 2: 0, 3: 0, 4: 0}
        # Total number of stopwords per label.
        stopword_totals = {1: 0, 2: 0, 3: 0, 4: 0}

        # Iterate the two columns directly; this avoids building a Series
        # object for every row as DataFrame.iterrows() does.
        for i, raw_label, text in zip(df.index, df[0], df[1]):
            print('Parsing row number: ' + str(i), end="\r")
            label = int(raw_label)
            tokens = self.tokenise_text(text)
            raw_word_totals[label] += len(tokens)

            # Punctuation can carry sentiment ('!', '?'), so dropping it is
            # optional.
            if self.remove_punctuation:
                tokens = self._strip_punctuation(tokens)

            # Normalisation/stemming or POS tagging could be inserted here,
            # before the stopwords are removed.

            stopword_count, content_tokens = self.count_and_remove_stopwords(tokens)
            stopword_totals[label] += stopword_count
            self.update_vocabulary(label, content_tokens)

        print()
        print('Raw total count')
        print(raw_word_totals)
        print('Raw stopword count')
        print(stopword_totals)
        print()

        print('Average number of words')
        print(self._average_per_label(raw_word_totals, label_counts))
        print('Average number of stop words')
        print(self._average_per_label(stopword_totals, label_counts))

        print()
        print('Unique vocabulary count: ')
        for prefix in self._LABEL_PREFIXES.values():
            vocabulary = getattr(self, prefix + '_vocabulary')
            print(prefix.capitalize() + ':' + str(len(vocabulary)))
        print()

        print('Sorting dictionaries...')
        top_words = {}
        for prefix in self._LABEL_PREFIXES.values():
            frequencies = getattr(self, prefix + '_vocab_dictionary')
            ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
            top_words[prefix] = ranked[:top_n]
        print('Sorting done!')

        for prefix, ranked in top_words.items():
            print(prefix.capitalize() + ':')
            print(ranked)

    @staticmethod
    def _average_per_label(totals, label_counts):
        """Return ``totals[label] / label_counts[label]`` for every label.

        A label with no rows yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
        """
        return {
            label: (totals[label] / count if count else 0.0)
            for label, count in label_counts.items()
        }

    def _strip_punctuation(self, tokens):
        """Return ``tokens`` without the punctuation-only entries."""
        return [token for token in tokens if not self._PUNCTUATION_ONLY.match(token)]

    # Counts the number of each label in the dataset
    def count_labels(self):
        """Count the rows carrying each label and print the result.

        Returns:
            A dict mapping each of the labels 1-4 to its number of rows.

        Raises:
            KeyError: If a row carries a label other than 1, 2, 3 or 4.
        """
        df = self.df
        print('Number of rows: ' + str(df.shape[0]))
        label_count_dict = {1: 0, 2: 0, 3: 0, 4: 0}
        for raw_label in df[0]:
            label_count_dict[int(raw_label)] += 1
        print('SATIRE, HOAX, PROPAGANDA, RELIABLE NEWS')
        print(label_count_dict)
        return label_count_dict

    # Updates a set of unique vocabulary
    def update_vocabulary(self, label, tokens):
        """Add ``tokens`` to the vocabulary and frequency table of ``label``.

        Args:
            label: Label code (1-4). Any other value is silently ignored.
            tokens: Iterable of token strings belonging to one document.
        """
        prefix = self._LABEL_PREFIXES.get(label)
        if prefix is None:
            return

        getattr(self, prefix + '_vocabulary').update(tokens)
        frequencies = getattr(self, prefix + '_vocab_dictionary')
        for token in tokens:
            frequencies[token] += 1

    # Returns number of stopwords in a sentence and the set of tokens without stopwords. Uses NLTK stopwords library
    def count_and_remove_stopwords(self, tokens):
        """Separate stopwords from the remaining tokens.

        Args:
            tokens: Iterable of token strings.

        Returns:
            A ``(stopword_count, remaining_tokens)`` tuple, where
            ``remaining_tokens`` is a list that preserves the input order.
        """
        stopword_count = 0
        remaining_tokens = []
        for token in tokens:
            if token in self.stopwords_set:
                stopword_count += 1
            else:
                remaining_tokens.append(token)
        return (stopword_count, remaining_tokens)

    # Returns a list of tokens - Utilises NLTK tokenizer
    def tokenise_text(self, text):
        """Lower-case ``text`` and split it with NLTK's word tokenizer.

        Missing text (``None`` or a float NaN, which is how pandas represents
        an empty CSV cell) is treated as an empty document.

        Returns:
            The list of tokens; ``[]`` for missing text.
        """
        # NaN is the only float value that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return []
        return nltk.word_tokenize(text.lower())

    def test_function(self):
        """Run the processing steps on a fixed sentence and print each stage."""
        raw_text = 'Mrs. Brown said that she will bring us to hear his speech whose who it their they ... <h3> U.S.A'
        tokens = self.tokenise_text(raw_text)
        print(tokens)

        # Method 1 - will not remove words like 'mrs.' or 'u.s.a'.
        # (Method 2 would keep only tokens for which str.isalpha() is true,
        # which removes those words as well.)
        depunctuated_tokens = self._strip_punctuation(tokens)
        print('DEPUNCTUATED')
        print(depunctuated_tokens)

        tagged_tokens = nltk.pos_tag(depunctuated_tokens)
        print(tagged_tokens)

        stopword_count, content_tokens = self.count_and_remove_stopwords(depunctuated_tokens)
        print('NO MORE STOP WORDS')
        print(stopword_count)
        print(content_tokens)

        print()
        test_vocab_dictionary = defaultdict(int)
        for word in content_tokens:
            test_vocab_dictionary[word] += 1
        print(sorted(test_vocab_dictionary.items(), key=lambda item: item[1], reverse=True))




In [50]:
# Load the default training CSV. remove_punctuation=True only influences
# get_data(); test_function() always filters punctuation-only tokens.
news_classifier = NewsClassifier(
    remove_punctuation=True,
)
# Smoke-test the processing steps on a fixed sentence. To analyse the whole
# dataset instead, call: news_classifier.get_data()
news_classifier.test_function()

['mrs.', 'brown', 'said', 'that', 'she', 'will', 'bring', 'us', 'to', 'hear', 'his', 'speech', 'whose', 'who', 'it', 'their', 'they', '...', '<', 'h3', '>', 'u.s.a']
DEPUNCTUATED
['mrs.', 'brown', 'said', 'that', 'she', 'will', 'bring', 'us', 'to', 'hear', 'his', 'speech', 'whose', 'who', 'it', 'their', 'they', 'h3', 'u.s.a']
[('mrs.', 'NNS'), ('brown', 'VBN'), ('said', 'VBD'), ('that', 'IN'), ('she', 'PRP'), ('will', 'MD'), ('bring', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('hear', 'VB'), ('his', 'PRP$'), ('speech', 'NN'), ('whose', 'WP$'), ('who', 'WP'), ('it', 'PRP'), ('their', 'PRP$'), ('they', 'PRP'), ('h3', 'VBP'), ('u.s.a', 'JJ')]
NO MORE STOP WORDS
9
['mrs.', 'brown', 'said', 'bring', 'us', 'hear', 'speech', 'whose', 'h3', 'u.s.a']

[('mrs.', 1), ('brown', 1), ('said', 1), ('bring', 1), ('us', 1), ('hear', 1), ('speech', 1), ('whose', 1), ('h3', 1), ('u.s.a', 1)]
