In [1]:
import os
import pickle 
import numpy as np
import pandas as pd
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import spacy

# from nltk import word_tokenize, sent_tokenize
from string import punctuation
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Fetch the NLTK data packages this notebook asks for: 'stopwords', 'punkt' and 'omw-1.4'.
# As the log below shows, a package that is already present is reported as up to date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fatha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fatha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\fatha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Ask spaCy to use the GPU when one is available; the displayed return value reports
# whether that worked. This runs before the pipeline is loaded in the next cell.
spacy.prefer_gpu()

True

In [4]:
# Load spaCy's "en_core_web_sm" pipeline once. The helper functions defined later in this
# notebook (get_pos, preprocess_statement, preprocess_statement2) call this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Column names applied to the header-less TSV files, in positional order.
columns = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job title',
    'state info', 'party',
    'barely true', 'false', 'half-true', 'mostly-true', 'pants-on-fire',
    'context',
]

# Signed integer score for each truthfulness label: negative values for the
# "false" side, positive values for the "true" side (0 is not used).
label_map = {
    'pants-fire': -3,
    'false': -2,
    'barely-true': -1,
    'half-true': 1,
    'mostly-true': 2,
    'true': 3,
}

In [30]:
# load dataset files as pandas dataframes
# The three splits are tab-separated files without a header row, so the column names
# are supplied from `columns`. Paths are relative to the notebook's working directory.
train_data = pd.read_csv('train.tsv', sep='\t', header=None, names=columns)
val_data = pd.read_csv('valid.tsv', sep='\t', header=None, names=columns)
test_data = pd.read_csv('test.tsv', sep='\t', header=None, names=columns)

# preview the first rows of the training split
train_data.head()

Unnamed: 0,id,label,statement,subject,speaker,job title,state info,party,barely true,false,half-true,mostly-true,pants-on-fire,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


# Preprocessing

## Functions

### Sunday 12 March

TO DO FOR the WEEK: <br>
    - add function to remove stopwords from pos_id X
1. og statement -> get pos -> pos_id
	- train_data['pos_id'] = train_data['statement'].apply(get_pos)

2. og statement -> # preprocess statement: words lemmatized and stopwords are removed -> pred_statement
	- train_data['pred_statement'] = train_data['statement'].apply(preprocess_statement)

3. pred_statement -> add to vocab dict: add preprocessed statements to vocab dict -> vocab_dict
	- vocabulary_dict = load_statement_vocab_dict(train_data)

4. vocab_dict, pred_statement -> getWordID(): turn preprocessed statements to vector -> val
	- train_data['word_id'] = train_data['pred_statement'].apply(lambda x: getWordId(x, vocabulary_dict))

**NOTE:** it is generally fine to compute POS tags on the original statement, without removing stopwords first.
	In siddarthhari's version, the POS-tag sequences are padded and fed to the model as a separate input layer.

15 mar: <br>
    - extracted getWordId() from preprocess_statement()
    


### 19 March
TODO: 
1. POS tags - incorporate stop words into pos tags 
- update: no need since separate input.
2. preprocess_statement() - configure best ones. confirm lemmatizer works

In [7]:
# Human-readable names for spaCy's coarse-grained POS tags (token.pos_).
# spaCy tags coordinating conjunctions as 'CCONJ'; the older 'CONJ' spelling is
# kept as well so existing lookups keep working.
pos_tags = {'ADJ': 'adjective', 'ADP': 'adposition', 'ADV': 'adverb',
            'AUX': 'auxiliary verb', 'CONJ': 'coordinating conjunction',
            'CCONJ': 'coordinating conjunction',
            'DET': 'determiner', 'INTJ': 'interjection', 'NOUN': 'noun',
            'NUM': 'numeral', 'PART': 'particle', 'PRON': 'pronoun',
            'PROPN': 'proper noun', 'PUNCT': 'punctuation', 'X': 'other',
            'SCONJ': 'subordinating conjunction', 'SYM': 'symbol', 'VERB': 'verb'}

# proposed pos_dict: values correspond to pos tag importance
# (DET, SYM and X share the last id, 14). 'CCONJ' gets the same id as 'CONJ';
# without it, conjunctions would fall through to get_pos's fallback id.
dict1 = {'NOUN': 0, 'VERB': 1, 'ADJ': 2, 'ADV': 3, 'ADP': 4, 'PRON': 5,
            'NUM': 6, 'SCONJ': 7, 'PROPN': 8, 'CONJ': 9, 'CCONJ': 9, 'PUNCT': 10,
            'AUX': 11, 'PART': 12, 'INTJ': 13, 'DET': 14, 'SYM': 14, 'X': 14}

# alternative pos_dict: ids follow the order of pos_tags above.
# 'CCONJ' shares the id of 'CONJ'; without it, get_pos's fallback (the largest
# id, 16) would label every coordinating conjunction as a VERB.
dict2 = {'ADJ': 0, 'ADP': 1, 'ADV': 2, 'AUX': 3, 'CONJ': 4, 'CCONJ': 4,
            'DET': 5, 'INTJ': 6, 'NOUN': 7, 'NUM': 8, 'PART': 9,
            'PRON': 10, 'PROPN': 11, 'PUNCT': 12, 'X': 13,
            'SCONJ': 14, 'SYM': 15, 'VERB': 16}

In [None]:
def get_pos(statement, pos_dict):
    """Convert a statement into a list of integer POS-tag ids.

    Args:
        statement: Text to tag; it is parsed with the module-level spaCy
            pipeline ``nlp``.
        pos_dict: Mapping from a coarse POS tag (``token.pos_``) to an integer
            id, e.g. ``dict1`` or ``dict2``.

    Returns:
        A list with one id per spaCy token, in token order. No tokens are
        filtered out. A tag that is missing from ``pos_dict`` receives the
        largest id found in ``pos_dict``.

    Raises:
        ValueError: if ``pos_dict`` is empty and the statement has at least
            one token (there is no fallback id to use).
    """
    doc = nlp(statement)
    if len(doc) == 0:
        # Nothing to tag, so no fallback id is needed either.
        return []
    # The fallback does not depend on the token: compute it once, not per token.
    fallback_id = max(pos_dict.values())
    return [pos_dict.get(token.pos_, fallback_id) for token in doc]

In [8]:
# words are lemmatized and lowercased; punctuation, whitespace and selected POS tags are dropped
# (stopwords are NOT removed here)
def preprocess_statement(statement):
    """Return the statement as a space-joined string of lowercased lemmas.

    The statement is parsed with the module-level spaCy pipeline ``nlp``.
    Tokens are skipped when they are punctuation, whitespace, or carry one of
    the excluded coarse POS tags listed below. Stopwords are kept.

    Args:
        statement: Raw statement text.

    Returns:
        The surviving tokens' lowercased lemmas joined by single spaces
        (an empty string when nothing survives).
    """
    # spaCy tags coordinating conjunctions as 'CCONJ', so 'CONJ' alone never
    # matched them; 'CONJ' is kept for compatibility with older tag sets.
    excluded_pos = {'SYM', 'DET', 'X', 'PRON', 'PART', 'CONJ', 'CCONJ', 'ADP'}
    doc = nlp(statement)
    words = [
        token.lemma_.lower()  # lemmatize and lowercase
        for token in doc
        if not token.is_punct
        and not token.is_space
        and token.pos_ not in excluded_pos
    ]
    return ' '.join(words)

In [28]:
def preprocess_statement2(statement, exclude_pos=None, remove_stopwords=False):
    """Lemmatize and lowercase the tokens of a statement, with optional filters.

    Args:
        statement: Text to process; parsed with the module-level ``nlp``.
        exclude_pos: Optional collection of POS tags (``token.pos_`` values);
            tokens carrying one of them are left out. Nothing is excluded
            when this is ``None`` or empty.
        remove_stopwords: When true, tokens flagged by spaCy as stop words
            are left out as well.

    Returns:
        The remaining tokens' lowercased lemmas joined by single spaces.
        Punctuation and whitespace tokens are never included.
    """
    def _keep(tok):
        # POS-based exclusion only applies when a non-empty list was given.
        if exclude_pos and tok.pos_ in exclude_pos:
            return False
        if remove_stopwords and tok.is_stop:
            return False
        return not (tok.is_punct or tok.is_space)

    lemmas = (tok.lemma_.lower() for tok in nlp(statement) if _keep(tok))
    return ' '.join(lemmas)

In [9]:
# loads the cached vocabulary pickle if it exists, otherwise builds and saves it
def load_statement_vocab_dict(train_data, vocab_path="vocabulary.p"):
    """Return the word -> id vocabulary, using a pickle file as a cache.

    Args:
        train_data: Frame with a 'pred_statement' column of preprocessed
            statements. It is only read when the cache file does not exist.
        vocab_path: Location of the pickle cache. Defaults to 'vocabulary.p'
            in the current working directory.

    Returns:
        The vocabulary dictionary: either the object stored in the cache file,
        or the ``word_index`` of a Keras ``Tokenizer`` freshly fitted on
        ``train_data['pred_statement']``.

    Note:
        An existing cache is returned as-is and ``train_data`` is ignored, so
        the result can be stale after the preprocessing has changed. Use
        ``create_statement_vocab_dict`` to force a rebuild.
    """
    if os.path.exists(vocab_path):
        print('Loading Vocabulary Dictionary...')
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # vocab_path at files this notebook wrote itself.
        with open(vocab_path, "rb") as f:
            return pickle.load(f)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_data['pred_statement'])
    vocabulary_dict = tokenizer.word_index
    print(len(vocabulary_dict))
    with open(vocab_path, "wb") as f:
        pickle.dump(vocabulary_dict, f)
    print('Created Vocabulary Dictionary...')
    print('Saved Vocabulary Dictionary...')
    return vocabulary_dict

In [1]:
# rebuilds the vocabulary from scratch and overwrites any existing cache file
def create_statement_vocab_dict(train_data, vocab_path="vocabulary.p"):
    """Build the word -> id vocabulary and (re)write the pickle cache.

    Args:
        train_data: Frame with a 'pred_statement' column of preprocessed
            statements.
        vocab_path: Location of the pickle cache. Defaults to 'vocabulary.p'
            in the current working directory.

    Returns:
        The ``word_index`` of a Keras ``Tokenizer`` fitted on
        ``train_data['pred_statement']``.

    Raises:
        KeyError: if ``train_data`` has no 'pred_statement' column. In that
            case an existing cache file is left untouched.
    """
    # Build the new vocabulary first: if this step fails, the old cache
    # must not already have been deleted.
    statements = train_data['pred_statement']
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(statements)
    vocabulary_dict = tokenizer.word_index

    if os.path.exists(vocab_path):
        os.remove(vocab_path)
        print('Deleted existing Vocabulary Dictionary...')

    print(len(vocabulary_dict))
    with open(vocab_path, "wb") as f:
        pickle.dump(vocabulary_dict, f)
    print('Created Vocabulary Dictionary...')
    print('Saved Vocabulary Dictionary...')

    return vocabulary_dict

In [10]:
# preprocessed statement to vector
def getWordId(pred_statement, vocabulary_dict):
    """Map a preprocessed statement to the list of its vocabulary ids.

    Args:
        pred_statement: Preprocessed statement text; it is split with Keras'
            ``text_to_word_sequence``.
        vocabulary_dict: Mapping from word to integer id.

    Returns:
        The ids of the words found in ``vocabulary_dict``, in order of
        appearance. Words missing from the vocabulary are skipped, and the
        list is not padded or truncated to any fixed length.
    """
    tokens = text_to_word_sequence(pred_statement)
    return [vocabulary_dict[tok] for tok in tokens if tok in vocabulary_dict]

## DataFrame Cleaning

- train_data['label'] -> num values
- replaces missing values in the 'subject' and 'speaker' columns with empty strings
- drops all rows that have missing values in 'statement'
- drops context column

<h3>Mar 19 todo:</h3>

- refactor to single function
- instead of dropping rows, replace missing values in 'subject' and 'speaker' as empty string

In [31]:
def clean_data(data, label_map):
    """Return a cleaned copy of one dataset split.

    The input frame is left unmodified. The returned frame:

    * has no rows whose 'statement' is missing or contains only whitespace;
    * has 'label' replaced by its value in ``label_map`` (labels that are not
      keys of ``label_map`` become NaN);
    * has missing 'subject' / 'speaker' values replaced by empty strings;
    * no longer has the 'context' column;
    * has a fresh 0..n-1 index, with the previous row labels kept in a new
      leading 'index' column (``reset_index`` is called without ``drop``).

    Args:
        data: DataFrame with at least the columns 'label', 'statement',
            'subject', 'speaker' and 'context'.
        label_map: Mapping from label string to numeric value.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: if one of the required columns is absent, e.g. when an
            already-cleaned frame (without 'context') is passed in again.
    """
    statement = data['statement']
    # astype(str) keeps .str usable even if the column holds non-string values;
    # real missing values are caught by isna() before they are stringified.
    is_blank = statement.isna() | statement.astype(str).str.strip().eq('')

    cleaned = (
        data.loc[~is_blank]
        .assign(
            label=lambda df: df['label'].map(label_map),
            subject=lambda df: df['subject'].fillna(''),
            speaker=lambda df: df['speaker'].fillna(''),
        )
        .drop(columns=['context'])
        .reset_index()
    )
    return cleaned

In [32]:
# Clean each split with the same label mapping and rebind the name to the returned frame.
# NOTE: this cell is not re-runnable on its own output. clean_data drops the 'context'
# column, so passing an already-cleaned frame raises KeyError; reload the TSV files first.
train_data = clean_data(train_data, label_map)
val_data = clean_data(val_data, label_map)
test_data = clean_data(test_data, label_map)

In [13]:
train_data.head()

Unnamed: 0,index,id,label,statement,subject,speaker,job title,state info,party,barely true,false,half-true,mostly-true,pants-on-fire
0,0,2635.json,-2,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0
1,1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0
2,2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0
3,3,1123.json,-2,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0
4,4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0


In [14]:
# Rows whose mapped label is positive (half-true, mostly-true, true in label_map) are counted as real news.
# count() reports the non-null entries per column, so columns with missing values
# ('job title', 'state info') show smaller numbers than the row count.
print('real news count')
train_data[train_data['label']>0].count()

real news count


index            5752
id               5752
label            5752
statement        5752
subject          5752
speaker          5752
job title        4264
state info       4663
party            5752
barely true      5752
false            5752
half-true        5752
mostly-true      5752
pants-on-fire    5752
dtype: int64

### get pos tag

In [None]:
# Re-declares the POS lookup tables that are also defined earlier in the notebook;
# when this cell is run, these definitions replace the earlier ones, so keep both in sync.
# spaCy tags coordinating conjunctions as 'CCONJ'; the older 'CONJ' spelling is
# kept as well so existing lookups keep working.
pos_tags = {'ADJ': 'adjective', 'ADP': 'adposition', 'ADV': 'adverb',
            'AUX': 'auxiliary verb', 'CONJ': 'coordinating conjunction',
            'CCONJ': 'coordinating conjunction',
            'DET': 'determiner', 'INTJ': 'interjection', 'NOUN': 'noun',
            'NUM': 'numeral', 'PART': 'particle', 'PRON': 'pronoun',
            'PROPN': 'proper noun', 'PUNCT': 'punctuation', 'X': 'other',
            'SCONJ': 'subordinating conjunction', 'SYM': 'symbol', 'VERB': 'verb'}

# proposed pos_dict: values correspond to pos tag importance
# (DET, SYM and X share the last id, 14). 'CCONJ' gets the same id as 'CONJ';
# without it, conjunctions would fall through to get_pos's fallback id.
dict1 = {'NOUN': 0, 'VERB': 1, 'ADJ': 2, 'ADV': 3, 'ADP': 4, 'PRON': 5,
            'NUM': 6, 'SCONJ': 7, 'PROPN': 8, 'CONJ': 9, 'CCONJ': 9, 'PUNCT': 10,
            'AUX': 11, 'PART': 12, 'INTJ': 13, 'DET': 14, 'SYM': 14, 'X': 14}

# alternative pos_dict: ids follow the order of pos_tags above.
# 'CCONJ' shares the id of 'CONJ'; without it, get_pos's fallback (the largest
# id, 16) would label every coordinating conjunction as a VERB.
dict2 = {'ADJ': 0, 'ADP': 1, 'ADV': 2, 'AUX': 3, 'CONJ': 4, 'CCONJ': 4,
            'DET': 5, 'INTJ': 6, 'NOUN': 7, 'NUM': 8, 'PART': 9,
            'PRON': 10, 'PROPN': 11, 'PUNCT': 12, 'X': 13,
            'SCONJ': 14, 'SYM': 15, 'VERB': 16}

In [38]:
# Tag every training statement with dict1's POS ids: each row gets a list with one id per token.
# The validation/test equivalents below are currently disabled.
train_data['pos_id'] = train_data['statement'].apply(get_pos, pos_dict=dict1)
# val_data['pos_id'] = val_data['statement'].apply(get_pos, pos_dict=dict1)
# test_data['pos_id'] = test_data['statement'].apply(get_pos, pos_dict=dict1)

In [None]:
train_data.head()

### preprocess statements, get word_id

In [33]:
# POS tags whose tokens are left out of the preprocessed statement (a list, despite the name).
# spaCy tags coordinating conjunctions as 'CCONJ', so 'CONJ' alone never matched them;
# 'CONJ' is kept for compatibility with older tag sets.
prep_pos_dict = ['SYM', 'DET', 'X', 'PRON', 'PART', 'CONJ', 'CCONJ', 'ADP']

In [34]:
# Build the cleaned-text column for the training split: tokens tagged with a POS listed in
# prep_pos_dict are dropped and stop words are removed (see preprocess_statement2).
train_data['pred_statement'] = train_data['statement'].apply(preprocess_statement2, exclude_pos=prep_pos_dict, remove_stopwords=True)
# NOTE: the disabled lines below still use the older preprocess_statement, which filters differently.
# val_data['pred_statement'] = val_data['statement'].apply(preprocess_statement)
# test_data['pred_statement'] = test_data['statement'].apply(preprocess_statement)
# train_data.head()

In [35]:
# Returns the cached vocabulary from 'vocabulary.p' when that file exists; otherwise fits a
# Tokenizer on train_data['pred_statement'] and writes the cache.
# NOTE: an existing cache is reused as-is, so after changing the preprocessing options
# call create_statement_vocab_dict(train_data) instead to rebuild it.
vocabulary_dict = load_statement_vocab_dict(train_data)

9459
Created Vocabulary Dictionary...
Saved Vocabulary Dictionary...


In [36]:
# Turn each preprocessed statement into its list of vocabulary ids (words not in
# vocabulary_dict are skipped by getWordId). Validation/test equivalents are disabled.
train_data['word_id'] = train_data['pred_statement'].apply(lambda x: getWordId(x, vocabulary_dict))
# val_data['word_id'] = val_data['pred_statement'].apply(lambda x: getWordId(x, vocabulary_dict))
# test_data['word_id'] = test_data['pred_statement'].apply(lambda x: getWordId(x, vocabulary_dict))
# train_data.drop('pred_statement', axis=1, inplace=True)

In [39]:
train_data.head()

Unnamed: 0,index,id,label,statement,subject,speaker,job title,state info,party,barely true,false,half-true,mostly-true,pants-on-fire,pred_statement,word_id,pos_id
0,0,2635.json,-2,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,say annies list political group support trimes...,"[1, 5297, 632, 420, 330, 36, 3906, 117, 931]","[1, 14, 8, 8, 2, 0, 1, 2, 10, 0, 0, 4, 0, 10]"
1,1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,decline coal start start natural gas take star...,"[718, 771, 246, 246, 886, 201, 45, 246, 525, 1...","[7, 1, 14, 0, 4, 0, 0, 10, 5, 1, 7, 2, 0, 1, 4..."
2,2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,hillary clinton agree john mccain vote george ...,"[73, 48, 648, 122, 155, 12, 209, 102, 205, 273...","[8, 8, 1, 4, 8, 8, 10, 4, 1, 12, 1, 8, 8, 14, ..."
3,3,1123.json,-2,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,health care reform legislation likely mandate ...,"[13, 15, 163, 200, 512, 340, 299, 370, 137, 2738]","[0, 0, 0, 0, 11, 2, 12, 1, 2, 0, 0, 0, 10]"
4,4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,economic turnaround start end term,"[222, 3196, 246, 196, 197]","[14, 2, 0, 1, 4, 14, 0, 4, 5, 0, 10]"


### compare preprocessed statements with original 

In [None]:
print(train_data['statement']) 
print(train_data['pred_statement'])

In [None]:
print(train_data.loc[25,'statement'])
print(train_data.loc[25,'pred_statement'])

In [40]:
def print_row_info(row_num, df):
    """Print the label, raw statement and derived features of one row.

    Args:
        row_num: Index label of the row (looked up with ``df.loc``).
        df: Frame containing the columns 'label', 'statement', 'pos_id',
            'pred_statement' and 'word_id'.

    Each field is printed as a ``#### <title> #####`` heading followed by the
    value on the next line. A missing column raises ``KeyError`` when it is
    reached, after the earlier fields have already been printed.
    """
    sections = (
        ('label', 'label'),
        ('original statement', 'statement'),
        ('pos_id', 'pos_id'),
        ('preprocessed statement', 'pred_statement'),
        ('word_id', 'word_id'),
    )
    for heading, column in sections:
        print(f"#### {heading} #####\n{df.loc[row_num, column]}")

In [43]:
print_row_info(2000, train_data)

#### label #####
3
#### original statement #####
At Bain Capital, we helped start an early childhood learning company called Bright Horizons that First Lady Michelle Obama rightly praised.
#### pos_id #####
[4, 8, 8, 10, 5, 1, 1, 14, 2, 0, 1, 0, 1, 8, 8, 5, 8, 8, 8, 8, 3, 1, 10]
#### preprocessed statement #####
bain capital help start early childhood learn company call bright horizons lady michelle obama rightly praise
#### word_id #####
[2419, 654, 146, 246, 402, 1694, 1406, 107, 187, 3207, 5326, 2157, 1235, 5, 5327, 2420]


In [27]:
# 22 mar 11:25 AM
print_row_info(61, train_data)

#### label #####
2
#### original statement #####
Said he's the only Republican candidate "who's actually turned around a government economy."
#### preprocessed statement #####
say he be only republican candidate who be actually turn around government economy
#### word_id #####
[6, 25, 1, 65, 100, 181, 52, 1, 227, 494, 580, 71, 220]


In [21]:
# 21 mar 1:20 pm
print_row_info(2000, train_data)

#### label #####
2
#### original statement #####
The DREAM Act was written by members of both parties. When it came up for a vote a year and a half ago, Republicans in Congress blocked it. The bill hadnt changed. ... The only thing that had changed was politics.
#### preprocessed statement #####
dream act be write by member of party when it come up for vote year and half ago republicans in congress block it bill have not change only thing that have change be politic
#### word_id #####
[1390, 209, 1, 551, 19, 243, 3, 257, 62, 20, 123, 78, 8, 35, 12, 7, 236, 327, 223, 2, 150, 753, 20, 51, 5, 11, 217, 65, 451, 9, 5, 217, 1, 1612]


In [None]:
# 21 mar 10:00 am
print_row_info(100, train_data)

In [None]:
# removed det, sym, etc from pred_statement
print_row_info(2000, train_data)

In [None]:
# stop words = 17
print_row_info(100, train_data)

In [None]:
# Ad-hoc inspection of a single training row: shows the same five fields as
# print_row_info, with different headings and the label last.
# print(train_data['pred_statement']) 
ROW = 101
print(f"original statements:\n{train_data.loc[ROW, 'statement']}")
print(f"pos tag ids:\n{train_data.loc[ROW, 'pos_id']}") 
print(f"preprocessed statements:\n{train_data.loc[ROW, 'pred_statement']}")
print(f"pred_statements vectorized:\n{train_data.loc[ROW, 'word_id']}")
print(train_data.loc[ROW, 'label'])

In [None]:
# # if not token.is_punct and not token.is_space
# ROW = 100
# print(f"original statements:\n{train_data.loc[ROW, 'statement']}")
# print(f"pos tag ids:\n{train_data.loc[ROW, 'pos_id']}")
# print(f"preprocessed statements:\n{train_data.loc[ROW, 'pred_statement']}")
# print(f"pred_statements vectorized:\n{train_data.loc[ROW, 'word_id']}")
# print(train_data.loc[ROW, 'label'])

In [None]:
# # if not token.is_punct and not token.is_space and token.pos_ not in ['DET', 'ADP', 'CONJ', 'PRON']: # and not token.is_stop:
# ROW = 4
# print(f"original statements:\n{train_data.loc[ROW, 'statement']}")
# print(f"pos tag ids:\n{train_data.loc[ROW, 'pos_id']}")
# print(f"preprocessed statements:\n{train_data.loc[ROW, 'pred_statement']}")
# print(f"pred_statements vectorized:\n{train_data.loc[ROW, 'word_id']}")
# print(train_data.loc[ROW, 'label'])