In [1]:
import pandas as pd
import os.path
import pickle 
import numpy as np
import tensorflow.keras.utils

import time
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import CSVLogger
#from keras.callbacks import CSVLogger
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,LSTM,Conv1D,GlobalMaxPool1D,Dropout,Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot
from tensorflow.keras.models import load_model
from nltk.corpus import stopwords
import operator
import nltk
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', 1)
from tensorflow.keras import backend as K


In [4]:
train_data = pd.read_table('train.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])

val_data = pd.read_table('valid.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])

test_data = pd.read_table('test.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])

In [5]:
print (train_data.shape, val_data.shape, test_data.shape)
print (train_data.label.unique())
train_data.head()

(10240, 14) (1284, 14) (1267, 14)
['false' 'half-true' 'mostly-true' 'true' 'barely-true' 'pants-fire']


Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,half-true,mostly-true,pants-fire,venue
0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by voting to give George Bush the benefit of the doubt on Iran.""",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to mandate free sex change surgeries.,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of my term.,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [6]:
print(train_data.info())
print (val_data.info())
print (test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10240 non-null  object 
 1   label        10240 non-null  object 
 2   statement    10240 non-null  object 
 3   subject      10238 non-null  object 
 4   speaker      10238 non-null  object 
 5   job          7343 non-null   object 
 6   state        8032 non-null   object 
 7   party        10238 non-null  object 
 8   barely-true  10238 non-null  float64
 9   false        10238 non-null  float64
 10  half-true    10238 non-null  float64
 11  mostly-true  10238 non-null  float64
 12  pants-fire   10238 non-null  float64
 13  venue        10138 non-null  object 
dtypes: float64(5), object(9)
memory usage: 1.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1284 entries, 0 to 1283
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------   

In [7]:
y_label_dict = {"pants-fire" : 0, "false" : 1, "barely-true" : 2, "half-true" : 3, "mostly-true" : 4, "true" : 5}
print (y_label_dict)

train_data['output'] = train_data['label'].apply(lambda x: y_label_dict[x])
val_data['output'] = val_data['label'].apply(lambda x: y_label_dict[x])
test_data['output'] = test_data['label'].apply(lambda x: y_label_dict[x])

num_classes = 6

{'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}


In [8]:
frequent_speakers = {'barack-obama' : 0, 'donald-trump' : 1, 'hillary-clinton' : 2, 
                     'mitt-romney' : 3, 'scott-walker' : 4, 'john-mccain' : 5, 
                     'rick-perry' : 6, 'chain-email' : 7, 'marco-rubio' : 8, 'viral-image':13,
                     'rick-scott' : 9, 'ted-cruz' : 10, 'bernie-s' : 11, 'newt-gingrich':16,
                     'chris-christie' : 12, 'facebook-posts' : 13,'blog-posting':13, 
                     'charlie-crist' : 14, 'congressional' : 15, 'republican' : 15, 
                     'national-committe' : 15, 'democratic':15}

print (frequent_speakers)

def get_speaker_id(speaker):
  if isinstance(speaker, str):
    matched = [sp for sp in frequent_speakers if sp in speaker.lower() ]
    if len(matched)>0:
      return frequent_speakers[matched[0]]
    else:
      return len(set(frequent_speakers.values())) 
  else:
    return len(set(frequent_speakers.values())) 
  

train_data['speaker_id'] = train_data['speaker'].apply(get_speaker_id)
val_data['speaker_id'] = val_data['speaker'].apply(get_speaker_id)
test_data['speaker_id'] = test_data['speaker'].apply(get_speaker_id)

print (len(set(frequent_speakers.values()))) 

train_data['speaker_id'].value_counts()

{'barack-obama': 0, 'donald-trump': 1, 'hillary-clinton': 2, 'mitt-romney': 3, 'scott-walker': 4, 'john-mccain': 5, 'rick-perry': 6, 'chain-email': 7, 'marco-rubio': 8, 'viral-image': 13, 'rick-scott': 9, 'ted-cruz': 10, 'bernie-s': 11, 'newt-gingrich': 16, 'chris-christie': 12, 'facebook-posts': 13, 'blog-posting': 13, 'charlie-crist': 14, 'congressional': 15, 'republican': 15, 'national-committe': 15, 'democratic': 15}
17


17    7348
0     488 
15    347 
1     275 
2     239 
3     176 
13    156 
4     149 
5     148 
6     142 
7     142 
8     117 
9     115 
10    93  
11    88  
12    78  
14    70  
16    69  
Name: speaker_id, dtype: int64

In [9]:
frequent_jobs = { 'senator' : 0, 'president' : 1, 'governor' : 2, 
                 'u.s. representative' : 3, 'attorney' : 4, 'congressman' : 5, 
                 'congresswoman' : 5, 'social media posting' : 6, 'lawyer' : 4, 
                 'businessman' : 6,  'radio host' : 8, 'host':8,
                  'mayor' : 7, 'assembly' : 9,'representative' : 3, 
                 'senate' : 9,'state representative' : 10,'milwaukee county executive' : 11,
                 'u.s. house of representatives' : 3,'house representative' : 3,
                 'house of representatives' : 3,'house member':3}


print (frequent_jobs)

def get_job_id(job):
  if isinstance(job, str):
    matched = [jb for jb in frequent_jobs if jb in job.lower() ]
    if len(matched)>0:
      return frequent_jobs[matched[0]]
    else:
      return len(set(frequent_jobs.values()))
  else:
    return len(set(frequent_jobs.values()))
  

train_data['job_id'] = train_data['job'].apply(get_job_id)
val_data['job_id'] = val_data['job'].apply(get_job_id)
test_data['job_id'] = test_data['job'].apply(get_job_id)

print (len(set(frequent_jobs.values())))

train_data['job_id'].value_counts()

{'senator': 0, 'president': 1, 'governor': 2, 'u.s. representative': 3, 'attorney': 4, 'congressman': 5, 'congresswoman': 5, 'social media posting': 6, 'lawyer': 4, 'businessman': 6, 'radio host': 8, 'host': 8, 'mayor': 7, 'assembly': 9, 'representative': 3, 'senate': 9, 'state representative': 10, 'milwaukee county executive': 11, 'u.s. house of representatives': 3, 'house representative': 3, 'house of representatives': 3, 'house member': 3}
12


12    4597
1     1212
0     1201
3     911 
2     892 
8     279 
9     253 
5     232 
4     223 
7     167 
11    149 
6     124 
Name: job_id, dtype: int64

In [10]:
frequent_parties = train_data['party'].str.lower().value_counts()[:5].reset_index().to_dict()['index']
frequent_parties = dict((v,k) for k,v in frequent_parties.items())
print (frequent_parties)
#frequent_parties['columnist']=frequent_parties['journalist']
#frequent_parties['talk-show-host']=frequent_parties['journalist']
def get_party_id(party):
  if isinstance(party, str):
    matched = [pt for pt in frequent_parties if pt in party.lower() ]
    if len(matched)>0:
      return frequent_parties[matched[0]]
    else:
      return len(set(frequent_parties.values())) 
  else:
    return len(set(frequent_parties.values())) 
  

train_data['party_id'] = train_data['party'].apply(get_party_id)
val_data['party_id'] = val_data['party'].apply(get_party_id)
test_data['party_id'] = test_data['party'].apply(get_party_id)

print (len(set(frequent_parties.values())) )

train_data['party_id'].value_counts()

{'republican': 0, 'democrat': 1, 'none': 2, 'organization': 3, 'independent': 4}
5


0    4497
1    3337
2    1744
5    296 
3    219 
4    147 
Name: party_id, dtype: int64

In [11]:
train_data.loc[train_data['party_id']==9]['party'].value_counts()[:90]

Series([], Name: party, dtype: int64)

In [12]:
other_states = ['wyoming', 'colorado', 'hawaii', 'tennessee', 'nevada', 'maine',
                'north dakota', 'mississippi', 'south dakota', 'oklahoma', 
                'delaware', 'minnesota', 'north carolina', 'arkansas', 'indiana', 
                'maryland', 'louisiana', 'idaho', 'iowa', 'west virginia', 
                'michigan', 'kansas', 'utah', 'connecticut', 'montana', 'vermont', 
                'pennsylvania', 'alaska', 'kentucky', 'nebraska', 'new hampshire', 
                'missouri', 'south carolina', 'alabama', 'new mexico']


frequent_states = {'texas': 1, 'florida': 2, 'wisconsin': 3, 'new york': 4, 
                    'illinois': 5, 'ohio': 6, 'georgia': 7, 'virginia': 8, 
                   'rhode island': 9, 'oregon': 10, 'new jersey': 11, 
                   'massachusetts': 12, 'arizona': 13, 'california': 14, 
                   'washington': 15}
for state in other_states:
  frequent_states[state]=0

print (frequent_states)

def get_state_id(state):
    if isinstance(state,str):
        if state.lower().rstrip() in frequent_states:
            return frequent_states[state.lower().rstrip()]
        else:
            if 'washington' in state.lower():
                return frequent_states['washington']
            else:
                return len(set(frequent_states.values()))
    else:
        return len(set(frequent_states.values()))

train_data['state_id'] = train_data['state'].apply(get_state_id)
val_data['state_id'] = val_data['state'].apply(get_state_id)
test_data['state_id'] = test_data['state'].apply(get_state_id)

print (len(set(frequent_states.values())))

train_data['state_id'].value_counts()

{'texas': 1, 'florida': 2, 'wisconsin': 3, 'new york': 4, 'illinois': 5, 'ohio': 6, 'georgia': 7, 'virginia': 8, 'rhode island': 9, 'oregon': 10, 'new jersey': 11, 'massachusetts': 12, 'arizona': 13, 'california': 14, 'washington': 15, 'wyoming': 0, 'colorado': 0, 'hawaii': 0, 'tennessee': 0, 'nevada': 0, 'maine': 0, 'north dakota': 0, 'mississippi': 0, 'south dakota': 0, 'oklahoma': 0, 'delaware': 0, 'minnesota': 0, 'north carolina': 0, 'arkansas': 0, 'indiana': 0, 'maryland': 0, 'louisiana': 0, 'idaho': 0, 'iowa': 0, 'west virginia': 0, 'michigan': 0, 'kansas': 0, 'utah': 0, 'connecticut': 0, 'montana': 0, 'vermont': 0, 'pennsylvania': 0, 'alaska': 0, 'kentucky': 0, 'nebraska': 0, 'new hampshire': 0, 'missouri': 0, 'south carolina': 0, 'alabama': 0, 'new mexico': 0}
16


16    2238
0     1211
1     1009
2     1003
3     714 
4     659 
5     558 
6     448 
7     433 
8     408 
9     371 
10    242 
11    241 
12    212 
13    182 
14    163 
15    148 
Name: state_id, dtype: int64

In [13]:
frequent_subjects = {'health': 0, 'tax': 1, 'immigration': 2, 'election': 3, 
                     'education': 4, 'candidates-biography': 5, 'economy': 6, 
                     'gun': 7, 'job': 8, 'federal-budget': 6, 'energy': 9, 
                     'abortion': 10, 'foreign-policy': 6, 'state-budget': 6, 
                     'crime': 11, 'gays-and-lesbians' : 12, 'medicare' : 0, 
                     'terrorism' : 11, 'finance' : 6, 'criminal':11,
                     'transportation':13}

print (frequent_subjects)


def get_subject_id(subject):
  if isinstance(subject, str):
    matched = [sbj for sbj in frequent_subjects if sbj in subject.lower() ]
    if len(matched)>0:
      return frequent_subjects[matched[0]]
    else:
      return len(set(frequent_subjects.values())) 
  else:
    return len(set(frequent_subjects.values()))
  

train_data['subject_id'] = train_data['subject'].apply(get_subject_id)
val_data['subject_id'] = val_data['subject'].apply(get_subject_id)
test_data['subject_id'] = test_data['subject'].apply(get_subject_id)

print (len(set(frequent_subjects.values())))

train_data['subject_id'].value_counts()


{'health': 0, 'tax': 1, 'immigration': 2, 'election': 3, 'education': 4, 'candidates-biography': 5, 'economy': 6, 'gun': 7, 'job': 8, 'federal-budget': 6, 'energy': 9, 'abortion': 10, 'foreign-policy': 6, 'state-budget': 6, 'crime': 11, 'gays-and-lesbians': 12, 'medicare': 0, 'terrorism': 11, 'finance': 6, 'criminal': 11, 'transportation': 13}
14


6     2103
14    1909
0     1302
1     906 
4     621 
3     569 
5     512 
2     506 
11    438 
8     409 
9     305 
7     278 
10    170 
13    127 
12    85  
Name: subject_id, dtype: int64

In [14]:
frequent_venues = {'news release' : 0, 'interview' : 1, 'press release' : 2, 
                   'speech' : 3, 'tv' : 4, 'tweet' : 5, 'campaign' : 6, 
                   'television' : 4, 'debate' : 7, 'news conference' : 8, 
                   'facebook' : 5, 'press conference' : 8, 'radio' : 9, 
                   'e-mail' : 10, 'email' : 10, 'mail' : 10, 'social media' : 5,
                   'twitter' : 5, 'blog':11, 'article':11,'comment':12, 'web':11}

print (frequent_venues)


def get_venue_id(venue):
  if isinstance(venue, str):
    matched = [ven for ven in frequent_venues if ven in venue.lower() ]
    if len(matched)>0:
      return frequent_venues[matched[0]]
    else:
      return len(set(frequent_venues.values())) 
  else:
    return len(set(frequent_venues.values()))
  

train_data['venue_id'] = train_data['venue'].apply(get_venue_id)
val_data['venue_id'] = val_data['venue'].apply(get_venue_id)
test_data['venue_id'] = test_data['venue'].apply(get_venue_id)

print (len(set(frequent_venues.values())))

train_data['venue_id'].value_counts()

{'news release': 0, 'interview': 1, 'press release': 2, 'speech': 3, 'tv': 4, 'tweet': 5, 'campaign': 6, 'television': 4, 'debate': 7, 'news conference': 8, 'facebook': 5, 'press conference': 8, 'radio': 9, 'e-mail': 10, 'email': 10, 'mail': 10, 'social media': 5, 'twitter': 5, 'blog': 11, 'article': 11, 'comment': 12, 'web': 11}
13


13    2695
1     1752
3     1059
7     735 
6     679 
4     568 
11    529 
5     473 
10    356 
2     346 
12    337 
0     324 
8     249 
9     138 
Name: venue_id, dtype: int64

In [15]:
train_data.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,...,mostly-true,pants-fire,venue,output,speaker_id,job_id,party_id,state_id,subject_id,venue_id
0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,0.0,0.0,a mailer,1,17,3,0,1,10,10
1,10540.json,half-true,When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,1.0,0.0,a floor speech.,3,17,12,1,8,8,3
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by voting to give George Bush the benefit of the doubt on Iran.""",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,163.0,9.0,Denver,4,0,1,1,5,6,13
3,1123.json,false,Health care reform legislation is likely to mandate free sex change surgeries.,health-care,blog-posting,,,none,7.0,19.0,...,5.0,44.0,a news release,1,13,12,2,16,0,0
4,9028.json,half-true,The economic turnaround started at the end of my term.,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,...,19.0,2.0,an interview on CNN,3,14,12,1,2,6,1


In [16]:
def load_statement_vocab_dict(train_data):
  vocabulary_dict = {}
  if not os.path.exists('vocabulary.p'):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_data['statement'])
    vocabulary_dict = tokenizer.word_index
    print (len(vocabulary_dict))
    pickle.dump(vocabulary_dict, open( "vocabulary.p", "wb" ))
    print ('Created Vocabulary Dictionary...')
    print ('Saved Vocabulary Dictionary...')
  else:
    print ('Loading Vocabulary Dictionary...')
    vocabulary_dict = pickle.load(open("vocabulary.p", "rb" ))
  return vocabulary_dict


def preprocess_statement(statement):
  statement = [w for w in statement.split(' ') if w not in stopwords.words('english')]
  statement = ' '.join(statement)
  text = text_to_word_sequence(statement)
  val = [0] * 10
  val = [vocabulary_dict[t] for t in text if t in vocabulary_dict] 
  return val

vocabulary_dict = load_statement_vocab_dict(train_data)
train_data['word_id'] = train_data['statement'].apply(preprocess_statement)
val_data['word_id'] = val_data['statement'].apply(preprocess_statement)
test_data['word_id'] = test_data['statement'].apply(preprocess_statement)


Loading Vocabulary Dictionary...


In [17]:
pos_tags = {'ADJ': 'adjective', 'ADP': 'adposition', 'ADV': 'adverb', 
            'AUX': 'auxiliary verb', 'CONJ': 'coordinating conjunction', 
            'DET': 'determiner', 'INTJ': 'interjection', 'NOUN': 'noun', 
            'NUM': 'numeral', 'PART': 'particle', 'PRON': 'pronoun', 
            'PROPN': 'proper noun', 'PUNCT': 'punctuation', 'X': 'other', 
            'SCONJ': 'subord conjunction', 'SYM': 'symbol', 'VERB': 'verb'}

#pos_dict = {'ADJ' : 0, 'ADP' : 1, 'ADV' : 2, 'AUX' : 3, 'CONJ' : 4, 'DET' : 5, 
#            'INTJ' : 6, 'NOUN' : 7, 'NUM' : 8, 'PART' : 9, 'PRON' : 10, 'X' : 16,
#            'PROPN' : 11, 'PUNCT' : 12, 'SCONJ' : 13, 'SYM' : 14, 'VERB' : 15}

pos_dict = {'NOUN' : 0, 'VERB' : 1, 'ADP' : 2, 'PROPN' : 3, 'PUNCT' : 4, 
            'DET' : 5, 'ADJ' : 6, 'NUM' : 7, 'ADV' : 8, 'PRON' : 9, 'X' : 9, 
            'PART' : 9, 'SYM' : 9, 'INTJ' : 9 }

dep_dict = {'ACL' : 0, 'ACOMP' : 1, 'ADVCL' : 2, 'ADVMOD' : 3, 'AGENT' : 4, 
            'AMOD' : 5, 'APPOS' : 6, 'ATTR' : 7, 'AUX' : 8, 'AUXPASS' : 9, 
            'CASE' : 10, 'CC' : 11, 'CCOMP' : 12, 'COMPOUND' : 13, 'CONJ' : 14, 
            'CSUBJ' : 15, 'CSUBJPASS' : 16, 'DATIVE' : 17, 'DEP' : 18, 
            'DET' : 19, 'DOBJ' : 20, 'EXPL' : 21, 'INTJ' : 22, 'MARK' : 23, 
            'META' : 24, 'NEG' : 25, 'NOUNMOD' : 26, 'NPMOD' : 27, 'NSUBJ' : 28, 
            'NSUBJPASS' : 29, 'NUMMOD' : 30, 'OPRD' : 31, 'PARATAXIS' : 32, 
            'PCOMP' : 33, 'POBJ' : 34, 'POSS' : 35, 'PRECONJ' : 36, 'PREDET' : 37, 
            'PREP' : 38, 'PRT' : 39, 'PUNCT' : 40, 'QUANTMOD' : 41, 
            'RELCL' : 42, 'ROOT' : 43, 'XCOMP' : 44}

def get_pos(statement):
  doc = nlp(statement)
  taglist = []
  deplist = []
  for token in doc:
    taglist.append(pos_dict.get(token.pos_,max(pos_dict.values())))
    #deplist.append(token.dep_)
  return taglist
train_data['pos_id'] = train_data['statement'].apply(get_pos)
val_data['pos_id'] = val_data['statement'].apply(get_pos)
test_data['pos_id'] = test_data['statement'].apply(get_pos)

In [18]:
dep_dict = {'ACL' : 0, 'ACOMP' : 1, 'ADVCL' : 2, 'ADVMOD' : 3, 'AGENT' : 4, 
            'AMOD' : 5, 'APPOS' : 6, 'ATTR' : 7, 'AUX' : 8, 'AUXPASS' : 9, 
            'CASE' : 10, 'CC' : 11, 'CCOMP' : 12, 'COMPOUND' : 13, 'CONJ' : 14, 
            'CSUBJ' : 15, 'CSUBJPASS' : 16, 'DATIVE' : 17, 'DEP' : 18, 
            'DET' : 19, 'DOBJ' : 20, 'EXPL' : 21, 'INTJ' : 22, 'MARK' : 23, 
            'META' : 24, 'NEG' : 25, 'NOUNMOD' : 26, 'NPMOD' : 27, 'NSUBJ' : 28, 
            'NSUBJPASS' : 29, 'NUMMOD' : 30, 'OPRD' : 31, 'PARATAXIS' : 32, 
            'PCOMP' : 33, 'POBJ' : 34, 'POSS' : 35, 'PRECONJ' : 36, 'PREDET' : 37, 
            'PREP' : 38, 'PRT' : 39, 'PUNCT' : 40, 'QUANTMOD' : 41, 
            'RELCL' : 42, 'ROOT' : 43, 'XCOMP' : 44}


dep_dict = {'punct' : 0, 'prep' : 1, 'pobj' : 2, 'compound' : 3, 'det' : 4, 
            'nsubj' : 5, 'ROOT' : 6, 'amod' : 7, 'dobj' : 8, 'aux' : 9, 
            'advmod' : 10, 'nummod' : 10, 'ccomp' : 10, 'conj' : 10, 'cc' : 10, 
            'advcl' : 10, 'poss' : 10, 'mark' : 10, 'quantmod' : 10, 'relcl' : 10, 
            'attr' : 10, 'xcomp' : 10, 'npadvmod' : 10, 'nmod' : 10, 'auxpass' : 10, 
            'acl' : 10, 'nsubjpass' : 10, 'pcomp' : 10, 'acomp' : 10, 'neg' : 10, 
            'appos' : 10, 'prt' : 10, '' : 10, 'expl' : 10, 'dative' : 10, 
            'agent' : 10, 'case' : 10, 'oprd' : 10, 'csubj' : 10, 'dep' : 10, 
            'intj' : 10, 'predet' : 10, 'parataxis' : 10, 'preconj' : 10, 
            'meta' : 10, 'csubjpass' : 10}


def get_dep_parse(statement):
  doc = nlp(statement)
  deplist = []
  for token in doc:
    deplist.append(dep_dict.get(token.dep_, max(dep_dict.values())))
  return deplist


train_data['dep_id'] = train_data['statement'].apply(get_dep_parse)
val_data['dep_id'] = val_data['statement'].apply(get_dep_parse)
test_data['dep_id'] = test_data['statement'].apply(get_dep_parse)
train_data.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,...,output,speaker_id,job_id,party_id,state_id,subject_id,venue_id,word_id,pos_id,dep_id
0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,1,17,3,0,1,10,10,"[7, 6968, 1141, 520, 621, 385, 444, 5119, 585, 1601]","[1, 5, 3, 3, 6, 0, 1, 6, 4, 0, 0, 2, 0, 4]","[6, 4, 3, 10, 7, 5, 10, 7, 0, 3, 8, 1, 2, 0]"
1,10540.json,half-true,When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,3,17,12,1,8,8,3,"[63, 2091, 964, 866, 23, 602, 1142, 315, 180, 602, 1959, 34, 310, 560, 1365, 177]","[8, 9, 5, 0, 2, 0, 0, 4, 9, 1, 8, 6, 0, 1, 2, 5, 1, 9, 1, 2, 4, 3, 3, 3, 4, 3, 0, 4]","[10, 6, 4, 5, 1, 3, 2, 0, 5, 6, 10, 7, 5, 10, 10, 5, 10, 9, 10, 1, 0, 3, 3, 2, 0, 3, 6, 0]"
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by voting to give George Bush the benefit of the doubt on Iran.""",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,4,0,1,1,5,6,13,"[127, 101, 3546, 191, 254, 20, 329, 343, 310, 166, 1093, 3547, 416]","[3, 3, 1, 2, 3, 3, 4, 2, 1, 9, 1, 3, 3, 5, 0, 2, 5, 0, 2, 3, 4, 4]","[3, 5, 6, 1, 3, 2, 0, 1, 10, 9, 10, 3, 10, 4, 8, 1, 4, 2, 1, 2, 0, 0]"
3,1123.json,false,Health care reform legislation is likely to mandate free sex change surgeries.,health-care,blog-posting,,,none,7.0,19.0,...,1,13,12,2,16,0,0,"[32, 43, 266, 298, 666, 667, 404, 467, 417, 4148]","[0, 0, 0, 0, 9, 6, 9, 1, 6, 0, 0, 0, 4]","[3, 3, 3, 5, 6, 10, 9, 10, 7, 3, 3, 8, 0]"
4,9028.json,half-true,The economic turnaround started at the end of my term.,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,...,3,14,12,1,2,6,1,"[1, 325, 4149, 602, 408, 505]","[5, 6, 0, 1, 2, 5, 0, 2, 5, 0, 4]","[4, 7, 5, 6, 1, 4, 2, 1, 10, 2, 0]"


In [19]:
embeddings = {}
with open("glove.6B.100d.txt", "br") as file_object:
  for line in file_object:
    word_embed = line.split()
    word = word_embed[0]
    embed = np.array(word_embed[1:], dtype="float32")
    embeddings[word.lower()]= embed

EMBED_DIM = 100
print (len(embeddings), " : Word Embeddings Found")
print (len(embeddings[word]), " : Embedding Dimension")


num_words = len(vocabulary_dict) + 1
embedding_matrix = np.zeros((num_words, EMBED_DIM))
for word, i in vocabulary_dict.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embeddings_index = None



pos_embeddings = np.identity(max(pos_dict.values()), dtype=int)
dep_embeddings = np.identity(max(dep_dict.values()), dtype=int)

400000  : Word Embeddings Found
100  : Embedding Dimension


In [20]:

vocab_length = len(vocabulary_dict.keys())
hidden_size = EMBED_DIM #Has to be same as EMBED_DIM
lstm_size = 100
num_steps = 15
num_epochs = 30
batch_size = 40
kernel_sizes = [3,3,3]
filter_size = 128
num_party = len(train_data.party_id.unique())
num_state = len(train_data.state_id.unique())
num_venue = len(train_data.venue_id.unique())
num_job = len(train_data.job_id.unique())
num_sub = len(train_data.subject_id.unique())
num_speaker = len(train_data.speaker_id.unique())
print (num_party)
print (num_state)
print (num_venue)
print (num_job)
print (num_sub)
print (num_speaker)
print (train_data.columns)

6
17
14
12
15
18
Index(['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
       'party', 'barely-true', 'false', 'half-true', 'mostly-true',
       'pants-fire', 'venue', 'output', 'speaker_id', 'job_id', 'party_id',
       'state_id', 'subject_id', 'venue_id', 'word_id', 'pos_id', 'dep_id'],
      dtype='object')


In [21]:
X_train = train_data['word_id']
X_val = val_data['word_id']
X_test = test_data['word_id']

Y_train = train_data['output']
Y_train = tensorflow.keras.utils.to_categorical(Y_train, num_classes=6)

Y_val = val_data['output']
Y_val = tensorflow.keras.utils.to_categorical(Y_val, num_classes=6)

X_train = sequence.pad_sequences(X_train, maxlen=num_steps, padding='post',truncating='post')
X_val = sequence.pad_sequences(X_val, maxlen=num_steps, padding='post',truncating='post')
X_test = sequence.pad_sequences(X_test, maxlen=num_steps, padding='post',truncating='post')

X_train_pos = train_data['pos_id']
X_val_pos = val_data['pos_id']
X_test_pos = test_data['pos_id']

X_train_pos = sequence.pad_sequences(X_train_pos, maxlen=num_steps, padding='post',truncating='post')
X_val_pos = sequence.pad_sequences(X_val_pos, maxlen=num_steps, padding='post',truncating='post')
X_test_pos = sequence.pad_sequences(X_test_pos, maxlen=num_steps, padding='post',truncating='post')


X_train_dep = train_data['dep_id']
X_val_dep = val_data['dep_id']
X_test_dep = test_data['dep_id']

X_train_dep = sequence.pad_sequences(X_train_dep, maxlen=num_steps, padding='post',truncating='post')
X_val_dep = sequence.pad_sequences(X_val_dep, maxlen=num_steps, padding='post',truncating='post')
X_test_dep = sequence.pad_sequences(X_test_dep, maxlen=num_steps, padding='post',truncating='post')



In [22]:
num_job=13
#Meta data preparation
job_train = tensorflow.keras.utils.to_categorical(train_data['job_id'], num_classes=num_job)
party_train = tensorflow.keras.utils.to_categorical(train_data['party_id'], num_classes=num_party)
state_train = tensorflow.keras.utils.to_categorical(train_data['state_id'], num_classes=num_state)
venue_train = tensorflow.keras.utils.to_categorical(train_data['venue_id'], num_classes=num_venue)


subject_train = tensorflow.keras.utils.to_categorical(train_data['subject_id'], num_classes=num_sub)
speaker_train = tensorflow.keras.utils.to_categorical(train_data['speaker_id'], num_classes=num_speaker)
#X_train_meta = party_train
X_train_meta = np.hstack((party_train, state_train, venue_train, job_train, subject_train, speaker_train))

party_val = tensorflow.keras.utils.to_categorical(val_data['party_id'], num_classes=num_party)
state_val = tensorflow.keras.utils.to_categorical(val_data['state_id'], num_classes=num_state)
venue_val = tensorflow.keras.utils.to_categorical(val_data['venue_id'], num_classes=num_venue)
job_val = tensorflow.keras.utils.to_categorical(val_data['job_id'], num_classes=num_job)
subject_val = tensorflow.keras.utils.to_categorical(val_data['subject_id'], num_classes=num_sub)
speaker_val = tensorflow.keras.utils.to_categorical(val_data['speaker_id'], num_classes=num_speaker)

#X_val_meta = party_val
X_val_meta = np.hstack((party_val, state_val, venue_val, job_val, subject_val, speaker_val))

party_test = tensorflow.keras.utils.to_categorical(test_data['party_id'], num_classes=num_party)
state_test = tensorflow.keras.utils.to_categorical(test_data['state_id'], num_classes=num_state)
venue_test = tensorflow.keras.utils.to_categorical(test_data['venue_id'], num_classes=num_venue)
job_test = tensorflow.keras.utils.to_categorical(test_data['job_id'], num_classes=num_job)
subject_test = tensorflow.keras.utils.to_categorical(test_data['subject_id'], num_classes=num_sub)
speaker_test = tensorflow.keras.utils.to_categorical(test_data['speaker_id'], num_classes=num_speaker)

#X_test_meta = party_test
X_test_meta = np.hstack((party_test, state_test, venue_test, job_test, subject_test, speaker_test))

In [23]:
print (X_train_meta.shape, X_val_meta.shape, X_test_meta.shape)
print (X_train.shape, X_val.shape, X_test.shape)
print (X_train_pos.shape, X_val_pos.shape, X_test_pos.shape)
print (X_train_dep.shape, X_val_dep.shape, X_test_dep.shape)
print (Y_train.shape, Y_val.shape)

print (X_train_dep, X_val_dep, X_test_dep)


(10240, 83) (1284, 83) (1267, 83)
(10240, 15) (1284, 15) (1267, 15)
(10240, 15) (1284, 15) (1267, 15)
(10240, 15) (1284, 15) (1267, 15)
(10240, 6) (1284, 6)
[[ 6  4  3 ...  2  0  0]
 [10  6  4 ...  5 10 10]
 [ 3  5  6 ... 10  4  8]
 ...
 [ 6  4  5 ... 10  0  9]
 [ 6 10  4 ...  0  0  0]
 [ 4  5  1 ...  3  8  1]] [[ 5  6  7 ...  0  0  0]
 [10 10 10 ...  3  8  0]
 [ 6 10  8 ... 10  1  4]
 ...
 [ 3  5 10 ...  3  2  0]
 [ 4  7  5 ... 10  3  8]
 [ 4  5  6 ...  4  3  2]] [[10  4  8 ...  0  0  0]
 [ 5  6  1 ...  0  0  0]
 [ 6  3  5 ...  0  0  0]
 ...
 [ 1  4  7 ...  7  5 10]
 [ 6  4  3 ... 10  5 10]
 [ 6  4  5 ...  0  5  9]]


In [24]:
vocab_length = len(vocabulary_dict.keys())
hidden_size = EMBED_DIM #Has to be same as EMBED_DIM
lstm_size = 100
num_steps = 15
num_epochs = 30
batch_size = 40

#Hyperparams for CNN
kernel_sizes = [3,3,3]
filter_size = 128

#Meta data related hyper params
num_party = len(train_data.party_id.unique())
num_state = len(train_data.state_id.unique())
num_venue = len(train_data.venue_id.unique())
num_job = len(train_data.job_id.unique())
num_sub = len(train_data.subject_id.unique())
num_speaker = len(train_data.speaker_id.unique())
print (num_party)
print (num_state)
print (num_venue)
print (num_job)
print (num_sub)
print (num_speaker)
print (train_data.columns)

6
17
14
12
15
18
Index(['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
       'party', 'barely-true', 'false', 'half-true', 'mostly-true',
       'pants-fire', 'venue', 'output', 'speaker_id', 'job_id', 'party_id',
       'state_id', 'subject_id', 'venue_id', 'word_id', 'pos_id', 'dep_id'],
      dtype='object')


In [25]:
kernel_stmt = []
kernel_pos = []
kernel_dep = []

use_pos=False
use_meta=True
use_dep=True

statement_input = Input(shape=(num_steps,), dtype='int32', name='main_input')
x_stmt = Embedding(vocab_length+1,EMBED_DIM,input_length=num_steps)(statement_input) 

# pos embed LSTM
pos_input = Input(shape=(num_steps,), dtype='int32', name='pos_input')
x_pos = Embedding(max(pos_dict.values()), max(pos_dict.values()), input_length=num_steps)(pos_input)

# dep embed LSTM
dep_input = Input(shape=(num_steps,), dtype='int32', name='dep_input')
x_dep = Embedding(max(dep_dict.values()), max(dep_dict.values()), input_length=num_steps,)(dep_input)


for kernel in kernel_sizes:
    print(filter_size)
    x_1 = Conv1D(filters=filter_size,kernel_size=kernel)(x_stmt)
    x_1 = GlobalMaxPool1D()(x_1)
    kernel_stmt.append(x_1)
    
    x_2 = Conv1D(filters=filter_size,kernel_size=kernel)(x_pos)
    x_2 = GlobalMaxPool1D()(x_2)
    kernel_pos.append(x_2)
    
    x_3 = Conv1D(filters=filter_size,kernel_size=kernel)(x_dep)
    x_3 = GlobalMaxPool1D()(x_3)
    kernel_dep.append(x_3)
    
conv_in1 = tensorflow.keras.layers.concatenate(kernel_stmt)
conv_in1 = Dropout(0.6)(conv_in1)
conv_in1 = Dense(128, activation='relu')(conv_in1)

conv_in2 = tensorflow.keras.layers.concatenate(kernel_pos)
conv_in2 = Dropout(0.6)(conv_in2)
conv_in2 = Dense(128, activation='relu')(conv_in2)

conv_in3 = tensorflow.keras.layers.concatenate(kernel_dep)
conv_in3 = Dropout(0.6)(conv_in3)
conv_in3 = Dense(128, activation='relu')(conv_in3)

# meta data
meta_input = Input(shape=(X_train_meta.shape[1],), name='aux_input')
x_meta = Dense(64, activation='relu')(meta_input)

if use_pos and use_meta:
  if use_dep:
    x = tensorflow.keras.layers.concatenate([conv_in1, conv_in2, conv_in3, x_meta])
  else:
    x = tensorflow.keras.layers.concatenate([conv_in1, conv_in2, x_meta])
elif use_meta:
  if use_dep:
    x = tensorflow.keras.layers.concatenate([conv_in1, conv_in3, x_meta])
  else:
    x = tensorflow.keras.layers.concatenate([conv_in1, x_meta])
elif use_pos:
  if use_dep:
    x = tensorflow.keras.layers.concatenate([conv_in1, conv_in2, conv_in3])
  else:
    x = tensorflow.keras.layers.concatenate([conv_in1, conv_in2])
else:
  if use_dep:
    x = tensorflow.keras.layers.concatenate([conv_in1, conv_in3])
  else:
    x = conv_in1



main_output = Dense(6, activation='softmax', name='main_output')(x)

if use_pos and use_meta:
  if use_dep:
    model_cnn = Model(inputs=[statement_input, pos_input, dep_input, meta_input], outputs=[main_output])
  else:
    model_cnn = Model(inputs=[statement_input, pos_input, meta_input], outputs=[main_output])
elif use_meta:
  if use_dep:
    model_cnn = Model(inputs=[statement_input, dep_input, meta_input], outputs=[main_output])
  else:
    model_cnn = Model(inputs=[statement_input, meta_input], outputs=[main_output])
elif use_pos:
  if use_dep:
    model_cnn = Model(inputs=[statement_input, pos_input, dep_input], outputs=[main_output])
  else:
    model_cnn = Model(inputs=[statement_input, pos_input], outputs=[main_output])
else:
  if use_dep:
    model_cnn = Model(inputs=[statement_input, dep_input], outputs=[main_output])
  else:
    model_cnn = Model(inputs=[statement_input], outputs=[main_output])
    


128
128
128


In [26]:
print (model_cnn.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         [(None, 15)]         0                                            
__________________________________________________________________________________________________
dep_input (InputLayer)          [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 15, 100)      1240900     main_input[0][0]                 
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 15, 10)       100         dep_input[0][0]                  
______________________________________________________________________________________________

In [54]:
def train(model, name, use_pos=False, use_meta=False, use_dep=False):
  sgd = optimizers.SGD(lr=0.025, clipvalue=0.3, nesterov=True)
  adam = optimizers.Adam(lr=0.000075, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
  model.compile(optimizer=sgd,loss='categorical_crossentropy',metrics=['categorical_accuracy'])
  tb = TensorBoard()
  csv_logger = tensorflow.keras.callbacks.CSVLogger('training.log')
  filepath= name+"_weights_best.hdf5"
  checkpoint = tensorflow.keras.callbacks.ModelCheckpoint(filepath, monitor='val_categorical_accuracy', 
                                             verbose=1, save_best_only=True, mode='max')
  if use_pos and use_meta:
    if use_dep:
      print('hi1')
      model.fit(
        {'main_input': X_train, 'pos_input': X_train_pos, 
         'aux_input': X_train_meta, 'dep_input': X_train_dep},
        {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
        validation_data = (
            {'main_input': X_val, 'pos_input': X_val_pos, 
             'aux_input': X_val_meta, 'dep_input' : X_val_dep},
            {'main_output': Y_val}
        ), callbacks=[tb,csv_logger,checkpoint])
    else:
      print('hi2')
      model.fit(
        {'main_input': X_train, 'pos_input': X_train_pos, 'aux_input': X_train_meta},
        {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
        validation_data = ({'main_input': X_val, 'pos_input': X_val_pos, 'aux_input': X_val_meta},
            {'main_output': Y_val}), callbacks=[tb,csv_logger,checkpoint])
  elif use_meta:
    if use_dep:
      print('hi3')
      model.fit(
        {'main_input': X_train, 'aux_input': X_train_meta,'dep_input':X_train_dep},
        {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
        validation_data = (
            {'main_input': X_val, 'aux_input': X_val_meta, 'dep_input': X_val_dep},
            {'main_output': Y_val}
        ), callbacks=[tb,csv_logger,checkpoint])
    else:
      model.fit(
        {'main_input': X_train, 'aux_input': X_train_meta},
        {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
        validation_data = (
            {'main_input': X_val, 'aux_input': X_val_meta},
            {'main_output': Y_val}
        ), callbacks=[tb,csv_logger,checkpoint])
  elif use_pos:
    if use_dep:
      
      model.fit(
        {'main_input': X_train, 'pos_input': X_train_pos,'dep_input':X_train_dep},
        {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size, validation_data = (
            {'main_input': X_val, 'pos_input': X_val_pos, 'dep_input':X_val_dep},
            {'main_output': Y_val}
        ), callbacks=[tb,csv_logger,checkpoint])
    else:
      model.fit(
        {'main_input': X_train, 'pos_input': X_train_pos},
        {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
        validation_data = (
            {'main_input': X_val, 'pos_input': X_val_pos},
            {'main_output': Y_val}
        ), callbacks=[tb,csv_logger,checkpoint])
  else:
    if use_dep:
      model.fit({'main_input': X_train,'dep_input':X_train_dep},
        {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
        validation_data = (
            {'main_input': X_val, 'dep_input':X_val_dep},
            {'main_output': Y_val}
        ), callbacks=[tb,csv_logger,checkpoint])
    else:
      model.fit( {'main_input': X_train},{'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size, 
                validation_data = ({'main_input': X_val},{'main_output': Y_val}), callbacks=[tb,csv_logger,checkpoint])
    
  
  
def evaluate(name, use_pos=False, use_meta=False, use_dep=False):
  model1 = load_model(name+'_weights_best.hdf5')
  if use_pos and use_meta:
    if use_dep:
      preds = model1.predict([X_test,X_test_pos, X_test_dep, X_test_meta], batch_size=batch_size, verbose=1)
    else:
      preds = model1.predict([X_test,X_test_pos, X_test_meta], batch_size=batch_size, verbose=1)
  elif use_meta:
    if use_dep:
      preds = model1.predict([X_test, X_test_dep, X_test_meta], batch_size=batch_size, verbose=1)
    else:
      preds = model1.predict([X_test, X_test_meta], batch_size=batch_size, verbose=1)
  elif use_pos:
    if use_dep:
      preds = model1.predict([X_test, X_test_pos, X_test_dep], batch_size=batch_size, verbose=1)
    else:
      preds = model1.predict([X_test, X_test_pos], batch_size=batch_size, verbose=1)
  else:
    if use_dep:
      preds = model1.predict([X_test, X_test_dep], batch_size=batch_size, verbose=1)
    else:
      preds = model1.predict([X_test], batch_size=batch_size, verbose=1)
    
  false_worst = {}
  true_best = {}
  label_list = ['pants-fire','false','barely-true','half-true','mostly-true','true']

  Y_test_gt = list(test_data['output'])
  predictions = np.array([np.argmax(pred) for pred in preds])
  
  for p in range(len(preds)):
    if np.argmax(preds[p])==0:
      false_worst[p]=preds[p][0]
    elif np.argmax(preds[p])==5:
      true_best[p]=preds[p][5]
      
  print (len(predictions)==len(Y_test_gt))
  correct = np.sum(predictions == Y_test_gt)
  print ("Correctly Predicted : ", correct,"/",len(Y_test_gt))
  print ("Accuracy : ", correct*100.0/len(Y_test_gt))
  pickle.dump(predictions, open(name+'_predictions.p','wb'))
  return (false_worst, true_best)

In [55]:
train(model_cnn,'cnn', use_pos, use_meta, use_dep)

hi3
Train on 10240 samples, validate on 1284 samples
Epoch 1/30


InvalidArgumentError:  indices[0,4] = 10 is not in [0, 10)
	 [[node model/embedding_2/embedding_lookup (defined at <ipython-input-54-57c88bb5c904>:38) ]] [Op:__inference_distributed_function_20003]

Errors may have originated from an input operation.
Input Source operations connected to node model/embedding_2/embedding_lookup:
 model/embedding_2/embedding_lookup/19444 (defined at c:\users\naik9\appdata\local\programs\python\python37\lib\contextlib.py:112)

Function call stack:
distributed_function


In [None]:
(fw, tb) = evaluate('cnn', use_pos, use_meta, use_dep)
print_best_false_true_predicted(fw, tb)