In [1]:
#Imports
import numpy as np
import pandas as pd
import json
import re

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from gensim.models import Doc2Vec
import gensim
from gensim.models.doc2vec import TaggedDocument



In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

import re
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'gensim'

In [2]:
#Load bill status data to obtain the status

with open('..\\Data\\107th-112th Congress\\Bill status\\HR_bill_status_contemporary.json') as f:
    HR_data1= json.load(f)

with open('..\\Data\\107th-112th Congress\\Bill status\\Sen_bill_status_contemporary.json') as f:
    Sen_data1= json.load(f)
    
with open('..\\Data\\113th-114th Congress\\Bill status\\HR_bill_status_modern.json') as f:
    HR_data2= json.load(f)

with open('..\\Data\\113th-114th Congress\\Bill status\\Sen_bill_status_modern.json') as f:
    Sen_data2= json.load(f)    
    

In [3]:
def extract_id_status(bill_list):
    v = {}
    for b in bill_list:
        v.update({b['bill_id']:b['status']})
    return v

In [4]:
#Bill id and status

d1 = extract_id_status(HR_data1)
d2 = extract_id_status(Sen_data1)
d3 = extract_id_status(HR_data2)
d4 = extract_id_status(Sen_data2)

#Concatenate dictionaries into one
status_dict = {}
for d in (d1,d2,d3,d4):
    status_dict.update(d)

In [5]:
status_dict

{'hr1-107': 'ENACTED:SIGNED',
 'hr10-107': 'ENACTED:SIGNED',
 'hr100-107': 'PASS_OVER:HOUSE',
 'hr1000-107': 'ENACTED:SIGNED',
 'hr1001-107': 'REFERRED',
 'hr1002-107': 'REFERRED',
 'hr1003-107': 'REFERRED',
 'hr1004-107': 'REFERRED',
 'hr1005-107': 'REFERRED',
 'hr1006-107': 'REFERRED',
 'hr1007-107': 'REPORTED',
 'hr1008-107': 'REFERRED',
 'hr1009-107': 'PASS_OVER:HOUSE',
 'hr101-107': 'REFERRED',
 'hr1010-107': 'REFERRED',
 'hr1011-107': 'REFERRED',
 'hr1012-107': 'REFERRED',
 'hr1013-107': 'REFERRED',
 'hr1014-107': 'REFERRED',
 'hr1015-107': 'REFERRED',
 'hr1016-107': 'REFERRED',
 'hr1017-107': 'REFERRED',
 'hr1018-107': 'REFERRED',
 'hr1019-107': 'REFERRED',
 'hr102-107': 'REFERRED',
 'hr1020-107': 'REPORTED',
 'hr1021-107': 'REFERRED',
 'hr1022-107': 'REPORTED',
 'hr1023-107': 'REFERRED',
 'hr1024-107': 'REFERRED',
 'hr1025-107': 'REFERRED',
 'hr1026-107': 'REFERRED',
 'hr1027-107': 'REFERRED',
 'hr1028-107': 'REFERRED',
 'hr1029-107': 'REFERRED',
 'hr103-107': 'REFERRED',
 'hr1

In [6]:
#Create dataframe from status_dict
status_df = pd.DataFrame.from_dict(status_dict,orient='index',columns=['Bill Status']).reset_index(level=0)

In [7]:
status_df.head()

Unnamed: 0,index,Bill Status
0,hr1-107,ENACTED:SIGNED
1,hr10-107,ENACTED:SIGNED
2,hr100-107,PASS_OVER:HOUSE
3,hr1000-107,ENACTED:SIGNED
4,hr1001-107,REFERRED


In [8]:
#Map status values to binary

#Once a Congress adjourn at the end of its two-year cycle, all bills that have been introduced in either
#the House or the Senate that have not made it through the entire legislative process and signed into law are dead.

#dictionary which specifies status value to binary
#0 = did not pass in originating chamber (firs pass)
#1 = did pass in originating chamber (first pass)
#survive committee in the originating chamber (doing this for more class balance)

status_binary_dict = {
    'INTRODUCED':0,
    'REFERRED':0,
    'REPORTED':1,
    'PROV_KILL:SUSPENSIONFAILED':1,
    'PROV_KILL:CLOTUREFAILED':1,
    'FAIL:ORIGINATING:HOUSE':1,
    'FAIL:ORIGINATING:SENATE':1,
    'PASSED:SIMPLERES':1,
    'PASSED:CONSTAMEND':1,
    'PASS_OVER:HOUSE':1,
    'PASS_OVER:SENATE':1,
    'PASSED:CONCURRENTRES':1,
    'FAIL:SECOND:HOUSE':1,
    'FAIL:SECOND:SENATE':1,
    'PASS_BACK:HOUSE':1,
    'PASS_BACK:SENATE':1,
    'PROV_KILL:PINGPONGFAIL':1,
    'PASSED:BILL':1,
    'CONFERENCE:PASSED:HOUSE':1,
    'CONFERENCE:PASSED:SENATE':1,
    'ENACTED:SIGNED':1,
    'PROV_KILL:VETO':1,
    'VETOED:POCKET':1,
    'VETOED:OVERRIDE_FAIL_ORIGINATING:HOUSE':1,
    'VETOED:OVERRIDE_FAIL_ORIGINATING:SENATE':1,
    'VETOED:OVERRIDE_PASS_OVER:HOUSE':1,
    'VETOED:OVERRIDE_PASS_OVER:SENATE':1,
    'VETOED:OVERRIDE_FAIL_SECOND:HOUSE':1,
    'VETOED:OVERRIDE_FAIL_SECOND:SENATE':1,
    'ENACTED:VETO_OVERRIDE':1,
    'ENACTED:TENDAYRULE':1,
    
}


In [9]:
#Use status_binary_dict to map values in dataframe
status_df["Bill Status"].replace(status_binary_dict, inplace=True)

In [10]:
status_df.head()

Unnamed: 0,index,Bill Status
0,hr1-107,1
1,hr10-107,1
2,hr100-107,1
3,hr1000-107,1
4,hr1001-107,0


In [11]:
status_df.shape

(79089, 2)

In [12]:
status_df['Bill Status'].unique()

array([1, 0], dtype=int64)

In [13]:
status_df['Bill Status'].sum()

11395

In [14]:
#Load bill text data

with open('..\\Data\\107th-112th Congress\\Bill text\\HR_text_to_114.json') as f:
    HR_data= json.load(f)
    
with open('..\\Data\\107th-112th Congress\\Bill text\\Sen_text_to_114.json') as f:
    Sen_data= json.load(f)

In [15]:
#Merge dicts into one
HR_data.update(Sen_data)

In [16]:
len(HR_data)

77565

In [17]:
#Text Preprocessing
def clean_text(text):
    #Remove underscores
    text = re.sub('\_','',text)
    return text

#Remove extended ellipses


In [18]:
clean_bill_text= {k:clean_text(v) for k,v in HR_data.items()}


In [19]:
#Read clustered data to dataframe
text_df = pd.DataFrame.from_dict(clean_bill_text,orient='index', columns=['Text'])
text_df.reset_index(level=0,inplace=True)

In [20]:
text_df.head()

Unnamed: 0,index,Text
0,107hr1ih,a bill to close the achievement gap with acco...
1,107hr10ih,"to provide for pension reform, and for other ..."
2,107hr100ih,to establish and expand programs relating to ...
3,107hr1000ih,to adjust the boundary of the william howard ...
4,107hr1001ih,to amend title xix of the social security act...


In [21]:
def rename_id(i):
    j=i[:-2]
    k = j[3:]+'-'+j[:3]
    return k

In [22]:
#Rename bill id from dataframe so that it matches dictionary of id and status

text_df['index']=text_df['index'].apply(lambda x: rename_id(x))

In [23]:
text_df.tail()

Unnamed: 0,index,Text
77560,s995-114,to establish congressional trade negotiating ...
77561,s996-114,to facilitate nationwide availability of volu...
77562,s997-114,to extend the authorization for the major med...
77563,s998-114,to establish a process for the consideration ...
77564,s999-114,to amend the small business act to provide fo...


In [24]:
#merge datasets

merged_text = pd.merge(text_df,status_df, on='index',how='inner')

In [25]:
merged_text.shape

(77564, 3)

In [26]:
text_df.shape

(77565, 2)

In [27]:
merged_text.head()

Unnamed: 0,index,Text,Bill Status
0,hr1-107,a bill to close the achievement gap with acco...,1
1,hr10-107,"to provide for pension reform, and for other ...",1
2,hr100-107,to establish and expand programs relating to ...,1
3,hr1000-107,to adjust the boundary of the william howard ...,1
4,hr1001-107,to amend title xix of the social security act...,0


# Bag of Words

In [28]:
X = merged_text['Text']
y = merged_text['Bill Status']

In [33]:
#get the text column 
documents=merged_text['Text'].tolist()
 

# Create a corpus

In [38]:
#Need to remove stop words and tokenize

# split into words
from nltk.tokenize import word_tokenize
tokens =[word_tokenize(doc) for doc in documents]

# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

In [None]:
print(words[:100])

In [34]:
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
          for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

In [37]:
print(texts)

In [30]:
#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
cv=CountVectorizer(max_df=0.85,stop_words='english',token_pattern=r'\b[^\d\W]+\b',analyzer='word')
word_count_vector=cv.fit_transform(docs)

In [106]:
# Create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\b[^\d\W]+\b', stop_words='english')

count_vect.fit(X)
# examine the fitted vocabulary
count_vect.get_feature_names()

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaas',
 'aace',
 'aaces',
 'aacn',
 'aadt',
 'aafp',
 'aageson',
 'aahca',
 'aai',
 'aaj',
 'aak',
 'aamodt',
 'aan',
 'aand',
 'aaniih',
 'aaniiih',
 'aap',
 'aapcc',
 'aapi',
 'aapis',
 'aar',
 'aarhus',
 'aaron',
 'aarp',
 'aas',
 'aasf',
 'aasia',
 'aatcc',
 'aau',
 'aawv',
 'ab',
 'aba',
 'ababa',
 'abacha',
 'abad',
 'abalone',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abandonments',
 'abandons',
 'abate',
 'abated',
 'abatement',
 'abatements',
 'abates',
 'abating',
 'abbas',
 'abbasid',
 'abbeville',
 'abbey',
 'abbiatico',
 'abbot',
 'abbots',
 'abbott',
 'abbottabad',
 'abbreviate',
 'abbreviated',
 'abbreviation',
 'abbreviations',
 'abc',
 'abcorps',
 'abcs',
 'abd',
 'abdel',
 'abdicate',
 'abdicated',
 'abdicating',
 'abdication',
 'abdomen',
 'abdominal',
 'abdominalis',
 'abdoul',
 'abduct',
 'abducted',
 'abductees',
 'abducting',
 'abduction',
 'abductions',
 'abductor',
 'abductors',
 'abducts',
 'abdul',
 'abdula

In [110]:
# Stratified Train/Test split

stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.3)
for train_index, test_index in stratified_split.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# transform matrix of plots into lists to pass to a TfidfVectorizer
train_x = [x[0].strip() for x in x_train.tolist()]
test_x = [x[0].strip() for x in x_test.tolist()]

In [112]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(test_x)
print(xtrain_count)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{4,}', max_features=5000, stop_words='english')
tfidf_vect.fit(X)

xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(test_x)




In [31]:

from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# dct = Dictionary(X)  # fit dictionary
# corpus = [dct.doc2bow(line) for line in X]  # convert corpus to BoW format
# model = TfidfModel(corpus)  # fit model
# vector = model[corpus[0]]  # apply model to the first corpus document


# How to create a dictionary from a list of sentences?
# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in docs]

# Create dictionary
dictionary = Dictionary(texts)

# Get information about the dictionary
print(dictionary)


Dictionary(410351 unique tokens: ['$1,000,000', '$1,470,000,000', '$10,000', '$10,000,000', "$10,000,000''."]...)


In [32]:
corpus = [dictionary.doc2bow(line) for line in X]  # convert corpus to BoW format
model = TfidfModel(corpus)  # fit model
vector = model[corpus[0]]  # apply model to the first corpus document

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [None]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
documents = [tokenize(reuters.raw(file_id)) for file_id in reuters.fileids()]
dictionary = Dictionary(documents)
 

tfidf_model = TfidfModel([dictionary.doc2bow(d) for d in documents], id2word=dictionary)
tfidf_values = dict(tfidf_model[dictionary.doc2bow(tokenize(reuters.raw('test/14829')))])

In [None]:
#Count Vectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

In [None]:
#TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
#Naive Bayes

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-2, 1e-3)
}

In [None]:
from sklearn.cross_validation import train_test_split
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3)

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    return metrics.accuracy_score(predictions, valid_y)

In [None]:
#Naive Bayes

# Naive Bayes on Count Vectors
accuracy = train_model(MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, Word TF-IDF: ", accuracy)

In [16]:
t='the amendments made_by this section_ shall apply.to dispositions..after the date .... of the enactment of this act. ", "107hr1236ih": " to amend the tariff suspension and trade act of 2000 to provide for the permanent designation of the san antonio international airport as an airport at which certain private aircraft arriving in the united states may land for processing. be it enacted by the senate and house of representatives of the united states of america in congress assembled, section 1. designation of san antonio international airport for customs processing of certain private aircraft arriving in the united states. section 1453 of the tariff suspension and trade act of 2000 is amended by striking for the 2-year period beginning on the date of the enactment of this act, the'' and inserting the''. ", "107hr1237ih": " to designate certain lands in the valley forge national historical park as the valley forge national cemetery. be it enacted by the senate and house of representatives of the united states of america in congress assembled, section 1. designation of lands as valley forge national cemetery.  in general. not more than 200 acres of land located within the valley forge national historical park on the day before the date of the enactment of this act are hereby designated as the valley forge national cemetery. administrative jurisdiction over such lands is hereby transferred to the secretary of veterans affairs and such lands shall be administered as a national cemetery in accordance with chapter 24 of title 38, united states code (relating to national cemeteries and memorials).  adjustment of park boundaries. subsection  of section 2 of the act entitled an act to authorize the secretary of the interior to establish the valley forge national historical park in the commonwealth of pennsylvania, and for other purposes'' (16 u.s.c. 410aa-1) is amended by striking map entitled valley forge national historical park, dated june 1979, and numbered vf-91,001 and inserting map entitled valley forge national historical park, dated ____, and numbered ____'

In [19]:
re.sub('\.{2,}','',t)

'the amendments made_by this section_ shall apply.to dispositionsafter the date  of the enactment of this act. ", "107hr1236ih": " to amend the tariff suspension and trade act of 2000 to provide for the permanent designation of the san antonio international airport as an airport at which certain private aircraft arriving in the united states may land for processing. be it enacted by the senate and house of representatives of the united states of america in congress assembled, section 1. designation of san antonio international airport for customs processing of certain private aircraft arriving in the united states. section 1453 of the tariff suspension and trade act of 2000 is amended by striking for the 2-year period beginning on the date of the enactment of this act, the and inserting the. ", "107hr1237ih": " to designate certain lands in the valley forge national historical park as the valley forge national cemetery. be it enacted by the senate and house of representatives of the un

In [None]:
#Read metadata to obtain the class labels






In [None]:
#Remove stop words


In [None]:
#Stemming


In [None]:
# Stratified Train/Test split


stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.33)
for train_index, test_index in stratified_split.split(data_x, data_y):
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

# transform matrix of plots into lists to pass to a TfidfVectorizer
train_x = [x[0].strip() for x in x_train.tolist()]
test_x = [x[0].strip() for x in x_test.tolist()]

# Bag of Words model
***

In [None]:
#Count Vectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

In [None]:
#TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
#Naive Bayes

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-2, 1e-3)
}

In [None]:
#Logistic Regression

clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
...                      ('tfidf', TfidfTransformer()),
...                      ('clf', MultinomialNB()),
... ])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [None]:
#Performance on test set

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

In [None]:
#Linear SVM


>>> text_clf_svm = Pipeline([('vect', CountVectorizer()),
...                      ('tfidf', TfidfTransformer()),
...                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
...                                            alpha=1e-3, n_iter=5, random_state=42)),
... ])
>>> _ = text_clf_svm.fit(twenty_train.data, twenty_train.target)
>>> predicted_svm = text_clf_svm.predict(twenty_test.data)
>>> np.mean(predicted_svm == twenty_test.target)

In [None]:
#Grid Search

>>> parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
...               'tfidf__use_idf': (True, False),
...               'clf__alpha': (1e-2, 1e-3),
... }

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

gs_clf.best_score_
gs_clf.best_params_



In [None]:
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(train_x, train_y)

print
print("Best parameters set:")
print grid_search_tune.best_estimator_.steps
print

# measuring performance on test set
print "Applying best classifier on test data:"
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_x)

print classification_report(test_y, predictions, target_names=genres)

In [None]:
>>> from sklearn.model_selection import GridSearchCV
>>> parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
...               'tfidf__use_idf': (True, False),
...               'clf-svm__alpha': (1e-2, 1e-3),
... }
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)
gs_clf_svm.best_score_
gs_clf_svm.best_params_

In [None]:
#Performance






# Word embeddings model
***

In [None]:
#Train a word embedding network

In [1]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [2]:
cores

12

In [None]:
#Build a vocabulary

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

In [None]:
#Train doc2vec

%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [None]:
#Build final feature vector for classifier

def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressorsdef vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [None]:
#Train Logistic Regression Classifier

y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



In [None]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('data/wiki-news-300d-1M.vec')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Word Embedding + Convolutional Neural Network
***

In [None]:
class CNN(nn.Module):

    def __init__(self, batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, weights):

        super(CNN, self).__init__()


		"""

		Arguments

		---------

		batch_size : Size of each batch which is same as the batch_size of the data returned by the TorchText BucketIterator

		output_size : 2 = (pos, neg)

		in_channels : Number of input channels. Here it is 1 as the input data has dimension = (batch_size, num_seq, embedding_length)

		out_channels : Number of output channels after convolution operation performed on the input matrix

		kernel_heights : A list consisting of 3 different kernel_heights. Convolution will be performed 3 times and finally results from each kernel_height will be concatenated.

		keep_probab : Probability of retaining an activation node during dropout operation

		vocab_size : Size of the vocabulary containing unique words

		embedding_length : Embedding dimension of GloVe word embeddings

		weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table

		--------

		

		"""

        self.batch_size = batch_size
        self.output_size = output_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_heights = kernel_heights
        self.stride = stride
        self.padding = padding
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length


        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        self.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_heights[0], embedding_length), stride, padding)
        self.conv2 = nn.Conv2d(in_channels, out_channels, (kernel_heights[1], embedding_length), stride, padding)
        self.conv3 = nn.Conv2d(in_channels, out_channels, (kernel_heights[2], embedding_length), stride, padding)
        self.dropout = nn.Dropout(keep_probab)
        self.label = nn.Linear(len(kernel_heights)*out_channels, output_size)

    def conv_block(self, input, conv_layer):

        conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1)
        activation = F.relu(conv_out.squeeze(3))# activation.size() = (batch_size, out_channels, dim1)
        max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)# maxpool_out.size() = (batch_size, out_channels)

        return max_out


    def forward(self, input_sentences, batch_size=None):


		"""

		The idea of the Convolutional Neural Netwok for Text Classification is very simple. We perform convolution operation on the embedding matrix 

		whose shape for each batch is (num_seq, embedding_length) with kernel of varying height but constant width which is same as the embedding_length.

		We will be using ReLU activation after the convolution operation and then for each kernel height, we will use max_pool operation on each tensor 

		and will filter all the maximum activation for every channel and then we will concatenate the resulting tensors. This output is then fully connected

		to the output layers consisting two units which basically gives us the logits for both positive and negative classes.

		

		Parameters

		----------

		input_sentences: input_sentences of shape = (batch_size, num_sequences)

		batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)

		

		Returns

		-------

		Output of the linear layer containing logits for pos & neg class.

		logits.size() = (batch_size, output_size)

		

		"""


        input = self.word_embeddings(input_sentences)
        # input.size() = (batch_size, num_seq, embedding_length)
        input = input.unsqueeze(1)
        # input.size() = (batch_size, 1, num_seq, embedding_length)
        max_out1 = self.conv_block(input, self.conv1)
        max_out2 = self.conv_block(input, self.conv2)
        max_out3 = self.conv_block(input, self.conv3)


        all_out = torch.cat((max_out1, max_out2, max_out3), 1)
        # all_out.size() = (batch_size, num_kernels*out_channels)

        fc_in = self.dropout(all_out)

        # fc_in.size()) = (batch_size, num_kernels*out_channels)

        logits = self.label(fc_in)



        return logits

# Conclusion
***

# Predictions on new data (115th Congress)
***