In [181]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils import shuffle
import random as rand
from nltk import word_tokenize

import keras.backend as K
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Lambda, Dropout
from keras.utils import to_categorical

# Brown dataset: vectorization using doc2vec

In [27]:
brown_disc_df = pd.read_pickle('data/discourse_markers/brown_disc_df.pkl')

## train doc2vec model

In [65]:
X_tokens = []
for idx, row in brown_disc_df.iterrows():
    if type(row['clean']) == float:
        X_tokens.append(row['sent'])
    else:
        X_tokens.append(row['clean'])

In [None]:
tagged_data = [TaggedDocument(words = sent, tags=[str(i)]) for i, sent in enumerate(X_tokens)]
# should I have lowered? decided not to for now

In [None]:
d2v_brown = Doc2Vec(vector_size = 50, min_count = 1, dm = 1)
d2v_brown.build_vocab(tagged_data)

In [None]:
d2v_brown.train(tagged_data, total_examples = d2v_brown.corpus_count, epochs = 20)
print('training finished')
d2v_brown.save("data/discourse_markers/d2v_brown.model")
print("trained & saved")

## add vectorized texts to df

In [64]:
vecs = []
null = []

for idx, row in tqdm(brown_disc_df.iterrows(), total = len(brown_disc_df)):
    vecs.append(d2v.docvecs[str(idx)])
    if type(row.clean) == float:
        null.append(1)
    else:
        null.append(0)
        
brown_disc_df['vec'] = vecs
brown_disc_df['null'] = null




ValueError: Length of values does not match length of index

In [79]:
brown_disc_df.to_pickle('data/discourse_markers/brown_disc_df.pkl')

## create X and y for model training

In [82]:
num_samples = len(brown_disc_df)
feature_dict = {}
idx = 0
for x in brown_disc_df.columns:
    if x in ['sent', 'vec', 'clean']:
        continue
    else:
        feature_dict[idx] = x
        idx += 1
num_features = idx
assert num_features == 60

In [88]:
y = np.zeros([num_samples])
for idx in feature_dict:
    y[idx] = brown_disc_df[feature_dict[idx]]

In [96]:
X = np.zeros([num_samples, 50]) # 50 is vector_size
vecs = brown_disc_df.vec
for i in range(num_samples):
    X[i] = vecs[i]

In [106]:
with open('data/discourse_markers/feature_dict', 'wb') as f:
    pickle.dump(feature_dict, f)

In [98]:
with open('data/discourse_markers/X_50dim_d2v.pkl', 'wb') as f:
    pickle.dump(X, f)
with open('data/discourse_markers/y.pkl', 'wb') as f:
    pickle.dump(y, f)

## balanced dataset (part)

In [None]:
brown_disc_df = pd.read_pickle('data/discourse_markers/brown_disc_df.pkl')

In [233]:
def balance(item):
    num_per_cat = sum(brown_disc_df[item])
    print(str(num_per_cat) + ' instances of ' + item + ' in the dataset')
    balanced_X = np.zeros([num_per_cat * 2, 50])
    balanced_y = np.zeros([num_per_cat * 2])
    
    pos = brown_disc_df[brown_disc_df[item] == 1]
    neg = brown_disc_df[brown_disc_df[item] == 0].sample(num_per_cat, random_state = 42)
    df = pd.concat([pos, neg]).reset_index()
    
    for idx, row in df.iterrows():
        balanced_X[idx, :] = row['vec']
        balanced_y[idx] = row[item]
    
    return shuffle(balanced_X, to_categorical(balanced_y, num_classes = 2), random_state = 42)

In [234]:
X_train, y_train = balance('So')

230 instances of So in the dataset


## balanced-ish dataset (whole)

In [2]:
with open('data/discourse_markers/X_50dim_d2v.pkl', 'rb') as f:
    X = pickle.load(f)
with open('data/discourse_markers/y.pkl', 'rb') as f:
    y = pickle.load(f)

In [99]:
# balance for number of not-clean = number of clean

# number of non-false items
num_true = 4000 #sum([sum(x) != 0 for x in y])

balanced_X = np.zeros([num_true * 2, X.shape[1]])
balanced_y = np.zeros([num_true * 2, y.shape[1]])

tr_idx = 0
fa_idx = 0
overall_idx = 0

for idx in range(len(X)):
    if sum(y[idx]) > 1:
        continue
    if tr_idx < num_true and sum(y[idx]) != 0:
        balanced_X[overall_idx, :] = X[idx]
        balanced_y[overall_idx, :] = y[idx]
        tr_idx += 1
        overall_idx += 1
    if fa_idx < num_true and sum(y[idx]) == 0:
        balanced_X[overall_idx, :] = X[idx]
        balanced_y[overall_idx, :] = y[idx]
        fa_idx += 1
        overall_idx += 1

In [100]:
X_train, X_test, y_train, y_test = train_test_split(balanced_X, balanced_y, test_size=0.1, random_state=42)

In [129]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(np.argmax(y, axis=1)), np.argmax(y, axis=1))

# keras model

In [235]:
batch_size = 32
num_units = 128
vector_size = X_train.shape[1] # 50
num_outputs = y_train.shape[1] # 60

In [239]:
K.clear_session()

In [240]:
input_layer = Input(shape = (vector_size,), dtype = 'float32', name = 'input_layer')
dense_1 = Dense(num_units, activation = 'relu', name = 'dense_1')(input_layer)
dropout_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(num_units, activation = 'relu', name = 'dense_2')(dropout_1)
dropout_2 = Dropout(0.5)(dense_2)
output = Dense(num_outputs, activation = 'softmax', name = 'output_layer')(dropout_2)

model = Model(inputs = input_layer, outputs = output)

model.compile(optimizer = 'adam',
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])

In [241]:
history = model.fit(X_train, y_train, 
                    validation_split = 0.2, epochs = 10, batch_size = 32, verbose = 2)
#                   class_weight = class_weights)

Train on 368 samples, validate on 92 samples
Epoch 1/10
 - 0s - loss: 0.7001 - acc: 0.5000 - val_loss: 0.6891 - val_acc: 0.5326
Epoch 2/10
 - 0s - loss: 0.6899 - acc: 0.5326 - val_loss: 0.6895 - val_acc: 0.4783
Epoch 3/10
 - 0s - loss: 0.6684 - acc: 0.6168 - val_loss: 0.6897 - val_acc: 0.5217
Epoch 4/10
 - 0s - loss: 0.6829 - acc: 0.5625 - val_loss: 0.6884 - val_acc: 0.5000
Epoch 5/10
 - 0s - loss: 0.6765 - acc: 0.5707 - val_loss: 0.6852 - val_acc: 0.5000
Epoch 6/10
 - 0s - loss: 0.6703 - acc: 0.5951 - val_loss: 0.6867 - val_acc: 0.5543
Epoch 7/10
 - 0s - loss: 0.6679 - acc: 0.5679 - val_loss: 0.6866 - val_acc: 0.5652
Epoch 8/10
 - 0s - loss: 0.6565 - acc: 0.6277 - val_loss: 0.6860 - val_acc: 0.5326
Epoch 9/10
 - 0s - loss: 0.6606 - acc: 0.5870 - val_loss: 0.6869 - val_acc: 0.5109
Epoch 10/10
 - 0s - loss: 0.6528 - acc: 0.6168 - val_loss: 0.6895 - val_acc: 0.5326


In [135]:
model.evaluate(X_test, y_test, verbose=2)

[0.2539236021786928, 0.48375]

## work with predictions

In [None]:
brown_disc_df = pd.read_pickle('data/discourse_markers/brown_disc_df.pkl')

In [None]:
with open('data/discourse_markers/feature_dict', 'wb') as f:
    feature_dict = pickle.load(f)

In [29]:
d2v = Doc2Vec.load("data/discourse_markers/d2v_brown.model")

In [227]:
def pred(sent):
    global s
    print('original:\t' + sent)
    test = word_tokenize(sent)
    vec = d2v.infer_vector(test)
    #result = feature_dict[np.argmax(model.predict(np.array([vec],)))]
    result = np.argmax(model.predict(np.array([vec],)))
    if result == 1:
        result = 'And'
    else:
        result = 'None'
    print('result:\t\t' + str(result))

In [231]:
for row in brown_disc_df[brown_disc_df['And'] == 1].clean:
    pred(' '.join(row))
    print()

original:	After several correspondents went into Pathet Lao territory and exposed the huge build-up , administration spokesmen acclaimed them for performing a `` great service '' and laid the matter before the Southeast Asia Treaty Organization .
result:		None

original:	The election of President Kennedy has attracted new attention to the ethical climate of his home state .
result:		None

original:	Then , some churchmen remarked , there is a more classical church-state problem :
result:		None

original:	There has been no effort since the election to pull it back together .
result:		None

original:	He caused the fumble that set up our touchdown .
result:		None

original:	So it was over the weekend what with 40-year-old Warren Spahn pitching his no-hit masterpiece against the Giants and the Giants' Willie Mays retaliating with a record-tying 4-homer spree Sunday .
result:		None

original:	One of the Milwaukee rookies sighed and remarked , `` Wish I was 40 , and a top-grade big leaguer .


result:		And

original:	The pitching will also have trouble doing better .
result:		None

original:	Walker looks stronger , seems to be throwing better than he did last year .
result:		None

original:	They're confident that the GOP , currently assailed by dissensions within the ranks , will be impressed by the purring power beneath the hood of this grassroots-fueled machine .
result:		And

original:	-- Proposals for a whole series of lesser candidate-picking conventions in the state's 38 new Congressional districts .
result:		And

original:	Now , for Communist listeners and readers :
result:		None

original:	`` Jesus answering said unto him , Suffer it to be so now : for thus it becometh us to fulfill all righteousness .
result:		None

original:	`` Jesus , when he was baptized went up straightway out of the water : and lo , the heavens were opened unto him , and he saw the Spirit of God descending like a dove , and lighting upon him '' .
result:		None

original:	He will avoid eye-strai

result:		And

original:	He makes many interesting comments .
result:		None

original:	Wwrl's colorful mobile unit , cruising predominately Negro neighborhoods , is a frequent reminder of that station's round-the-clock dedication to nonwhite interests .
result:		And

original:	Many advertisers have been happy with the results of letting a Negro disc jockey phrase the commercial in his own words , working only from a fact sheet .
result:		None

original:	Presentation of `` The Life Times Of John Sloan '' in the Delaware Art Center here suggests a current nostalgia for human values in art .
result:		None

original:	One of the most appealing of the rooftop canvases is `` Sun Wind On The Roof '' , with a woman and child bracing themselves against flapping clothes and flying birds .
result:		None

original:	Everybody returned after intermission for the miscellaneous sweepings of the Fantasy For Piano , Chorus , Orchestra In C Minor , made up by its composer to fill out one of his programs .


result:		And

original:	This two-part bridge is best described by Rev. Timothy Dwight , president of Yale College , in his `` Travels In New-England New-york '' , published in New Haven in 1821 .
result:		None

original:	A good several feet around the pool should be neither greensward nor woods , but good hard pavement .
result:		And

original:	Accompanying adults are urged to keep an alert and sensible eye on their responsibilities .
result:		And

original:	Divers must be enjoined to look before they leap , either on top of someone else or onto a pool edge .
result:		And

original:	With what resource did Prokofieff back up his Credo of words -- with torrents of powerful music .
result:		None

original:	Like this English master , Mason realizes his subjects in large , simplified masses which , though they seem effortless , are in reality the result of skilled design born of hard work and a thorough distillation of the natural form that inspired them .
result:		None

original:	How very 

result:		None

original:	I have often searched for a graphic way of impressing our superiority on those Americans who have doubts , and I think Mr. Jameson Campaigne has done it well in his new book American Might Soviet Myth .
result:		None

original:	Is Western influence greater or less than it used to be ? ?
result:		None

original:	This may be as far as the process will go .
result:		And

original:	Contrary to what has been said recently , we did not wait for `` outside pressures '' and `` world opinion '' to bring down that Communist government ; ;
result:		None

original:	No wonder , for Vientiane , the old City of Sandalwood , had become the City of Bullet Holes .
result:		None

original:	Now , in March , all Laos suffered a state of siege .
result:		None

original:	`` It's all the more tragic because it's so little deserved '' , said Mr. J. J. A. Frans , a Belgian official of the United Nations Educational , Scientific , and Cultural Organization .
result:		None

original:	Gett

result:		None

original:	The direction of that movement is determined by his perception of the truth about himself .
result:		None

original:	When we consider the tenuous hold tradition has on existence , any weakening of that hold constitutes a crisis of existence .
result:		None

original:	It would seem that history is a witness to this truth .
result:		And

original:	The anxiety it generates is misinterpreted as anxiety over private interest and threatened social status .
result:		And

original:	The best way to conceal and disguise the elements of an incest story is not to set out to write an incest story .
result:		None

original:	If I now risk some comparisons with Sons Lovers let it be clear that I am not comparing the two works or judging their merits ; ;
result:		None

original:	If we understand the rocking as an erotic symbol we can also see how well it serves as the symbol of impending tragedy .
result:		None

original:	When the child dies in Lawrence's story in a delirium th

result:		None

original:	It is this , particularly the establishment of archaeology and place-name studies on a scientific basis , which are immediately pertinent to the Saxon Shore .
result:		None

original:	Against Seebohm formidable foes have taken the field , notably F. W. Maitland , whose Domesday Book Beyond was written expressly for this purpose , and Sir Paul Vinogradoff whose The Growth Of The Manor had a similar aim .
result:		None

original:	H.L. Gray in his English Field Systems and Zachrisson's Romans , Kelts Saxons defended in part the Seebohm thesis while at the present time H.P.R. Finberg and Gordon Copley seem to fall into the Celtic survivalist camp .
result:		None

original:	Comparable visions of life are at work in Antigone and Romeo Juliet .
result:		And

original:	The entirety of the natural world is party to the action .
result:		And

original:	The fate of such men has tragic relevance because it is public .
result:		And

original:	Private tragedy became the chos


original:	So , let us remember on this day not only to thank the Almighty Who gave hope and courage to the Pilgrims , but also to place our trust in Him that He will continue to protect us in the future as He has in the past .
result:		None

original:	and ( C ) to finance , for not more than three years beyond the end of said period , such activities as are required to correlate , coordinate , and round out the results of studies and research undertaken pursuant to this Act : Provided , That funds available in any one year for research and development may , subject to the approval of the Secretary of State to assure that such activities are consistent with the foreign policy objectives of the United States , be expended in cooperation with public or private agencies in foreign countries in the development of processes useful to the program in the United States : Provided further , That every such contract or agreement made with any public or private agency in a foreign country shall c

result:		None

original:	Part 1 in both volumes is labeled `` Introduction Tables '' .
result:		And

original:	There are even newer foamed plastics that are yet to be evaluated .
result:		And

original:	In meeting the demands for urethane foam as a garment interlining , new adhesives and new methods of laminating foam to a substrate have been developed .
result:		And

original:	Then : `` There are lots of kids around here '' .
result:		None

original:	Rachel's or Virginia's reply : `` better .
result:		None

original:	Another time , without accusation : `` You never wore that scarf I bought you '' .
result:		None

original:	With each sigh , like a whip in the hand of an expert , the grass stripped something from Warren .
result:		And

original:	Then the questions came , eager , interested questions , and many compliments on his having overcome his infirmity .
result:		None

original:	So he had , so he had .
result:		None

original:	Another one comes to me and he says , ' Look here , th

result:		And

original:	I ain't going back there on account of one lousy kid '' .
result:		None

original:	Was he afraid to do anything as definite as releasing her ? ?
result:		None

original:	What would her mother be doing right now ? ?
result:		None

original:	Then a startling thing occurred .
result:		None

original:	Jarrodsville was more than three miles away , down an old dirt road that the rain had turned into a quagmire .
result:		And

original:	Then the station wagon and the Ford would seek him out again .
result:		And

original:	Then he saw something that he had not seen before , and panic gripped him again .
result:		And

original:	Then he heard them .
result:		None

original:	Now he saw them .
result:		None

original:	Then he heard a car coming from the east , and he felt as if he would break down and weep .
result:		And

original:	Of course , you know not to take clippings '' .
result:		None

original:	Even if he'd somehow missed seeing him , he wouldn't have gone off and 

result:		And

original:	Then we shall see .
result:		And

original:	The husband points the steps out with his flashlight : `` Its white stare filling her pale eyes To the blind brim with appetite , Bleaching her hands that grazed my thighs Sent us from the table in surprise To let the dishes soak all night , '' ( Mary Jane asked herself if Meredith was blushing at this line , or was it the fire ? ?
result:		And

original:	The valley stretched endlessly out ahead , scorched and baked and writhing in its heat , until it vanished into the throbbing wall of fiery orange brown haze .
result:		And

original:	All the time , she had the heat of hatred in her , like charcoal that is burning on its under side , but not visibly .
result:		And

original:	`` Add fever to our troubles '' ? ?
result:		None

original:	Goaded the oxen as he yelled .
result:		And

original:	Then came the water -- not rain , but solid sheets that sluiced down like water slopping from a bucket .
result:		And

original:	Fo

result:		None

original:	`` That's what I'm going to tell Jim '' .
result:		None

original:	Shivering with shame , he crawled to the narrow end of the rock and spat into the water .
result:		None

original:	`` What time did the Chinaman go to the dentist ? ?
result:		None

original:	Wasn't John's wife , Edythe , even more appalling , if possible ? ?
result:		None

original:	Romantic ? ?
result:		And

original:	Then there was Linda's engagement to Bobbie Evans .
result:		None

original:	You didn't see her much at Longue Vue or anywhere , for John had drifted away from the gang .
result:		And

original:	Then came the hairpin turn , the smashed Jaguar and Linda , mourning alone and lovely .
result:		And

original:	Linda felt capable of capturing the affection of the children , anxious even , since she and Bobbie had had none of their own .
result:		None

original:	Also , the money can't mean as much to Bobbie .
result:		And

original:	Nadine insisted that her sitters be reliable ! !
resul

TypeError: can only join an iterable