In [4]:
import pickle,sys,os,lda,scipy,pandas,operator
import numpy as np
from nltk import word_tokenize, sent_tokenize
bios = []
for suf in ['2000', '4000', '6000', '8000','10000' ,'12000']:
    bios += pandas.read_pickle("train-corpus/corpus"+suf+".pickle")

In [5]:
def progress(i, end_val, bar_length=50):
    percent = float(i) / end_val
    hashes = '#' * int(round(percent * bar_length))
    spaces = ' ' * (bar_length - len(hashes))
    sys.stdout.write("\r{0} / {1} Percent: [{2}] {3}%".format(i, end_val, hashes + spaces, int(round(percent * 100))))
    sys.stdout.flush()

In [6]:
bios[1]

[('Jean-Pierre Abbat',
  [('Biography',
    'Jean-Pierre Fernand Noel Abbat (June 17, 1928  August 1, 1993) was, with Dr. Fritz Hartmann, the first person in the USA to manufacture polyurethane.\n\nAbbat was born in Le Trait, Normandy to a shipbuilder. He met Marina Larde at the Sorbonne and they were married, migrating to the United States in 1953.\nIn 1962 Abbat proposed to Norman McCulloch to make a ballistically equivalent bowling pin out of polyurethane foam. Bowling pins were then made out of wood, with two cylindrical voids, and covered with a thin coating. The polyurethane pin would last much longer than the wooden pin date=August 2007. The American Bowling Congress were against the idea because it would put Brunswick and AMF, the biggest bowling pin makers, out of business date=August 2007.\n\nAbbat worked for U-Do, Mattel, Kenner, Fisher-Price, and ITT, making toys and telephone parts out of urethane and other plastics.\n\nAbbat died in Raleigh, North Carolina of colon cancer

## Find noise words (in order to filter them later)

In [7]:
tokens_freqs = dict()
i = 0
for bio in bios:
    for segment in bio[0][1]:
        for paragraph_text in segment[1].split('\n'):
            tokens = word_tokenize(paragraph_text)
            for token in tokens:
                if token in tokens_freqs:
                    tokens_freqs[token] += 1
                else:
                    tokens_freqs[token] = 1
    i += 1
    progress(i, len(bios))
# Sorder tokens by frequency in reverse order (most frequent first):
most_frequent_tokens = sorted(tokens_freqs.items(), key=operator.itemgetter(1))
most_frequent_tokens.reverse()

noise_words = set()
for token_and_freq in most_frequent_tokens:
    # We take the threshold to be 1/100 the frequency of the most frequnent token:
    if token_and_freq[1] > most_frequent_tokens[0][1] / 100:
        noise_words.add(token_and_freq[0])
    else:
        break

6000 / 6000 Percent: [##################################################] 100%

## Collect paragraph data for every biography

In [8]:
bios_df = list()
all_tokens = set()
i = 0
for bio in bios:
    data = {
        'person' : bio[0][0],
        'tokenized_paragraphs' : list(),
        'paragraph_splits' : list(),
        'word_splits': [0],
        'length' : 0,
        'segments' : 0
    }
    number_of_words = 0
    for segment in bio[0][1]:
        number_of_paragraphs = 0
        data['segments'] += 1
        for paragraph_text in segment[1].split('\n'):
            original_tokens = word_tokenize(paragraph_text)
            tokens = list()
            for token in original_tokens:
                if token not in noise_words:
                    tokens.append(token)
            
            if len(tokens) > 0:
                number_of_paragraphs += 1
                number_of_words += len(tokens)
                all_tokens |= set(tokens)
                data['tokenized_paragraphs'].append(tokens)  
        data['paragraph_splits'].append(number_of_paragraphs)
        data['word_splits'].append(number_of_words)
    data['length'] = number_of_words
    bios_df.append(data)
    i += 1
    progress(i, len(bios))

6000 / 6000 Percent: [##################################################] 100%

In [10]:
vocab = list(all_tokens)
bios_data = pandas.DataFrame(bios_df)
bios_data[:2]

Unnamed: 0,length,paragraph_splits,person,segments,tokenized_paragraphs,word_splits
0,99,"[1, 1, 1]",Leonard Rossiter,3,"[[Leonard, Rossiter, 21, October, 1926, –, 5, ...","[0, 37, 77, 99]"
1,137,[15],Jean-Pierre Abbat,1,"[[Jean-Pierre, Fernand, Noel, Abbat, June, 17,...","[0, 137]"


In [14]:
all_tokens_list = list(all_tokens)
number_of_tokens = len(all_tokens_list)
all_paragraphs = bios_data['tokenized_paragraphs'].sum()
# print(len(all_paragraphs))
# paragraphs_bow = np.zeros([len(all_paragraphs),number_of_tokens], dtype = np.int)
# tokens_indices_dict = dict()
# for i in range(number_of_tokens):
#     tokens_indices_dict[all_tokens_list[i]] = i
    
# for i in range(len(all_paragraphs)):
#     for w in all_paragraphs[i]:
#         paragraphs_bow[i][tokens_indices_dict[w]] += 1
        
#     progress(i + 1, len(all_paragraphs))

## Assign LDA topics to paragraphs with word embeddings

In [15]:
from gensim import corpora
from gensim.models import LdaMulticore
dictionary = corpora.Dictionary(all_paragraphs)
corpus = [dictionary.doc2bow(text) for text in all_paragraphs]
#lda = LdaMulticore(corpus,workers=3,id2word=dictionary, num_topics=20, passes=20)
#with open('genlda6000.pkl','wb') as f:
#    pickle.dump(lda,f)
lda = pandas.read_pickle('genlda6000.pkl')

### Using gensim lda:

In [18]:
vecs = pickle.load(open('/home/ilay/vecs.pkl','rb'))

In [None]:
lda.show_topics(num_topics=5, formatted=False, num_words=5)
#sum([w[1] for w in t[1][1]])

## Comparing Our Vectors Method with a Simple LDA based Paragraph Topic Score
Instead of adding up word vectors we can just sum up the probability that a word is in the given topic for all the paragraph words. To get the same list of topic probabilities per paragraph that we do with the vectors.
For that, though, we have to pass over the vocabulary and make a *number of topics* sized list for every word.

### Generate topic lists

In [None]:
topics_words = list()

# word_freqs = dict()

for i, topic_dist in enumerate(topic_word):
    # topic_words: words sorted by relevance to topic in descending order
    topic_words = list(np.array(vocab)[np.argsort(topic_dist)[::-1]])#[:10]#[:-(n_top_words+1):-1]
    #print(topic_words)
    topics_words.append(topic_words)
    print('Topic {}: {}'.format(i, ' '.join(topic_words[:15])))
#     for word in topic_words[:20]:
#         if word not in word_freqs:
#             word_freqs[word] = 1
#         else:
#             word_freqs[word] += 1

#word_topic_dists = dict()
#for j,word in enumerate(vocab[:10]):
#    for topic in range(lda.num_topics):
#        topiclist = lda.show_topic(topic,len(all_tokens))
#        print(topiclist[:5],topiclist[-5:])
#        i = 0
#        notfound = True
#        word_topic_dists[word] = list()
#         while notfound and i < len(all_tokens):
#             if topiclist[i][0] == word:
#                 word_topic_dists[word].append(topiclist[i][1])
#                 noutfound = False
#            i += 1
#    progress(j,10)
#word_topic_dists

In [16]:
topics_words = list()
word_topics = dict()
for topic in lda.show_topics(num_topics=20, formatted=False, num_words=len(all_tokens)):
    i += 1
    topic_words = list()
    for word,score in topic[1]:
        topic_words.append(word)
        if word in word_topics:
            word_topics[word].append(score)
        else:
            word_topics[word] = [score]
    print('Topic {}: {}'.format(topic[0], ' '.join(topic_words[:15])))
    topics_words.append(topic_words)
len(word_topics)

Topic 0: King II Duke married daughter Prince Henry Queen king Roman III Emperor Charles wife brother
Topic 1: French Paris France la des Louis et du La Jean Danish ''Le War Battle Marie
Topic 2: School College United school States William professor studied family British named worked appointed England moved
Topic 3: out against did than them said so no could This up being himself people over
Topic 4: Order der Cross und Knight Grand vols. Cavendish Berlin des History die Letters clan edited
Topic 5: ISBN Press & ** Books ed Poems Toronto ''A Other 2 Time Stories 1989 edition
Topic 6: family married wife children buried home near last daughter Cemetery They mother three house age
Topic 7: Award Best won Prize Film received Awards nominated Academy Golden 1998 Festival 1993 1997 nomination
Topic 8: World War won United career Army team age world League second season II States over
Topic 9: television role appeared show played films TV actor BBC starred radio roles comedy character stage

165255

Quick sanity check

In [52]:
for j in range(5,12):
    print("topic",j)
    for i in range(10):
        word = topics_words[j][i]
        print(word,":",word_topics[word][j])
    print("=====")

topic 5
ISBN : 0.0292459357528
Press : 0.0150213496661
& : 0.00996607470323
** : 0.00900567612708
Books : 0.00611599292641
ed : 0.00513250684778
Poems : 0.00455631778693
Toronto : 0.00447018227349
''A : 0.00409525258403
Other : 0.00406324248051
=====
topic 6
family : 0.00905247612984
married : 0.00851100646704
wife : 0.00773300219653
children : 0.00746579073769
buried : 0.00720795770982
home : 0.00619619201968
near : 0.00454598226231
last : 0.00438021809429
daughter : 0.00436837548787
Cemetery : 0.00413003008392
=====
topic 7
Award : 0.0289089655523
Best : 0.01734625488
won : 0.0116144870165
Prize : 0.00855735068612
Film : 0.00694703535124
received : 0.00686637096292
Awards : 0.00685934597184
nominated : 0.00679549673394
Academy : 0.00597625910701
Golden : 0.0058161631191
=====
topic 8
World : 0.0138170958184
War : 0.0116821800404
won : 0.00530050924546
United : 0.00511337688015
career : 0.00396996318163
Army : 0.00387049807003
team : 0.00381373633574
age : 0.00369362230112
world : 0.0

In [25]:
def paragraph_to_score(paragraph_tokens,topic_index):
    return sum(word_topics[t][topic_index] for t in paragraph_tokens)

In [26]:
def paragraph_to_vector(paragraph_tokens):
    l = len(vecs['queen']) # len of the vector is 300
    paragraph_accumulative = np.zeros(l)
    topic_ratings = []
    # just sum the paragraph words' vectors to get a semantic average of it
    for ind,word in enumerate(paragraph_tokens):
        if word in vecs:
            paragraph_accumulative += vecs[word]
    return paragraph_accumulative

### For each topic, make a representing vector by summing it's first 200 word-vectors

In [27]:
topic_vectors = list()
for topic_words in topics_words:
    words_taken = 0
    i = 0
    vector = np.zeros(300)
    while(words_taken < 200):
        word = topic_words[i]
#         if (word not in word_freqs or word_freqs[word] < 5) and word in vecs:
        if word in vecs:
            vector += vecs[word]
            words_taken += 1
        i += 1
    topic_vectors.append(vector)

In [85]:
# make a list of topics for each paragraph by distance of topic vectors from the paragraph vector
def paragraph_topics_rating_vecs(paragraph,topic_vectors):
    cosine = scipy.spatial.distance.cosine
    return np.argsort([cosine(paragraph_to_vector(paragraph),topic_vector) for i, topic_vector in enumerate(topic_vectors)])


def paragraph_topics_rating_ldascores(paragraph):
    return np.argsort([1/paragraph_to_score(paragraph,i) for i in range(lda.num_topics)])

def paragraph_topics_rating(paragraph):
    return np.argsort([sum()])

In [86]:
example_paragraph = bios_data.loc[1,'tokenized_paragraphs'][0]

In [87]:
paragraph_topics_rating_vecs(example_paragraph,topic_vectors)

array([13,  1,  0, 18,  5,  8, 11,  2,  4,  7, 14,  6, 19, 12,  9, 16, 17,
       10, 15,  3])

In [88]:
paragraph_topics_rating_ldascores(example_paragraph)

array([13,  8,  7,  5, 16,  0, 12, 19,  9, 18, 14,  6,  2,  3,  4, 15, 10,
       11, 17,  1])

## Split the text according to the topic ratings with and without vectors

In [92]:
psplits_v =  list()
wsplits_v = list()
psplits_s = list()
wsplits_s = list()

for i in range(len(bios_data)):
    last_paragraph_topics_vecs = np.array([])
    last_paragraph_topics_scores = np.array([])
    number_of_paragraphs_vecs = number_of_paragraphs_scores = 1
    number_of_words = 0
    psplit_v = list()
    psplit_s = list()
    wsplit_v = list() 
    wsplit_s = list()
    for tp in bios_data.loc[i,'tokenized_paragraphs']:
        number_of_words += len(tp)
        paragraph_topics_vecs = paragraph_topics_rating_vecs(tp,topic_vectors)[:3]
        paragraph_topics_scores = paragraph_topics_rating_ldascores(tp)[:3]
        if len(last_paragraph_topics_vecs) > 0:
            if len(np.intersect1d(paragraph_topics_vecs, last_paragraph_topics_vecs)) > 0:
                number_of_paragraphs_vecs += 1
            else:
                psplit_v.append(number_of_paragraphs_vecs)
                wsplit_v.append(number_of_words)
                number_of_paragraphs_vecs = 1
        else:
            wsplit_v.append(0)
        if len(last_paragraph_topics_scores) > 0:
            if len(np.intersect1d(paragraph_topics_scores,last_paragraph_topics_scores)) > 0:
                number_of_paragraphs_scores += 1
            else:
                psplit_s.append(number_of_paragraphs_scores)
                wsplit_s.append(number_of_words)
                number_of_paragraphs_scores = 1
        else:
            wsplit_s.append(0)
        
        last_paragraph_topics_scores = paragraph_topics_scores
        last_paragraph_topics_vecs = paragraph_topics_vecs
        
       
    if number_of_paragraphs_vecs > 0:
        psplit_v.append(number_of_paragraphs_vecs)
        wsplit_v.append(number_of_words)
    if number_of_paragraphs_scores > 0:
        psplit_s.append(number_of_paragraphs_scores)
        wsplit_s.append(number_of_words)
    
    psplits_v.append(psplit_v)
    wsplits_v.append(wsplit_v)
    psplits_s.append(psplit_s)
    wsplits_s.append(wsplit_s)
    progress(i + 1, len(bios_data))
bios_data['tst_word_splits_vecs'] = pandas.Series(wsplits_v,index=bios_data.index)
bios_data['tst_paragraph_splits_vecs'] = pandas.Series(psplits_v, index=bios_data.index)
bios_data['tst_word_splits_ldascores'] = pandas.Series(wsplits_s,index=bios_data.index)
bios_data['tst_paragraph_splits_ldascores'] = pandas.Series(psplits_s, index=bios_data.index)

6000 / 6000 Percent: [##################################################] 100%

In [93]:
bios_data[:5]

Unnamed: 0,length,paragraph_splits,person,segments,tokenized_paragraphs,word_splits,tst_word_splits_vecs,tst_paragraph_splits_vecs,tst_word_splits_ldascores,tst_paragraph_splits_ldascores
0,99,"[1, 1, 1]",Leonard Rossiter,3,"[[Leonard, Rossiter, 21, October, 1926, –, 5, ...","[0, 37, 77, 99]","[0, 77, 99, 99]","[1, 1, 1]","[0, 77, 99, 99]","[1, 1, 1]"
1,137,[15],Jean-Pierre Abbat,1,"[[Jean-Pierre, Fernand, Noel, Abbat, June, 17,...","[0, 137]","[0, 79, 108, 113, 118, 135, 136, 137]","[2, 2, 2, 2, 4, 1, 2]","[0, 31, 79, 118, 137]","[1, 1, 6, 7]"
2,720,"[5, 10, 1]",Oscar Niemeyer,3,"[[Oscar, Ribeiro, Almeida, Niemeyer, Soares, F...","[0, 356, 668, 720]","[0, 213, 395, 496, 628, 668, 720, 720]","[3, 4, 3, 3, 1, 1, 1]","[0, 385, 612, 628, 668, 720]","[6, 6, 1, 1, 2]"
3,870,"[1, 8, 17, 6, 6]",Eudoxus of Cnidus,5,"[[Eudoxus, Cnidus, IPAc-en||d||s||s, lang-, ''...","[0, 33, 261, 707, 781, 870]","[0, 314, 521, 554, 870]","[9, 5, 1, 23]","[0, 214, 256, 521, 603, 707, 747, 870]","[5, 2, 7, 5, 6, 3, 10]"
4,145,[3],Jose Saramago,1,"[[Jose, Sousa, Saramago, Order, St.GColSE, 16,...","[0, 145]","[0, 145]",[3],"[0, 145]",[3]


Our method is pretty good at not over segmenting biographies that have only one segment:

In [94]:
bios_data.loc[bios_data['segments'].isin([1])][:10]

Unnamed: 0,length,paragraph_splits,person,segments,tokenized_paragraphs,word_splits,tst_word_splits_vecs,tst_paragraph_splits_vecs,tst_word_splits_ldascores,tst_paragraph_splits_ldascores
1,137,[15],Jean-Pierre Abbat,1,"[[Jean-Pierre, Fernand, Noel, Abbat, June, 17,...","[0, 137]","[0, 79, 108, 113, 118, 135, 136, 137]","[2, 2, 2, 2, 4, 1, 2]","[0, 31, 79, 118, 137]","[1, 1, 6, 7]"
4,145,[3],Jose Saramago,1,"[[Jose, Sousa, Saramago, Order, St.GColSE, 16,...","[0, 145]","[0, 145]",[3],"[0, 145]",[3]
5,238,[4],Frederick Augustus I of Saxony,1,"[[Frederick, Augustus, full, name, ''Frederick...","[0, 238]","[0, 211, 238, 238]","[1, 2, 1]","[0, 238, 238]","[3, 1]"
13,22,[1],Fritigern,1,"[[Fritigern, Fritigernus, ca, 380, Thervingian...","[0, 22]","[0, 22]",[1],"[0, 22]",[1]
14,21,[1],Victor Lustig,1,"[[Victor, Lustig, January, 4, 1890, –, March, ...","[0, 21]","[0, 21]",[1],"[0, 21]",[1]
23,12,[1],Randal L. Schwartz,1,"[[Randal, L., Schwartz, November, 22, 1961, me...","[0, 12]","[0, 12]",[1],"[0, 12]",[1]
25,29,[1],Carlos Filipe Ximenes Belo,1,"[[Carlos, Filipe, Ximenes, Belo, SDB, GCL, 3, ...","[0, 29]","[0, 29]",[1],"[0, 29]",[1]
29,134,[5],Danny Kass,1,"[[Daniel, Danny, Kass, September, 21, 1982, pr...","[0, 134]","[0, 127, 134]","[3, 2]","[0, 134, 134]","[4, 1]"
32,89,[3],Trofim Lysenko,1,"[[Trofim, Denisovich, Lysenko, lang-, lang-, s...","[0, 89]","[0, 89]",[3],"[0, 89]",[3]
34,144,[4],Irina Privalova,1,"[[Irina, Anatoljewna, Privalova, lang-, nee, S...","[0, 144]","[0, 144]",[4],"[0, 144]",[4]


## Compare with Alexaner A Alemi and Paul Ginsparg's Method
We took the code (https://github.com/alexalemi/segmentation.git) described in this article:
http://arxiv.org/pdf/1503.05543v1.pdf  and modified it a little to fit our available embeddings DB and the presentation needs. Running it on the data gives pretty poort results, but can serve as basis for evaluation of our own method.

In [40]:
sys.path.append('segmentation/code')
from segmentation.code.segmentart import *

In [95]:
from nltk.metrics.segmentation import *
def splits_list(bio,ind,acc):
    if ind == len(bio)-1:
        return acc
    elif ind == 0:
        acc.append(len(bio[ind][1].split()))
    else:
        acc.append(acc[ind-1]+len(bio[ind][1].split()))
    return splits_list(bio,ind+1,acc)

def indexlist2binary(index_list):
    ret = "1"
    for ordinal,split_location in enumerate(index_list):
        if ordinal == 0:
            continue
        ret += "0"*(split_location - index_list[ordinal - 1])
        ret += "1"
    return ret

alexmi = []
ours = []
for i in range(len(bios_data)):
    onepiece = " ".join([" ".join(tp) for tp in bios_data.loc[i,'tokenized_paragraphs']])
    gld = bios_data.loc[i,'word_splits']
    tst = [0] + segmentize(onepiece,bios_data.loc[i,'segments'],vecs) + [bios_data.loc[i,'length']] if len(gld) > 2 else gld
    if len(tst)*len(gld) > 0:
        alexmi.append({
            'person' : bios_data.loc[i,'person'], 
            'alexmi pk' : pk(indexlist2binary(gld),indexlist2binary(tst))
        })
        ours.append({
            'person': bios_data.loc[i,'person'],
            'our pk vecs' : pk(indexlist2binary(gld),indexlist2binary(bios_data.loc[i,'tst_word_splits_vecs'])),
            'our pk lda scores' : pk(indexlist2binary(gld),indexlist2binary(bios_data.loc[i,'tst_word_splits_ldascores']))
        })
    progress(i, len(bios))    

5999 / 6000 Percent: [##################################################] 100%

In [96]:
alexmi = pandas.DataFrame(alexmi)
ours = pandas.DataFrame(ours)
alexmi.describe()

Unnamed: 0,alexmi pk
count,6000.0
mean,0.321687
std,0.242585
min,0.0
25%,0.0
50%,0.390072
75%,0.513761
max,0.815789


In [97]:
ours.describe()

Unnamed: 0,our pk lda scores,our pk vecs
count,6000.0,6000.0
mean,0.226259,0.241487
std,0.185243,0.193138
min,0.0,0.0
25%,0.0,0.0
50%,0.215798,0.235638
75%,0.34434,0.366356
max,0.982456,0.966102


So we can see that even though we let the Alexmi code off all one-segment biographies, both our methods mean is still better.
But, we can also see that just using the LDA output to gauge the association between a paragraph and every topic, is slightly better than our word embedding method.
However, it's probably not good enough for feeding to the segment classifier and getting good results.