In [8]:
import pickle,sys,os,lda,scipy,pandas
import numpy as np
from nltk import word_tokenize, sent_tokenize
bios = []
for suf in ['18000']:
    bios += pandas.read_pickle("train-corpus/corpus"+suf+".pickle")

In [9]:
def progress(i, end_val, bar_length=50):
    percent = float(i) / end_val
    hashes = '#' * int(round(percent * bar_length))
    spaces = ' ' * (bar_length - len(hashes))
    sys.stdout.write("\r{0} / {1} Percent: [{2}] {3}%".format(i, end_val, hashes + spaces, int(round(percent * 100))))
    sys.stdout.flush()

In [10]:
bios[1]

[('John Renshaw Starr',
  [('Summary',
    "John Renshaw Starr (died 1996), was one of two sons of Alfred Demarest Starr (an American) and Ethel Renshaw (English). He was a grandson of William Robert Renshaw. He was an artist and a soldier during the Second World War. His story is told in a book, ''The Starr Affair'', by Jean Overton Fuller.\n\n"),
   ('Release',
    'By exploiting his ability to pass himself off as a Frenchman, he joined a group of French and Belgian prisoners who were released into the custody of the Red Cross and taken to Switzerland as the war in Europe drew to a close.\n\nStories from other SOE agents who shared his captivity at the Avenue Foch resulted in doubts being raised about his loyalty, and his case became the subject of an MI5 investigation, which concluded that although his behaviour was certainly suspicious, there were no grounds for criminal prosecution. \n\n')])]

## Collect paragraph data for every biography

In [11]:
bios_df = list()
all_tokens = set()
i = 0
for bio in bios:
    data = {
        'person' : bio[0][0],
        'tokenized_paragraphs' : list(),
        'paragraph_splits' : list(),
        'word_splits': [0],
        'length' : 0,
        'segments' : 0
    }
    number_of_words = 0
    for segment in bio[0][1]:
        number_of_paragraphs = 0
        data['segments'] += 1
        for paragraph_text in segment[1].split('\n'):
            tokens = word_tokenize(paragraph_text)
            if len(tokens) > 0:
                number_of_paragraphs += 1
                number_of_words += len(tokens)
                all_tokens |= set(tokens)
                data['tokenized_paragraphs'].append(tokens)  
        data['paragraph_splits'].append(number_of_paragraphs)
        data['word_splits'].append(number_of_words)
    data['length'] = number_of_words
    bios_df.append(data)
    i += 1
    progress(i, len(bios))

1000 / 1000 Percent: [##################################################] 100%

In [12]:
bios_data = pandas.DataFrame(bios_df)
bios_data[:2]

Unnamed: 0,length,paragraph_splits,person,segments,tokenized_paragraphs,word_splits
0,407,"[1, 4]",Samuel Cornish,2,"[[Samuel, Eli, Cornish, (, 1795, 6, November, ...","[0, 89, 407]"
1,164,"[1, 2]",John Renshaw Starr,2,"[[John, Renshaw, Starr, (, died, 1996, ), ,, w...","[0, 68, 164]"


In [13]:
all_tokens_list = list(all_tokens)
number_of_tokens = len(all_tokens_list)
all_paragraphs = bios_data['tokenized_paragraphs'].sum()
paragraphs_bow = np.zeros([len(all_paragraphs),number_of_tokens], dtype = np.int)
tokens_indices_dict = dict()
for i in range(number_of_tokens):
    tokens_indices_dict[all_tokens_list[i]] = i
    
for i in range(len(all_paragraphs)):
    for w in all_paragraphs[i]:
        paragraphs_bow[i][tokens_indices_dict[w]] += 1
        
    progress(i + 1, len(all_paragraphs))

10329 / 10329 Percent: [##################################################] 100%

## Assign LDA topics to paragraphs with word embeddings

In [14]:
vocab = list(all_tokens)
paragraphs_bow.shape

(10329, 45763)

In [15]:
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
model.fit(paragraphs_bow)  # model.fit_transform(X) is also available

<lda.lda.LDA at 0x7ff72163a518>

In [None]:
#with open('toy_topics.pkl','wb') as toy:
#    pickle.dump(model,toy)

In [29]:
#model = pandas.read_pickle('toy_topics.pkl')

### Generate topic lists

In [19]:
topic_word = model.topic_word_ 
n_top_words = 20

topics_words = list()

word_freqs = dict()

for i, topic_dist in enumerate(topic_word):
    # topic_words: words sorted by relevance to topic in descending order
    topic_words = list(np.array(vocab)[np.argsort(topic_dist)[::-1]])#[:10]#[:-(n_top_words+1):-1]
    #print(topic_words)
    topics_words.append(topic_words)
    print('Topic {}: {}'.format(i, ' '.join(topic_words[:15])))
    for word in topic_words[:20]:
        if word not in word_freqs:
            word_freqs[word] = 1
        else:
            word_freqs[word] += 1

Topic 0: the to . , his and a was of he in that for with had
Topic 1: , and '' . of the in a his by with for 's ''The as
Topic 2: the of , . for in and Award In ( ) at was has ''
Topic 3: of , the . and was in his ) ( II Duke to son King
Topic 4: '' , the . in and a as on of `` 's for film In
Topic 5: the of and , . in his as to a is on by which also
Topic 6: , of the . ) ( de '' was a in to and ; 's
Topic 7: , ) ( was . and a born an is of He American known May
Topic 8: the of , . in to and a as was from for on 's In
Topic 9: , . and New of York '' with : by John `` ; * 's
Topic 10: , the . to in and ( at first ) he He of his Open
Topic 11: , . her in and she a was She of married had to 's on
Topic 12: , in . the of and he was a at He to his University from
Topic 13: , the in . and for a ( of ) on He with League was
Topic 14: the , of . in and was War to a as 's Army during World
Topic 15: , '' the . `` of to that a and is 's I it in
Topic 16: the , . in a and on of was at 's is to Th

In [16]:
vecs = pickle.load(open('/home/ilay/vecs.pkl','rb'))

In [17]:
def paragraph_to_vector(paragraph_tokens):
    l = len(vecs['queen']) # len of the vector is 300
    paragraph_accumulative = np.zeros(l)
    topic_ratings = []
    # just sum the paragraph words' vectors to get a semantic average of it
    for ind,word in enumerate(paragraph_tokens):
        if word in vecs:
            paragraph_accumulative += vecs[word]
    return paragraph_accumulative

### For each topic, make a representing vector by summing it's first 200 word-vectors

In [20]:
topic_vectors = list()
for topic_words in topics_words:
    words_taken = 0
    i = 0
    vector = np.zeros(300)
    while(words_taken < 200):
        word = topic_words[i]
        if (word not in word_freqs or word_freqs[word] < 5) and word in vecs:
            vector += vecs[word]
            words_taken += 1
        i += 1
    topic_vectors.append(vector)

In [21]:
# make a list of topics for each paragraph by distance of topic vectors from the paragraph vector
def paragraph_topics_rating(paragraph,topic_vectors):
    cosine = scipy.spatial.distance.cosine
    return np.argsort([cosine(paragraph_to_vector(paragraph),topic_vector) for i, topic_vector in enumerate(topic_vectors)])

In [22]:
paragraph_topics_rating(bios_data.loc[1,'tokenized_paragraphs'][0],topic_vectors)

array([11,  7, 12, 16,  9,  3,  6,  4, 14, 13,  0,  1, 10, 19,  2, 15,  5,
        8, 18, 17])

This means topic 11 is most strongly linked to this pargraph, then topic 7, then 12 etc.

## Split the text using according to the topic ratings.

In [24]:
psplits = list()
wsplits = list()
for i in range(len(bios_data)):
    last_paragraph_topics = np.array([])
    number_of_paragraphs = 1
    number_of_words = 0
    psplit = list()
    wsplit = list()
    for tp in bios_data.loc[i,'tokenized_paragraphs']:
        number_of_words += len(tp)
        paragraph_topics = paragraph_topics_rating(tp,topic_vectors)[:3]
        if len(last_paragraph_topics) > 0:
            if len(np.intersect1d(paragraph_topics, last_paragraph_topics)) > 0:
                number_of_paragraphs += 1
            else:
                psplit.append(number_of_paragraphs)
                wsplit.append(number_of_words)
                number_of_paragraphs = 1
        else:
            wsplit.append(0)
        last_paragraph_topics = paragraph_topics
        
       
    if number_of_paragraphs > 0:
        psplit.append(number_of_paragraphs)
        wsplit.append(number_of_words)
    psplits.append(psplit)
    wsplits.append(wsplit)
    progress(i + 1, len(bios_data))
bios_data['tst_word_splits'] = pandas.Series(wsplits,index=bios_data.index)
bios_data['tst_paragraph_splits'] = pandas.Series(psplits, index=bios_data.index)

1000 / 1000 Percent: [##################################################] 100%

In [25]:
bios_data[:5]

Unnamed: 0,length,paragraph_splits,person,segments,tokenized_paragraphs,word_splits,tst_word_splits,tst_paragraph_splits
0,407,"[1, 4]",Samuel Cornish,2,"[[Samuel, Eli, Cornish, (, 1795, 6, November, ...","[0, 89, 407]","[0, 407]",[5]
1,164,"[1, 2]",John Renshaw Starr,2,"[[John, Renshaw, Starr, (, died, 1996, ), ,, w...","[0, 68, 164]","[0, 113, 164]","[1, 2]"
2,85,"[1, 2]",George Reginald Starr,2,"[[George, Reginald, Starr, DSO, MC, (, 6, Apri...","[0, 34, 85]","[0, 73, 85]","[1, 2]"
3,306,"[5, 1]","Claire Windsor, Countess of Ulster",2,"[[''Gloucester, family, banner, '', name, =mar...","[0, 264, 306]","[0, 145, 204, 306]","[2, 1, 3]"
4,171,"[1, 1]",Tom Campbell (California politician),2,"[[Thomas, John, ``, Tom, '', Campbell, (, born...","[0, 161, 171]","[0, 171]",[2]


Our method is pretty good at not over segmenting biographies that have only one segment:

In [76]:
bios_data.loc[bios_data['segments'].isin([1])][:10]

Unnamed: 0,length,paragraph_splits,person,segments,tokenized_paragraphs,word_splits,tst_word_splits,tst_paragraph_splits
5,153,[2],Sima Lun,1,"[[TitlesMarquess, of, Anle, Pavilion, Viscount...","[0, 153]","[0, 153, 153]","[1, 1]"
6,203,[5],Carlo Antonio Campioni,1,"[[Carlo, Antonio, Campioni, (, November, 16, ,...","[0, 203]","[0, 162, 203]","[3, 2]"
10,238,[11],Charles Pinckney (South Carolina chief justice),1,"[[Charles, Pinckney, (, died, October, 29, ,, ...","[0, 238]","[0, 212, 217, 223, 229, 237, 238, 238]","[3, 1, 2, 1, 2, 1, 1]"
11,89,[2],Eliza Lucas,1,"[[Eliza, Lucas, Pinckney, (, December, 28, ,, ...","[0, 89]","[0, 89]",[2]
13,93,[2],Dorothy Loudon,1,"[[Best, Leading, Actress, in, a, Musical, '', ...","[0, 93]","[0, 93]",[2]
15,43,[1],David Michelinie,1,"[[David, Michelinie, (, born, May, 6, ,, 1948,...","[0, 43]","[0, 43]",[1]
16,174,[9],"James L. Miller, Sr.",1,"[[James, L., Miller, ,, Sr., (, 1897-1989, ), ...","[0, 174]","[0, 168, 174]","[4, 5]"
25,195,[2],Alton B. Parker,1,"[[Alton, Brooks, Parker, (, May, 14, ,, 1852, ...","[0, 195]","[0, 195]",[2]
26,227,[3],Mumtaz Mahal,1,"[[Mumtaz, Mahal, (, 1, September, 1593, –, 17,...","[0, 227]","[0, 227]",[3]
28,284,[9],Larry Grantham,1,"[[American, Football, League, All-AFL, All-Tim...","[0, 284]","[0, 15, 34, 284]","[2, 1, 6]"


## Compare with Alexaner A Alemi and Paul Ginsparg's Method
We took the code (https://github.com/alexalemi/segmentation.git) described in this article:
http://arxiv.org/pdf/1503.05543v1.pdf  and modified it a little to fit our available embeddings DB and the presentation needs. Running it on the data gives pretty poort results, but can serve as basis for evaluation of our own method.

In [26]:
sys.path.append('segmentation/code')
from segmentation.code.segmentart import *

In [27]:
from nltk.metrics.segmentation import *
def splits_list(bio,ind,acc):
    if ind == len(bio)-1:
        return acc
    elif ind == 0:
        acc.append(len(bio[ind][1].split()))
    else:
        acc.append(acc[ind-1]+len(bio[ind][1].split()))
    return splits_list(bio,ind+1,acc)

def indexlist2binary(index_list):
    ret = "1"
    for ordinal,split_location in enumerate(index_list):
        if ordinal == 0:
            continue
        ret += "0"*(split_location - index_list[ordinal - 1])
        ret += "1"
    return ret

alexmi = []
ours = []
for i in range(len(bios_data)):
    onepiece = " ".join([" ".join(tp) for tp in bios_data.loc[i,'tokenized_paragraphs']])
    gld = bios_data.loc[i,'word_splits']
    tst = [0] + segmentize(onepiece,bios_data.loc[i,'segments'],vecs) + [bios_data.loc[i,'length']] if len(gld) > 2 else gld
    if len(tst)*len(gld) > 0:
        alexmi.append({
            'person' : bios_data.loc[i,'person'], 
            'alexmi pk' : pk(indexlist2binary(gld),indexlist2binary(tst))
        })
        ours.append({
                'person': bios_data.loc[i,'person'],
                'our pk' : pk(indexlist2binary(gld),indexlist2binary(bios_data.loc[i,'tst_word_splits']))
            })
    progress(i, len(bios))    

999 / 1000 Percent: [##################################################] 100%

In [28]:
alexmi = pandas.DataFrame(alexmi)
ours = pandas.DataFrame(ours)
alexmi.describe()

Unnamed: 0,alexmi pk
count,1000.0
mean,0.329642
std,0.24612
min,0.0
25%,0.0
50%,0.404736
75%,0.530077
max,0.792393


In [29]:
ours.describe()

Unnamed: 0,our pk
count,1000.0
mean,0.200499
std,0.163165
min,0.0
25%,0.0
50%,0.204589
75%,0.309313
max,0.830357


So we can see that even though we let the Alexmi code off all one-segment biographies, our method's mean is still better.
It's probably not good enough for feeding to the segment classifier and getting good results.
And there's a problem assessing segment classification where the number of segments is unequeal. Still,
we can try to run the classifier on a few and see what happens:

In [67]:
# turn our segmentation into something the classifer can use
def psplits2dformat(orig,df):
    orig_format = list()
    for i in range(len(df)):
        if orig[i][0][0] != df.loc[i,'person']:
            print("biography missmatch:", orig[i][0][0] , df.loc[i,'person'])
            return False
        else:
            bio = [(df.loc[i,'person'],[])]
            added = 0
            for pcount in df.loc[i,'tst_paragraph_splits']:
                segment = ""
                for j in range(pcount):
                    segment += " ".join(df.loc[i,'tokenized_paragraphs'][added])+"\n"
                    added += 1
                bio[0][1].append(('?',segment))
            orig_format.append(bio)
    return orig_format

In [60]:
pickleme = psplits2dformat(bios,bios_data)

999 / 1000 Percent: [##################################################] 100%

In [62]:
pickleme[10]

[('Charles Pinckney (South Carolina chief justice)',
  [('?',
    "Charles Pinckney ( died October 29 , 1758 ) was a noted South Carolina politician and colonial agent . He was also the father of two candidates for Vice-President and President . For four presidential elections in a row , from 1796 to 1808 , one of his sons would receive votes in the Electoral College .\nPinckney was long prominent in colonial affairs , serving as attorney general of the Province of South Carolina in 1733 , speaker of the assembly in 1736 , 1738 and 1740 , chief justice of the province in 1752–1753 , and agent for South Carolina in England in 1753–1758 .\nPinckney married Eliza Lucas as his second wife in 1744 . Three of their children lived to adulthood : Charles Cotesworth , a signer of the U.S. Constitution and the Federalist candidate for President in 1804 and 1808 and Vice-President in 1800 ; Harriott , who married Daniel Horry ; and Thomas , who negotiated Pinckney 's Treaty with Spain in 1795 and

In [63]:
bios[10]

[('Charles Pinckney (South Carolina chief justice)',
  [('Biography',
    "Charles Pinckney (died October 29, 1758) was a noted South Carolina politician and colonial agent. He was also the father of two candidates for Vice-President and President. For four presidential elections in a row, from 1796 to 1808, one of his sons would receive votes in the Electoral College. \n\nPinckney was long prominent in colonial affairs, serving as attorney general of the Province of South Carolina in 1733, speaker of the assembly in 1736, 1738 and 1740, chief justice of the province in 1752–1753, and agent for South Carolina in England in 1753–1758.\n\nPinckney married Eliza Lucas as his second wife in 1744.  Three of their children lived to adulthood: Charles Cotesworth, a signer of the U.S. Constitution and the Federalist candidate for President in 1804 and 1808 and Vice-President in 1800; Harriott, who married Daniel Horry; and Thomas, who negotiated Pinckney's Treaty with Spain in 1795 and was the

In [66]:
with open("segmented18000.pkl","wb") as seg:
    pickle.dump(pickleme,seg)