In [1]:
from music21 import stream, interval, corpus, instrument, pitch
from music21 import converter, note, chord, environment, duration, key
import notebook
import argparse
import pandas as pd
import string
import pathlib
from sklearn import preprocessing
import numpy as np
import json
import re
import random

In [2]:
from dwbzen  import *
from dwbzen.common import TextParser
from dwbzen.common import WordCollector
from dwbzen.common import CharacterCollector
from dwbzen.common import WordProducer
from dwbzen.common import SentenceProducer
from dwbzen.common import MarkovChain
from markovify import split_into_sentences
from nltk.corpus import stopwords
from nltk import word_tokenize

## Text processing testing

In [3]:
#filename = "/data/text/Followed By Madness (parts 1 2).txt"
filename = "/data/text/Followed By Madness (part 1).txt"
text_parser = TextParser(source=filename, ignore_case=True, maxlines=None, remove_stop_words=False)
word_counts = text_parser.get_word_counts(sort_counts=True, reverse=True)
words = text_parser.get_words()
sentence_words = text_parser.get_sentence_words()
sentences = text_parser.get_sentences()
lines = text_parser.get_lines()
print(text_parser.counts_df.head(10))
print(f'number of words: {len(words)}')
print(f'number of sentence words: {len(sentence_words)}')
print(f'number of sentences: {len(sentences)}')
print(f'number of lines: {len(lines)}')

  word  count
0  the    714
1    a    426
2  and    411
3    i    402
4   of    292
5   to    263
6   my    240
7   in    195
8   he    137
9  she    130
number of words: 13495
number of sentence words: 13495
number of sentences: 1218
number of lines: 428


In [4]:
order = 2
word_keys = [ (' '.join(sentence_words[i:i+order]), sentence_words[i+order]) for i in range(0, len(sentence_words)-2) ]
word_keys[0:5]

[(' my grandfather', 'appoints'),
 ('grandfather appoints', 'me'),
 ('appoints me', 'an'),
 ('me an', 'honorary'),
 ('an honorary', 'electrical')]

In [16]:
wc = WordCollector(state_size=2, verbose=1, source=filename, ignore_case=True, remove_stop_words=False)
wc.processing_mode = 'words'

In [18]:
txt = [' i', 'finally', 'reach', 'the', 'base', 'of', 'a', 'large', 'transformer', 'two', 'more', 'steps', 'and', "i'll", 'reach', 'the', 'junction', 'of', 'high-tension', 'wires.', '.']
counts_df = wc.process_words(txt)
counts_df.head()

Unnamed: 0,reach,the,base,of,a,large,transformer,two,more,steps,and,i'll,junction,high-tension,wires.,.
i finally,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
finally reach,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
reach the,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
the base,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
base of,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
counts_df = wc.process_sentences(sentences)

process_words execution time: 0:00:00.020999
process_words execution time: 0:00:00.010000
process_words execution time: 0:00:00.016998
process_words execution time: 0:00:00.002997
process_words execution time: 0:00:00
process_words execution time: 0:00:00.014995
process_words execution time: 0:00:00.015001
process_words execution time: 0:00:00.007003
process_words execution time: 0:00:00.004997
process_words execution time: 0:00:00.026000
process_words execution time: 0:00:00.015000
process_words execution time: 0:00:00.021000
process_words execution time: 0:00:00.020001
process_words execution time: 0:00:00.012001
process_words execution time: 0:00:00.019995
process_words execution time: 0:00:00.012007
process_words execution time: 0:00:00.009997
process_words execution time: 0:00:00.009000
process_words execution time: 0:00:00.022999
process_words execution time: 0:00:00.005000
process_words execution time: 0:00:00.012001
process_words execution time: 0:00:00
process_words execution 

process_words execution time: 0:00:00.213004
process_words execution time: 0:00:00.342007
process_words execution time: 0:00:00.178002
200 of 1218
process_words execution time: 0:00:00.186007
process_words execution time: 0:00:00.328006
process_words execution time: 0:00:00.560015
process_words execution time: 0:00:00.166988
process_words execution time: 0:00:00.462014
process_words execution time: 0:00:00.831017
process_words execution time: 0:00:00.237007
process_words execution time: 0:00:00.325007
process_words execution time: 0:00:00.775019
process_words execution time: 0:00:01.129029


KeyboardInterrupt: 

In [5]:
word_keys_pd =  pd.DataFrame(data=word_keys, columns=['key','word'])
word_keys_pd.head()

Unnamed: 0,key,word
0,my grandfather,appoints
1,grandfather appoints,me
2,appoints me,an
3,me an,honorary
4,an honorary,electrical


In [6]:
words_ser = word_keys_pd.value_counts(ascending=False)
print(words_ser)

key           word  
 i don't      know      10
in front      of        10
out of        the        8
a little      girl       6
front of      the        5
                        ..
pair of       hands      1
              cuffs      1
painting in   black      1
painting and  stares     1
 a bang       on         1
Length: 12920, dtype: int64


In [7]:
words_ser.index

MultiIndex([(    ' i don't',    'know'),
            (    'in front',      'of'),
            (      'out of',     'the'),
            (    'a little',    'girl'),
            (    'front of',     'the'),
            (   ' maxi and',       'i'),
            (    'he tells',      'me'),
            (       'as if',       'i'),
            (  'shakes his',    'head'),
            (     'back of',     'the'),
            ...
            (    'panic as',      'if'),
            (   'panic and',    'dive'),
            ('paltry sales',   ' then'),
            (   'pall mall',     'and'),
            (     'pair of', 'slender'),
            (     'pair of',   'hands'),
            (     'pair of',   'cuffs'),
            ( 'painting in',   'black'),
            ('painting and',  'stares'),
            (     ' a bang',      'on')],
           names=['key', 'word'], length=12920)

In [26]:
df = pd.DataFrame(words_ser, columns=['count'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
key,word,Unnamed: 2_level_1
i don't,know,10
in front,of,10
out of,the,8
a little,girl,6
front of,the,5
...,...,...
pair of,hands,1
pair of,cuffs,1
painting in,black,1
painting and,stares,1


In [42]:
counts_df = df.reset_index()
print(f' number of keys: {len(counts_df)}')
print(counts_df.head())

 number of keys: 12920
        key  word  count
0   i don't  know     10
1  in front    of     10
2    out of   the      8
3  a little  girl      6
4  front of   the      5


In [43]:
counts_df[counts_df['key'] == ' my grandfather']

Unnamed: 0,key,word,count
4926,my grandfather,has,1
4927,my grandfather,appoints,1


In [45]:
# the initial keys
initial_keys_df = counts_df[ [x.startswith(' ') for x in counts_df['key']] ]
initial_keys_df

Unnamed: 0,key,word,count
0,i don't,know,10
5,maxi and,i,5
20,i try,to,4
29,the little,girl,3
40,i ask,him,3
...,...,...,...
4980,maxi pushes,his,1
4981,maxi pouts,and,1
4982,maxi pokes,the,1
4983,maxi looks,at,1


In [12]:
count_df[count_df['key'] == 'grandfather appoints']

Unnamed: 0,key,word,count
2824,grandfather appoints,me,1


In [39]:
akey = " i don't"
key_df = count_df[count_df['key'] ==  akey]
key_df

Unnamed: 0,key,word,count
0,i don't,know,10
4222,i don't,get,1
4223,i don't,even,1
4292,i don't,think,1


In [40]:
markov_chain_dict = dict()
#
# build the markov chain dynamically - as needed
#
s = (key_df['count']/key_df['count'].sum()).cumsum()
keyprobs_df = key_df.assign(prob=s )
keyprobs_df
markov_chain_dict[akey] = keyprobs_df

In [41]:
markov_chain_dict

{" i don't":            key   word  count      prob
 0      i don't   know     10  0.769231
 4222   i don't    get      1  0.846154
 4223   i don't   even      1  0.923077
 4292   i don't  think      1  1.000000}

In [3]:
from datetime import datetime
#
# how long to build the entire chain?
#
now = datetime.now()

#filename = "/data/text/Followed By Madness (final).txt"
#filename = "/data/text/Followed By Madness (part 1).txt"
filename = "/data/text/Followed By Madness (parts 1 2).txt"
text_parser = TextParser(source=filename, ignore_case=True, maxlines=None, remove_stop_words=False)
word_counts = text_parser.get_word_counts(sort_counts=True, reverse=True)
words = text_parser.get_words()
sentence_words = text_parser.get_sentence_words()
sentences = text_parser.get_sentences()
lines = text_parser.get_lines()
line_words = text_parser.get_line_words()
print(text_parser.counts_df.head(10))
print(f'number of words: {len(words)}')
print(f'number of sentence words: {len(sentence_words)}')
print(f'number of sentences: {len(sentences)}')
print(f'number of lines: {len(lines)}')
print(f'number of line words: {len(line_words)}')

delta = datetime.now() - now
print(f'parser time: {delta}')
now = datetime.now()

order = 2
processing_mode = 'lines'
#
# create the counts DataFrame from sentence_words or line_words
#
if processing_mode == 'sentences':
    word_keys = [ (' '.join(sentence_words[i:i+order]), sentence_words[i+order]) for i in range(0, len(sentence_words)-2) ]
elif processing_mode == 'lines':
    word_keys = [ (' '.join(line_words[i:i+order]), line_words[i+order]) for i in range(0, len(line_words)-2) ]
else:  # 'words' processing_mode
    word_keys =  [ (' '.join(words[i:i+order]), words[i+order]) for i in range(0, len(words)-2) ]

#
# create counts_df DataFrame
#
word_keys_pd =  pd.DataFrame(data=word_keys, columns=['key','word'])
words_ser = word_keys_pd.value_counts(ascending=False)
df = pd.DataFrame(words_ser, columns=['count'])
counts_df = df.reset_index()
print(f'total number of keys: {len(counts_df)}\ncounts_df:')
print(counts_df.head())

delta = datetime.now() - now
print(f'build time: {delta}\n')

# the initial keys
initial_keys_df = counts_df[ [x.startswith(' ') for x in counts_df['key']] ]
print(f'total number of initial keys: {len(initial_keys_df)}\n')
print(initial_keys_df.head())


  word  count
0  the   1307
1    a    758
2  and    753
3    i    695
4   of    562
5   to    478
6   my    424
7   in    379
8  she    240
9  her    225
number of words: 24659
number of sentence words: 24659
number of sentences: 2178
number of lines: 730
number of line words: 24659
parser time: 0:00:01.968000
total number of keys: 23287
counts_df:
        key  word  count
0  in front    of     17
1    out of   the     14
2    a pair    of     11
3   i don't  know     10
4  front of   the     10
build time: 0:00:00.060997

total number of initial keys: 654

            key  word  count
3       i don't  know     10
94      what do   you      3
119    maxi and     i      3
185   scott and     i      3
213      say no  more      3


In [4]:
initial_keys = list(set(initial_keys_df['key'].values.tolist()))
keys = list(set(counts_df['key'].values.tolist()))

In [5]:
#
# the MarkovChain model is a dictionary of DataFrames
# where the key is a string of n-words (n=order of the chain)
# that appear in the source text,
# and the DataFrame with the columns: key, word, count (from count_df) and prob
# The probabilities are computed from the counts
#
def add_counts_to_model(model: dict, countsdf: pd.DataFrame, akey: str):
    key_df = countsdf[countsdf['key'] ==  akey]
    s = (key_df['count']/key_df['count'].sum()).cumsum()
    model[akey] = key_df.assign(prob=s ) 

markov_chain_dict = dict()
now = datetime.now()

#
# create the MarkovChain dict() model only for initial keys!
# 
for akey in initial_keys:
    add_counts_to_model(markov_chain_dict, counts_df, akey)

delta = datetime.now() - now
print(delta)

0:00:02.178003


In [6]:
markov_chain_dict[" i don't"]

Unnamed: 0,key,word,count,prob
3,i don't,know,10,0.833333
8015,i don't,think,1,0.916667
8025,i don't,need,1,1.0


In [64]:
# pick a key at random
kprob = random.randint(0, len(initial_keys_df))
key = initial_keys_df.iloc[kprob]['key']
#key = " i don't"
#
# selection
#
if not key in markov_chain_dict:
    add_counts_to_model(markov_chain_dict, counts_df, key)

prob = random.random()
mdf = markov_chain_dict[key]
word = None
if len(mdf) == 0:
    word = '.'
elif len(mdf) == 1:
    word = mdf.iloc[0]['word']
elif len(mdf) > 1:
    word = mdf[mdf['prob'] >= prob].iloc[0]['word']
print(f'{key} {word} ')

 i'll bet  i 


In [65]:
mdf

Unnamed: 0,key,word,count,prob
8271,i'll bet,i,1,1.0


In [8]:
print(len(counts_df[counts_df['count'] == 1]))
print(len(counts_df[counts_df['count'] > 1]))
# most rows in counts_df - like 96% - have a count of 1
# to save space,consider only saving the entries with > 1 row
#
order

22354
933


2

### Timings
total number of keys: 12920   total number of initial keys: 1081   time: 0:00:32.547004

total number of keys: 23339   total number of initial keys: 1860   time: 0:00:58.657281

total number of keys: 1860    time: 06.495073  (initial keys only)

total number of keys: 8426    time: 0:01:52.680016 (initial keys only) for the entire Madness book text

In [102]:
#
# better to save only the counts_df in the Collector
# and maybe a MarkovChain model built from the initial keys only
# In the Producer, compute markov_chain_dict entries as needed
#

In [8]:
#
# creates a MarkovChain DataFrame from the markov_chain_dict dict()
#
chain_df = None
for key in markov_chain_dict.keys():
    keycount_df = markov_chain_dict[key]
    if chain_df is None:
        chain_df = pd.DataFrame(keycount_df)
    else:
        chain_df = pd.concat([chain_df, keycount_df], ignore_index= True)
# 

In [9]:
markovChain = MarkovChain(order, counts_df, chain_dict=markov_chain_dict, myname='madness12')
chain_df = markovChain.chain_df
chain_df.head()

Unnamed: 0,key,word,count,prob
0,at my,dirty,1,1.0
1,ran away,maxi,1,1.0
2,hey scott,you,1,1.0
3,there's a,door,1,1.0
4,do i,run,1,1.0


In [10]:
xdf = chain_df[chain_df['key']== " i don't"]
xdf

Unnamed: 0,key,word,count,prob
473,i don't,know,10,0.833333
474,i don't,think,1,0.916667
475,i don't,need,1,1.0


In [18]:
#
# the current collector method
# This won't work with the revised counts_df structure
#
def create_chain(sort_chain=True):
    """Create the MarkovChain from the counts_df by summing probabilities

    """
    markovChain =  MarkovChain(order)
    #
    # update the counts DataFrame and sort if needed
    #
    counts_df.rename_axis('KEY', inplace=True)
    if sort_chain:
        counts_df.sort_index('index', ascending=True, inplace=True)
        counts_df.sort_index(axis=1, ascending=True, inplace=True)

    sums = counts_df.sum(axis=1)
    chain_df = counts_df.div(sums, axis=0)
    chain_df.rename_axis('KEY', inplace=True)
    chain_df = chain_df.applymap(lambda x: Utils.round_values(x, 6))
    markovChain.chain_df = chain_df
    return markovChain


In [117]:
def file_reader(filename):
    for row in open(filename, 'r'):
        yield row

In [122]:
#
# use a generator to read the file lines
#
filename = "/data/text/Followed By Madness (final).txt"
nrows = 0
for row in file_reader(filename):
    nrows += 1
    print(f'{nrows} {row}')

print(nrows)

1 Blackout

2 My grandfather appoints me an honorary electrical engineer for Niagara Mohawk power company. Not a bad job for a six-year-old kid.

3 	“Of course, you'll need training,” he tells me in a very serious voice. “Could take years.”

4 	“Years?”

5 	He pulls a filter cigarette from his shirt pocket and lights it. A blue smoke haze settles around his head like a forlorn halo. “Now don't get excited,” he says. “I'll teach you the ropes.” He snickers and looks around the way he usually does when he's about do something that will annoy my folks. “To start with, an expert pole jockey needs one of these.” He rolls up his sleeve and shows off a fading tattoo he got while in the navy. The faded figure of a blue mermaid swims from elbow to wrist on Pa's left forearm. 

6 	I stare down at my own painfully skinny arm. “I don't think Dad would let me get a tattoo,” I say in a pleading kind of voice.

7 	Pa comes to rescue, “No, I suppose not. Better wait until you're a bit older. In the me

1151 	Before I could answer, she pulled me close and just stared into my eyes. My mind switched off for a moment. I caressed her face in my hands and kissed her softly on the mouth. She kissed back aggressively, running her tongue along my teeth. I don't remember if I was more amused or turned-on, but it lasted only a moment. She broke off suddenly and moved a respectful distance away, leaving me wanting more, but with the sinking thought that more wasn't going to happen. The feeling was like getting punched in the gut but without getting physically hit—it knocked the mental wind out of me.

1152 	“Diane K.,” she said.

1153 	“I'm sorry, what?”

1154 	“You asked my name, and that was a horrible pickup line by the way. When are you going to wake up and stop chasing the meridian?”

1155 	She laughed. It was delightful, but it confused the crap out of me. “What? I don't get it,” was the best I could come up with.

1156 	“No, Don, you don't. And that's just the problem!”

1157 

1158 Roche

2278 	I nod, walk across the room to the fridge, and survey the contents for anything remotely drinkable. Under the flickering light, a can of light beer remains. Hidden in the back behind the beer and a half-full jar of dill pickles is a single cupcake, a mound of creamy chocolate frosting wrinkled and stiff. A lone candle, one of those novelty candles of the kind you can't blow out, is planted in the center, and I remember that Karen's birthday is this week. I push the beer aside, take out the cupcake, and after finding some kitchen matches next to a pack of Marlboros, light the candle. It sputters in the heavy air. I pull the string overhead killing the light, plunging the room into a cloying darkness. I hear Karen's quiet breathing across the room; the feeble candlelight casts a shadow of the bare bulb overhead, swinging like a gallows against naked walls.

2279 	“Happy birthday, Karen!” I present the cupcake as if it were a priceless gift.

2280 	For a brief moment, her face glows

3410 	Another short delay. “We're going out of town this weekend for a few days. Can you do Friday?”

3411 	The dreaded “we” word, and over a weekend. Still, she wants to talk about the book.

3412 	“That works. Hey, is everything okay?”

3413 	Her answer is immediate and upbeat. “Yeah, everything's fine. Can you meet me down here?” She works in Largo or St. Pete someplace. I forget where.

3414 	“Sure. Where?”

3415 	“There's a Panera Bread close by. I'll e-mail the address.”

3416 	“Cool. So I'll see you next week. Time?”

3417 	“Can you come around eleven thirty?”

3418 	“Not a problem. See you then.” I leave a subtle invitation for small talk, even as I pull into the parking lot of WCMF, but she must be really busy.

3419 	“Okay. Bye.”

3420 	“Later, Karen.” 

3421 	We disconnect, and I walk into the studio under a giddy cloud that repeats “she wants to talk about my book.”

3422 

3423 	Last time I was in a radio station was thirty-plus years ago as a college DJ, so I have no idea

### Character Collector - Word Producer

Update the design to match the new counts_df format.

In [40]:

def process(keys, word):
    keys += [ (''.join(word[i:i+order]), word[i+order])  for i in range(0, len(word)-2) ]


In [38]:
filename = "/Compile/dwbzen/resources/text/drugBrandNames.txt"
order = 2
terminal_object = '~'
initial_object = ' '
verbose = 0

text_parser = TextParser(source=filename, ignore_case=True, maxlines=None, remove_stop_words=False)
word_counts = text_parser.get_word_counts(sort_counts=True, reverse=True)
words = text_parser.get_words()
sentence_words = text_parser.get_sentence_words()
sentences = text_parser.get_sentences()
lines = text_parser.get_lines()
line_words = text_parser.get_line_words()
print(text_parser.counts_df.head(10))
print(f'number of words: {len(words)}')
print(f'number of sentence words: {len(sentence_words)}')
print(f'number of sentences: {len(sentences)}')
print(f'number of lines: {len(lines)}')
print(f'number of line words: {len(line_words)}')


          word  count
0      trelegy      2
1       tykerb      1
2  norditropin      1
3      tresiba      1
4        zyvox      1
5       advair      1
6     beconase      1
7     delzicol      1
8    neurontin      1
9      erleada      1
number of words: 1220
number of sentence words: 1220
number of sentences: 1219
number of lines: 1219
number of line words: 1220


In [41]:
word_keys = []
for w in line_words:
    word = initial_object + w.strip() + terminal_object
    process(word_keys, word)

word_keys_df =  pd.DataFrame(data=word_keys, columns=['key','word'])
words_ser = word_keys_df.value_counts(ascending=False)
df = pd.DataFrame(words_ser, columns=['count'])
counts_df = df.reset_index()


In [42]:
counts_df

Unnamed: 0,key,word,count
0,in,~,64
1,ne,~,44
2,ra,~,43
3,p,r,39
4,pr,o,38
...,...,...,...
2854,su,l,1
2855,su,d,1
2856,if,f,1
2857,if,i,1


In [43]:
initial_keys_df = counts_df[ [x.startswith(initial_object) for x in counts_df['key']] ]
initial_keys = set(initial_keys_df['key'].values.tolist())
keys = set(counts_df['key'].values.tolist())

In [16]:
def _add_counts_to_model(model: dict, countsdf: pd.DataFrame, akey: str):
    key_df = countsdf[countsdf['key'] ==  akey]
    s = (key_df['count']/key_df['count'].sum()).cumsum()
    model[akey] = key_df.assign(prob=s )

In [39]:
markov_chain_dict = dict()
for akey in initial_keys:
    _add_counts_to_model(markov_chain_dict, counts_df, akey)

In [40]:
markov_chain = MarkovChain(order,  counts_df, chain_dict=markov_chain_dict, myname='test')
chain_df = markov_chain.chain_df

In [41]:
chain_df

Unnamed: 0,key,word,count,prob
0,e,n,14,0.191781
1,e,l,12,0.356164
2,e,p,9,0.479452
3,e,m,8,0.589041
4,e,s,6,0.671233
...,...,...,...,...
224,q,i,1,0.714286
225,q,v,1,0.785714
226,q,t,1,0.857143
227,q,s,1,0.928571


In [30]:
df = chain_df[chain_df['key'] == ' e']
df

### Collector

In [3]:
#
# CharacterCollector
#
order = 2
source = '/data/text/drugBrandNames.txt'
verbose = 1
collector = CharacterCollector(order, verbose=verbose, source=source, ignore_case=True)
collector.name = "test"
collector.format = 'csv'
collector.sort_chain = True
results=collector.run()

total number of keys: 505
number of initial keys: 26


In [4]:
markovChain = collector.markovChain
counts_df = markovChain.counts_df
chain_df = markovChain.chain_df
chain_df.head(-20)

Unnamed: 0,key,word,count,prob
0,a,b,3,0.031250
1,a,c,10,0.135417
2,a,d,12,0.260417
3,a,f,3,0.291667
4,a,g,2,0.312500
...,...,...,...,...
2834,zo,p,1,0.481481
2835,zo,r,6,0.703704
2836,zo,s,2,0.777778
2837,zo,v,1,0.814815


In [5]:
seed = 'zy'
prob = random.random()

In [6]:
print(prob)
df = chain_df[chain_df['key']==seed]
row = df[df['prob']>prob].iloc[0]
p = row['prob']
word = row['word']
print(f"prob: {p} word: '{word}'")

0.047525294755832115
prob: 0.05 word: 'c'


In [7]:
key_df = counts_df[counts_df['key']=='zo']
s = (key_df['count']/key_df['count'].sum()).cumsum()
key_df.assign(prob=s )

Unnamed: 0,key,word,count,prob
2829,zo,b,1,0.037037
2830,zo,d,1,0.074074
2831,zo,l,5,0.259259
2832,zo,m,2,0.333333
2833,zo,n,3,0.444444
2834,zo,p,1,0.481481
2835,zo,r,6,0.703704
2836,zo,s,2,0.777778
2837,zo,v,1,0.814815
2838,zo,y,1,0.851852


In [8]:
pd.read_csv("/Compile/dwbzen/resources/text/drugBrands_charCounts.csv", header=0, names=['key','word','count'])

Unnamed: 0,key,word,count
0,a,b,3
1,a,c,10
2,a,d,12
3,a,f,3
4,a,g,2
...,...,...,...
2854,zy,p,1
2855,zy,r,3
2856,zy,t,1
2857,zy,v,2


In [9]:
order = 3
word = 'aaron~'
[ (''.join(word[i:i+order]), word[i+order])  for i in range(0, len(word)-order)]

[('aar', 'o'), ('aro', 'n'), ('ron', '~')]

In [10]:
counts_df = pd.read_csv("/Compile/dwbzen/resources/text/madnessText_wordCounts.csv", header=0, names=['key','word','count'])
chain_df = pd.read_csv("/Compile/dwbzen/resources/text/madnessText_wordsChain.csv", header=0, names=['key','word','count','prob'])


In [11]:
key = 'want to'
key_df = counts_df[counts_df['key']==key]
s = (key_df['count']/key_df['count'].sum()).cumsum()

In [12]:
key_df = key_df.assign(prob=s )

In [14]:
key_df.head()

Unnamed: 0,key,word,count,prob
94743,want to,ask,4,0.076923
94744,want to,be,1,0.096154
94745,want to,bolt,1,0.115385
94746,want to,call,1,0.134615
94747,want to,come,1,0.153846
