In [1]:
from music21 import stream, interval, corpus, instrument, pitch
from music21 import converter, note, chord, environment, duration, key
import notebook
import argparse
import pandas as pd
import pathlib
from sklearn import preprocessing
import numpy as np
import json
import re

In [3]:
filename = "/Users/DWBZe/Documents/Docs/Careers/data/editions.json"
fp = open(filename, "r")
jtxt = fp.read()
editions = json.loads(jtxt)
editions

{'Hi-Tech Edition': {'version': '1.1',
  'Help': 'Rule Book v2 booklet.pdf',
  'Game Parameters': 'gameParameters.json',
  'Opportunity Cards': 'opportunityCards.json',
  'Experience Cards': 'experienceCards.json',
  'Occupations': 'occupations.json',
  'Players': 'players.json',
  'Border Squares': 'borderSquares.json',
  'Rules': 'rules.json'},
 'Destination London': {'version': '1.0',
  'Help': 'Rule Book Destination London.pdf',
  'Game Parameters': 'gameParameters.json',
  'Opportunity Cards': 'opportunityCards.json',
  'Experience Cards': 'experienceCards.json',
  'Occupations': 'occupations.json',
  'Players': 'players.json',
  'Border Squares': 'borderSquares.json',
  'Rules': 'rules.json'}}

In [4]:
print(editions.keys())
print(editions['Hi-Tech Edition'])

dict_keys(['Hi-Tech Edition', 'Destination London'])
{'version': '1.1', 'Help': 'Rule Book v2 booklet.pdf', 'Game Parameters': 'gameParameters.json', 'Opportunity Cards': 'opportunityCards.json', 'Experience Cards': 'experienceCards.json', 'Occupations': 'occupations.json', 'Players': 'players.json', 'Border Squares': 'borderSquares.json', 'Rules': 'rules.json'}


In [5]:
fp = open("/Users/DWBZe/Documents/Docs/Careers/data/gameParameters.json", "r")
jtxt = fp.read()
game_parameters = json.loads(jtxt)
game_parameters

{'starting_salary': 2000,
 'starting_cash': 2000,
 'starting_experience_cards': 0,
 'starting_opportunity_cards': 0,
 'default_game_points': 100,
 'timed_game': 0,
 'default_game_minutes': 60}

In [5]:
df = pd.DataFrame({
    'id': [0, 1, 2],
    'cats': [['A','B','C'],
             ['U','O','T'],
             ['T','C','U']]
})
df

Unnamed: 0,id,cats
0,0,"[A, B, C]"
1,1,"[U, O, T]"
2,2,"[T, C, U]"


In [9]:
#
# id  A B C U O T
#  0  1 1 1 0 0 0
#  1  0 0 0 1 1 1
#  2  0 0 1 1 0 1
exp = df.explode('cats')
exp

Unnamed: 0,id,cats
0,0,A
0,0,B
0,0,C
1,1,U
1,1,O
1,1,T
2,2,T
2,2,C
2,2,U


In [10]:
pd.crosstab(exp['id'], exp['cats'])

cats,A,B,C,O,T,U
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,1,1,0,0,0
1,0,0,0,1,1,1
2,0,0,1,0,1,1


In [12]:
pd.get_dummies(exp['cats'])

Unnamed: 0,A,B,C,O,T,U
0,1,0,0,0,0,0
0,0,1,0,0,0,0
0,0,0,1,0,0,0
1,0,0,0,0,0,1
1,0,0,0,1,0,0
1,0,0,0,0,1,0
2,0,0,0,0,1,0
2,0,0,1,0,0,0
2,0,0,0,0,0,1


### pack and unpack

In [9]:
#
# merge 2 dictionaries
# *  is unpacking operator for iterable objects
# ** unpacks dictionaries
#
x = {'a': 1, 'b':2}
y = {'c': 3, 'd': 4}
{**x,**y}

{'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [18]:
nums1 = [1,2,3,4,5]
nums2 = [6,7,8,9,10]
print(nums1)
print(*nums1)
# merge lists
nums = [*nums1, *nums2]
nums

[1, 2, 3, 4, 5]
1 2 3 4 5


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [22]:
name = 'Don Bacon'
first, *middle, last = name
print(f'{first}\t{middle}\t{last}')

D	['o', 'n', ' ', 'B', 'a', 'c', 'o']	n


In [26]:
# pack using *
*names, = 'Don','Fred','Karen'
names

['Don', 'Fred', 'Karen']

In [42]:
def some_func(**kwargs):
    if 'reverse' in kwargs:
        v = kwargs['reverse']
        return v[len(v)::-1]

In [47]:
x = "abcdef"
some_func(reverse='Donald')

'dlanoD'

### Generators, NLTK

In [6]:
 # generator expression 
nums_squared_gc = (num**2 for num in range(5))

In [7]:
next(nums_squared_gc)

0

In [6]:
#
# reverse the digits of a positive number
#
def reverse(num):
    s = str(num)
    return int(s[len(s)::-1])

In [7]:
reverse(377821)

128773

In [5]:
import nltk
from nltk import word_tokenize, wordpunct_tokenize, regexp_tokenize
import string
# nltk.download('punkt')   # only do once
punct =  string.punctuation.replace("'", "")
punct = punct.replace('-', '')
punct = punct + '“‘’”'

def remove_punctuation(txt):
    nopunc = [char for char in txt if char not in punct]
    # Join the characters again to form the string.
    return ''.join(nopunc)

In [60]:
# generator to read large files
#
filename = "/data/text/ferlinghetti.txt"
file_lines = (row for row in open(filename, "r"))
for l in range(10):
    print(f'{l}: {next(file_lines)} ')
file_lines.close()
# next(file_lines) # throws StopIteration

0: The world is a beautiful place:
 
1: The world is a beautiful place
 
2: to be born into
 
3: if you don't mind happiness
 
4: not always being
 
5: so very much fun
 
6: if you don't mind a touch of hell
 
7: now and then
 
8: just when everything is fine
 
9: because even in heaven
 


In [61]:
filename = "/data/text/ferlinghetti.txt"
fp = open(filename, "r")
txt = fp.read()

txt = remove_punctuation(txt)

tokens = []
for l in txt.splitlines():
    print(f'line: {l}')
    tokens += l.split(' ')
tokens[:100]

line: The world is a beautiful place
line: The world is a beautiful place
line: to be born into
line: if you don't mind happiness
line: not always being
line: so very much fun
line: if you don't mind a touch of hell
line: now and then
line: just when everything is fine
line: because even in heaven
line: they don't sing
line: all the time
line: 
line: The world is a beautiful place
line: to be born into
line: if you don't mind some people dying
line: all the time
line: or maybe only starving
line: some of the time
line: which isn't half bad
line: if it isn't you
line: 
line: Oh the world is a beautiful place
line: to be born into
line: if you don't much mind
line: a few dead minds
line: in the higher places
line: or a bomb or two
line: now and then
line: in your upturned faces
line: or such other improprieties
line: as our Name Brand society
line: is prey to
line: with its men of distinction
line: and its men of extinction
line: and its priests
line: and other patrolmen
line: 
line: and

['The',
 'world',
 'is',
 'a',
 'beautiful',
 'place',
 'The',
 'world',
 'is',
 'a',
 'beautiful',
 'place',
 'to',
 'be',
 'born',
 'into',
 'if',
 'you',
 "don't",
 'mind',
 'happiness',
 'not',
 'always',
 'being',
 'so',
 'very',
 'much',
 'fun',
 'if',
 'you',
 "don't",
 'mind',
 'a',
 'touch',
 'of',
 'hell',
 'now',
 'and',
 'then',
 'just',
 'when',
 'everything',
 'is',
 'fine',
 'because',
 'even',
 'in',
 'heaven',
 'they',
 "don't",
 'sing',
 'all',
 'the',
 'time',
 '',
 'The',
 'world',
 'is',
 'a',
 'beautiful',
 'place',
 'to',
 'be',
 'born',
 'into',
 'if',
 'you',
 "don't",
 'mind',
 'some',
 'people',
 'dying',
 'all',
 'the',
 'time',
 'or',
 'maybe',
 'only',
 'starving',
 'some',
 'of',
 'the',
 'time',
 'which',
 "isn't",
 'half',
 'bad',
 'if',
 'it',
 "isn't",
 'you',
 '',
 'Oh',
 'the',
 'world',
 'is',
 'a',
 'beautiful',
 'place',
 'to']

In [36]:
print(len(txt))
tokens = word_tokenize(txt)
print(f'{len(tokens)} tokens')
print(tokens)

18342
3536 tokens
['The', 'world', 'is', 'a', 'beautiful', 'place', 'The', 'world', 'is', 'a', 'beautiful', 'place', 'to', 'be', 'born', 'into', 'if', 'you', 'do', "n't", 'mind', 'happiness', 'not', 'always', 'being', 'so', 'very', 'much', 'fun', 'if', 'you', 'do', "n't", 'mind', 'a', 'touch', 'of', 'hell', 'now', 'and', 'then', 'just', 'when', 'everything', 'is', 'fine', 'because', 'even', 'in', 'heaven', 'they', 'do', "n't", 'sing', 'all', 'the', 'time', 'The', 'world', 'is', 'a', 'beautiful', 'place', 'to', 'be', 'born', 'into', 'if', 'you', 'do', "n't", 'mind', 'some', 'people', 'dying', 'all', 'the', 'time', 'or', 'maybe', 'only', 'starving', 'some', 'of', 'the', 'time', 'which', 'is', "n't", 'half', 'bad', 'if', 'it', 'is', "n't", 'you', 'Oh', 'the', 'world', 'is', 'a', 'beautiful', 'place', 'to', 'be', 'born', 'into', 'if', 'you', 'do', "n't", 'much', 'mind', 'a', 'few', 'dead', 'minds', 'in', 'the', 'higher', 'places', 'or', 'a', 'bomb', 'or', 'two', 'now', 'and', 'then', 'in',

In [49]:
txt = "The world is a beautiful place\nto be born into\nif you don't mind happiness"
tokens = []
for l in txt.splitlines():
    tokens += l.split(' ')
print(tokens)

['The', 'world', 'is', 'a', 'beautiful', 'place', 'to', 'be', 'born', 'into', 'if', 'you', "don't", 'mind', 'happiness']


In [6]:
import pandas as pd
from markovify import split_into_sentences

class TextParser(object):
        
    def __init__(self, txt = None, maxlines=None):
        self._tokens = []
        self._lines = []
        self._sentences = []
        self._token_set = None
        self._nlines = 1
        self._token_counts = {}
        self._maxlines = maxlines
        self.words_df = None
        self.verbose = 0
        self.text = txt
        if self.text is not None and len(self.text) > 0:
            self.parse_text(self.text, maxlines=maxlines)
    
    def get_tokens(self):
        return self._tokens
    
    def get_lines(self):
        return self._lines
    
    def get_sentences(self):
        return self._sentences
    
    def size(self):
        return self._nlines
    
    def get_token_set(self):
        if self._token_set is None:
            self._set_token_set()
        return self._token_set
    
    def _set_token_set(self):
        self._token_set = set(self._tokens)
    
    def _set_token_counts(self, sort_counts=False, reverse=False):
        if len(self._token_counts) == 0:
            for w in self._token_set:
                self._token_counts |= {w:self._tokens.count(w)}
        if sort_counts:
            self._token_counts = dict(sorted(self._token_counts.items(), key=lambda item: item[1], reverse=reverse))
        self.words_df = pd.DataFrame(data=self._token_counts.items(), columns=['word','count'])
    
    def get_token_counts(self, sort_counts=False, reverse=False):
        if self._token_set is None:
            self.get_token_set()
        self._set_token_counts(sort_counts, reverse)
        return self._token_counts
    
    def parse_text(self, txt, maxlines=None):
        self.text = txt
        for l in txt.splitlines():
            if maxlines is not None and self._nlines > maxlines:
                break
            if self.verbose > 0:
                print(f'line {self.nlines}: {l}')
            ls = split_into_sentences(l)
            for s in ls:
                self._sentences.append(s)
                s_rempunc = remove_punctuation(s)
                words = s_rempunc.split(' ')
                self._tokens += [str.lower(w) for w in words if len(w) > 0]
            self._lines.append(l)
            self._nlines += 1
        self._set_token_set()
        return self._nlines


In [191]:
filename2 = "/data/text/Followed By Madness (final).txt"
fp = open(filename2, "r")
txt = fp.read()
txt = txt.replace('\t',' ')

In [197]:
tp = TextParser(txt, 10)


In [198]:
tokens = tp.get_tokens()
print(tokens)

['blackout', 'my', 'grandfather', 'appoints', 'me', 'an', 'honorary', 'electrical', 'engineer', 'for', 'niagara', 'mohawk', 'power', 'company', 'not', 'a', 'bad', 'job', 'for', 'a', 'six-year-old', 'kid', 'of', 'course', "you'll", 'need', 'training', 'he', 'tells', 'me', 'in', 'a', 'very', 'serious', 'voice', 'could', 'take', 'years', 'years', 'he', 'pulls', 'a', 'filter', 'cigarette', 'from', 'his', 'shirt', 'pocket', 'and', 'lights', 'it', 'a', 'blue', 'smoke', 'haze', 'settles', 'around', 'his', 'head', 'like', 'a', 'forlorn', 'halo', 'now', "don't", 'get', 'excited', 'he', 'says', "i'll", 'teach', 'you', 'the', 'ropes', 'he', 'snickers', 'and', 'looks', 'around', 'the', 'way', 'he', 'usually', 'does', 'when', "he's", 'about', 'do', 'something', 'that', 'will', 'annoy', 'my', 'folks', 'to', 'start', 'with', 'an', 'expert', 'pole', 'jockey', 'needs', 'one', 'of', 'these', 'he', 'rolls', 'up', 'his', 'sleeve', 'and', 'shows', 'off', 'a', 'fading', 'tattoo', 'he', 'got', 'while', 'in',

In [199]:
tc = tp.get_token_counts(sort_counts=True, reverse=True)
tc

{'a': 15,
 'the': 13,
 'he': 12,
 'i': 6,
 'of': 6,
 'to': 6,
 'and': 6,
 'his': 5,
 'my': 5,
 'me': 4,
 'in': 4,
 'from': 3,
 'around': 3,
 'pocket': 3,
 'not': 3,
 'blue': 3,
 'an': 3,
 'that': 2,
 'small': 2,
 'with': 2,
 'need': 2,
 'cigarette': 2,
 'mermaid': 2,
 'you': 2,
 'get': 2,
 'electrical': 2,
 'arm': 2,
 'do': 2,
 'years': 2,
 'says': 2,
 'it': 2,
 'on': 2,
 'tattoo': 2,
 'out': 2,
 'takes': 2,
 'voice': 2,
 'off': 2,
 'power': 2,
 "don't": 2,
 'think': 2,
 'for': 2,
 'one': 2,
 'pa': 2,
 "i'll": 2,
 'then': 2,
 'ever': 1,
 'am': 1,
 'figure': 1,
 'very': 1,
 'what': 1,
 'leather': 1,
 'smoke': 1,
 "pa's": 1,
 'would': 1,
 'say': 1,
 'dad': 1,
 'painfully': 1,
 'naked': 1,
 'usually': 1,
 'grid': 1,
 'shorts': 1,
 'into': 1,
 'older': 1,
 'pulls': 1,
 'take': 1,
 'smoldering': 1,
 'right': 1,
 'rescue': 1,
 'way': 1,
 'when': 1,
 'about': 1,
 'future': 1,
 'comes': 1,
 'something': 1,
 'ballpoint': 1,
 'will': 1,
 'suppose': 1,
 'these': 1,
 "workman's": 1,
 'plan': 1,
 '

In [200]:
tp.words_df

Unnamed: 0,word,count
0,a,15
1,the,13
2,he,12
3,i,6
4,of,6
...,...,...
190,snickers,1
191,pleading,1
192,front,1
193,wrist,1


In [10]:
filename = "/data/text/ferlinghetti.txt"
fp = open(filename, "r")
txt = fp.read()

In [11]:
tp = TextParser()
tp.parse_text(txt) #, maxlines=10)

831

In [12]:
tp.get_token_set()

{'television',
 'fifth',
 'gods',
 'ginsberg',
 'amusement',
 'junkmans',
 'tear',
 'sensations',
 'fewer',
 'best',
 'so',
 'models',
 'conference',
 'drunks',
 'after',
 'easel',
 'revival',
 'rapture',
 'cloisters',
 'church',
 'heaven',
 'in',
 'junglejims',
 'did',
 'they',
 'society',
 'my',
 'aphrodite',
 'authorlawrence',
 'fool',
 'crayfish',
 'number',
 'without',
 'thump',
 'watching',
 'breathless',
 'tender',
 'helped',
 'false',
 'sheaves',
 'coupla',
 "who's",
 'model',
 'clothes',
 'alice',
 'take',
 'barking',
 'players',
 'metro',
 'leaned',
 'dense',
 'faces',
 'am',
 'pilots',
 'behind',
 'plates',
 'des',
 'ccc',
 'wind',
 'investigating',
 'eat',
 'write',
 'listening',
 'two-step',
 'tail',
 'pointing',
 'long',
 'walking',
 'blowing',
 'south',
 'citizens',
 'were',
 'was',
 'crossed',
 'congressman',
 'asia',
 "dog's",
 'priests',
 'feet',
 'made',
 'planets',
 'illustrating',
 'easily',
 'languidly',
 'stocking',
 'cocteau',
 'died',
 'kind',
 'loona',
 'earth

In [13]:
tp.get_token_counts()

{'television': 1,
 'fifth': 2,
 'gods': 1,
 'ginsberg': 1,
 'amusement': 1,
 'junkmans': 1,
 'tear': 1,
 'sensations': 1,
 'fewer': 1,
 'best': 1,
 'so': 4,
 'models': 1,
 'conference': 1,
 'drunks': 1,
 'after': 3,
 'easel': 1,
 'revival': 1,
 'rapture': 1,
 'cloisters': 1,
 'church': 1,
 'heaven': 2,
 'in': 90,
 'junglejims': 1,
 'did': 5,
 'they': 18,
 'society': 1,
 'my': 14,
 'aphrodite': 2,
 'authorlawrence': 1,
 'fool': 1,
 'crayfish': 1,
 'number': 3,
 'without': 5,
 'thump': 1,
 'watching': 4,
 'breathless': 1,
 'tender': 1,
 'helped': 1,
 'false': 1,
 'sheaves': 1,
 'coupla': 1,
 "who's": 1,
 'model': 2,
 'clothes': 4,
 'alice': 1,
 'take': 2,
 'barking': 1,
 'players': 1,
 'metro': 1,
 'leaned': 1,
 'dense': 1,
 'faces': 1,
 'am': 75,
 'pilots': 1,
 'behind': 2,
 'plates': 1,
 'des': 1,
 'ccc': 1,
 'wind': 1,
 'investigating': 1,
 'eat': 4,
 'write': 3,
 'listening': 1,
 'two-step': 2,
 'tail': 1,
 'pointing': 1,
 'long': 5,
 'walking': 2,
 'blowing': 1,
 'south': 1,
 'citiz

In [16]:
tp.words_df.head(20)

Unnamed: 0,word,count
0,television,1
1,fifth,2
2,gods,1
3,ginsberg,1
4,amusement,1
5,junkmans,1
6,tear,1
7,sensations,1
8,fewer,1
9,best,1


In [45]:
df = pd.DataFrame(columns=['word','count', 'start_sentence','end_sentence'])
row = {'word':'Don','count':10,'start_sentence':True, 'end_sentence':True}
df

Unnamed: 0,word,count,start_sentence,end_sentence


In [53]:
df2 = pd.DataFrame.from_records(data=[row], columns=['word','count', 'start_sentence','end_sentence'])
df2

Unnamed: 0,word,count,start_sentence,end_sentence
0,Don,10,True,True


In [55]:
df = pd.concat([df,df2])
row = {'word':'Karen','count':5,'start_sentence':True, 'end_sentence':False}
df2 = pd.DataFrame.from_records(data=[row], columns=['word','count', 'start_sentence','end_sentence'])
df = pd.concat([df,df2])
df

Unnamed: 0,word,count,start_sentence,end_sentence
0,Don,10,True,True
0,Karen,5,True,False
