In [83]:
import json
import random
import re
import pathlib
import pandas as pd
from sklearn.externals import joblib
import numpy as np

In [2]:
# Load the pre-built word dictionary. `dict_df` is consumed later as a
# two-column frame (column 0 = word, column 1 = syllable count) when it is
# merged into the n-gram tables; `dictionary` is not referenced again in the
# cells shown here.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load a
# dictionary.pkl that you produced yourself.
# NOTE(review): `sklearn.externals.joblib` was removed in scikit-learn 0.23;
# on newer environments the import has to become a plain `import joblib`.
dictionary, dict_df = joblib.load('dictionary.pkl')

In [3]:
# Read the job postings. The corpus-building cell below groups this frame by
# 'Civil Service Title' and tokenises the 'Job Description' and
# 'Preferred Skills' columns.
jobs_df = pd.read_csv('jobs.csv')
jobs_df

Unnamed: 0,Job ID,Agency,Business Title,Civil Service Title,Job Description,Preferred Skills
0,86699,DEPT OF CITYWIDE ADMIN SVCS,Graphic Artist,GRAPHIC ARTIST,"Under the direct supervision, with some latitu...",2D animation skills are required but 3D animat...
1,87990,DEPARTMENT OF BUSINESS SERV.,Account Manager,CONTRACT REVIEWER (OFFICE OF L,Division of Economic & Financial Opportunity (...,•\tExcellent interpersonal and organizational ...
2,97899,DEPARTMENT OF BUSINESS SERV.,"EXECUTIVE DIRECTOR, BUSINESS DEVELOPMENT",ADMINISTRATIVE BUSINESS PROMOT,The New York City Department of Small Business...,
3,102221,DEPT OF ENVIRONMENT PROTECTION,Project Specialist,ENVIRONMENTAL ENGINEERING INTE,"Under direct supervision, perform elementary e...",
4,114352,DEPT OF ENVIRONMENT PROTECTION,Deputy Plant Chief,SENIOR STATIONARY ENGINEER (EL,"Under general direction, is in responsible cha...",
5,117261,DEPT OF ENVIRONMENT PROTECTION,CIVIL ENGINEERING INTERN,CIVIL ENGINEERING INTERN,The selected candidate will be responsible for...,
6,133921,NYC HOUSING AUTHORITY,Temporary Painter,PAINTER,Responsibilities of selected candidates will i...,
7,120749,DEPT OF ENVIRONMENT PROTECTION,"Director, Strategic Sourcing",ADMINISTRATIVE PROJECT MANAGER,The NYC Department of Environmental Protection...,- An MBA or other graduate degree potentially...
8,121583,LAW DEPARTMENT,COLLEGE AIDE,COLLEGE AIDE (ALL CITY DEPTS),Responsibilities include: Assisting with rese...,
9,124287,LAW DEPARTMENT,LAW STUDENT,STUDENT LEGAL SPECIALIST,"Under attorney supervision, the student will a...",Excellent research and writing skills.


In [4]:
# Punctuation that is replaced by a single space before tokenising.
special_chars = re.compile(r'[,()/$\'"*]|(- )')
# Any run of whitespace (spaces, tabs, newlines).
whitespace = re.compile(r'\s+')
# Characters treated as sentence boundaries; each one is normalised to '.'.
sentence_enders = re.compile(r'[.?!;•:]')


def clean_string(s):
    """Normalise raw posting text for sentence/word splitting.

    The substitutions run in a fixed order: special characters become a
    space, whitespace runs collapse to one space, and every sentence-ending
    character becomes '.'. The result is returned upper-cased.
    """
    text = s
    substitutions = (
        (special_chars, ' '),
        (whitespace, ' '),
        (sentence_enders, '.'),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text.upper()

# Maps each civil-service title to its corpus: a list of sentences, where each
# sentence is a list of upper-cased word tokens.
corpi = {}

for title, group in jobs_df.groupby('Civil Service Title'):
    corpus = []
    for row_index, row in group.iterrows():
        for col in ['Job Description', 'Preferred Skills']:
            raw = row[col]
            # Empty CSV cells are read as NaN (a float); there is no text to
            # tokenise, and passing NaN on to clean_string raises TypeError.
            if not isinstance(raw, str):
                continue
            try:
                # Repair text that was UTF-8 but got decoded as windows-1252.
                s = raw.encode('windows-1252').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                # Text was not mis-decoded (or cannot round-trip): keep it as is.
                s = raw
            s = clean_string(s)
            for sentence in s.split('.'):
                corpus.append(sentence.split())

    corpi[title] = corpus

In [5]:
# The n-gram models below are built from this single title's sentences.
corpus = corpi['COMPUTER SYSTEMS MANAGER']

# Bigram table: model2[first][second] -> stats for that adjacent word pair.
#   count: how often the pair occurs
#   end:   how often the pair is the last pair of its sentence
#   start: how often the pair is the first pair of its sentence
model2 = {}

for sentence in corpus:
    final_pair_index = len(sentence) - 2
    for position, (first, second) in enumerate(zip(sentence, sentence[1:])):
        followers = model2.setdefault(first, {})
        stats = followers.setdefault(second, {'count': 0, 'end': 0, 'start': 0})
        stats['count'] += 1
        if position == final_pair_index:
            stats['end'] += 1
        if position == 0:
            stats['start'] += 1
            
# Flatten the nested bigram dict into one row per (word1, word2) pair.
records = [
    (w1, w2, stats['count'], stats['end'], stats['start'])
    for w1, followers in model2.items()
    for w2, stats in followers.items()
]
# Naming the columns at construction keeps them present even if `records` is empty.
model2_df = pd.DataFrame(records, columns=['word1', 'word2', 'count', 'end', 'start'])

# Attach syllable counts for both words. These are inner joins, so any pair
# containing a word that is missing from dict_df is dropped.
model2_df = model2_df.merge(dict_df.rename(columns={0: 'word2', 1: 'syllables'}), on='word2')
model2_df = model2_df.merge(dict_df.rename(columns={0: 'word1', 1: 'syllables_word1'}), on='word1')

# end_percent: share of word2's occurrences (as the second word of a pair)
# that close a sentence. Only the numeric columns are summed.
end_totals = model2_df.groupby('word2')[['end', 'count']].sum().reset_index()
end_totals['end_percent'] = end_totals['end'] / end_totals['count']
model2_df = model2_df.merge(end_totals[['word2', 'end_percent']], on='word2')

# start_percent: share of word1's occurrences (as the first word of a pair)
# that open a sentence.
start_totals = model2_df.groupby('word1')[['start', 'count']].sum().reset_index()
start_totals['start_percent'] = start_totals['start'] / start_totals['count']
model2_df = model2_df.merge(start_totals[['word1', 'start_percent']], on='word1')

#model2_df.head()

# Trigram table: model3[first][second][third] -> stats for that word triple.
#   count: how often the triple occurs
#   end:   how often the triple is the last three words of its sentence
model3 = {}

for sentence in corpus:
    final_triple_index = len(sentence) - 3
    triples = zip(sentence, sentence[1:], sentence[2:])
    for position, (first, second, third) in enumerate(triples):
        by_second = model3.setdefault(first, {})
        by_third = by_second.setdefault(second, {})
        stats = by_third.setdefault(third, {'count': 0, 'end': 0})
        stats['count'] += 1
        if position == final_triple_index:
            stats['end'] += 1
            
# Flatten the nested trigram dict into one row per (word1, word2, word3) triple.
records = [
    (w1, w2, w3, stats['count'], stats['end'])
    for w1, seconds in model3.items()
    for w2, thirds in seconds.items()
    for w3, stats in thirds.items()
]
# Naming the columns at construction keeps them present even if `records` is empty.
model3_df = pd.DataFrame(records, columns=['word1', 'word2', 'word3', 'count', 'end'])

# Attach the syllable count of the third word (inner join: triples whose
# third word is missing from dict_df are dropped).
model3_df = model3_df.merge(dict_df.rename(columns={0: 'word3', 1: 'syllables'}), on='word3')

# end_percent: share of word3's occurrences (as the third word of a triple)
# that close a sentence. Only the numeric columns are summed.
end_totals = model3_df.groupby('word3')[['end', 'count']].sum().reset_index()
end_totals['end_percent'] = end_totals['end'] / end_totals['count']
model3_df = model3_df.merge(end_totals[['word3', 'end_percent']], on='word3')

model3_df.head()

Unnamed: 0,word1,word2,word3,count,end,syllables,end_percent
0,COLLEAGUES,AND,EXECUTIVE,2,0,4,0.0
1,BY,THE,EXECUTIVE,1,0,4,0.0
2,CONJUNCTION,WITH,EXECUTIVE,2,0,4,0.0
3,OWNERS,AND,EXECUTIVE,1,0,4,0.0
4,SEEKS,AN,EXECUTIVE,1,0,4,0.0


In [91]:
def uppercase(matchobj):
    """Return the full text of a regex match in upper case (a `re.sub` callback)."""
    matched_text = matchobj.group()
    return matched_text.upper()

def capitalize(s):
    """Upper-case the letters that should start a sentence in `s`.

    A lowercase ASCII letter is upper-cased when it is:
      * the very first character of the string (`^` is not multiline, so
        later lines are not affected),
      * the first letter after '.', '?', '!' (optionally separated by
        whitespace), or
      * a single letter preceded by whitespace and immediately followed by '.'.

    Returns the modified copy of `s`.
    """
    # Raw string: the pattern value is unchanged, but written as a normal
    # string literal its backslash escapes ('\.', '\?', '\!', '\s') are invalid
    # escape sequences and trigger warnings on modern Python.
    # NOTE(review): the '|' characters inside the character class are literal
    # pipes, so a letter following '|' is capitalised as well; kept as-is for
    # backward compatibility.
    return re.sub(r'^([a-z])|[\.|\?|\!]\s*([a-z])|\s+([a-z])(?=\.)', uppercase, s)

#haiku = [[]]
#counts = [5, 7, 5]

def get_first_word():
    """Randomly pick the opening word of a haiku from the bigram table.

    Candidates are rows of `model2_df` whose first word has at most five
    syllables and a `start_percent` above 0.1. One row is drawn uniformly.

    Returns a dict with the chosen word under 'word' and its syllable count
    under 'syllables'.
    """
    fits_first_line = model2_df['syllables_word1'] <= 5
    often_opens_sentence = model2_df['start_percent'] > .1
    candidates = model2_df[fits_first_line & often_opens_sentence]
    chosen = candidates.sample(n=1).iloc[0]
    return {'word': chosen['word1'], 'syllables': chosen['syllables_word1']}

def get_word(previous_words, remaining, line, tried_words):
    """Pick the next haiku word given the words chosen so far.

    With two or more previous words the trigram table (`model3_df`) is
    consulted; if it offers no candidate, or only one previous word is
    available, the bigram table (`model2_df`) is used with the last word.
    The word is drawn at random, weighted by the n-gram's `count`.

    Parameters
    ----------
    previous_words : list of dict
        Words chosen so far, each a dict with 'word' and 'syllables'.
    remaining : int
        Syllables still free on the current line; candidates never exceed it.
    line : int
        Index of the haiku line being filled. On line 2 (the last line) a
        word that exactly fills the line must also have a high enough
        `end_percent` (> .2 for trigrams, > .1 for bigrams).
    tried_words : list of str
        Words to exclude from the candidates.

    Returns
    -------
    dict
        The chosen word under 'word' and its syllable count under 'syllables'.

    Raises
    ------
    ValueError
        If no candidate word satisfies the constraints.
    IndexError
        If `previous_words` is empty.
    """
    if len(previous_words) >= 2:
        candidates = model3_df[
            (model3_df['word1'] == previous_words[-2]['word']) &
            (model3_df['word2'] == previous_words[-1]['word']) &
            (model3_df['syllables'] <= remaining) &
            (~model3_df['word3'].isin(tried_words))
        ]

        if line == 2:
            candidates = candidates[
                (candidates['syllables'] < remaining) | (candidates['end_percent'] > .2)
            ]

        if len(candidates) == 0:
            # No trigram continuation: fall back to the bigram model using only
            # the last word. `tried_words` must be forwarded -- omitting it made
            # this call raise TypeError, so the fallback never actually ran.
            return get_word([previous_words[-1]], remaining, line, tried_words)

        chosen = candidates.sample(n=1, weights='count').iloc[0]
        return {'word': chosen['word3'], 'syllables': chosen['syllables']}

    candidates = model2_df[
        (model2_df['word1'] == previous_words[-1]['word']) &
        (model2_df['syllables'] <= remaining) &
        (~model2_df['word2'].isin(tried_words))
    ]

    if line == 2:
        candidates = candidates[
            (candidates['syllables'] < remaining) | (candidates['end_percent'] > .1)
        ]

    if len(candidates) == 0:
        # Fail explicitly instead of relying on the error that sampling an
        # empty frame produces.
        raise ValueError('no candidate word fits the remaining syllables')

    chosen = candidates.sample(n=1, weights='count').iloc[0]
    return {'word': chosen['word2'], 'syllables': chosen['syllables']}

import copy
import time

# Backtracking schedule used by generate_haiku: entry k is the number of words
# to remove after the k-th dead end. It holds 50 values rising from 1 to 10,
# quickly at first and then levelling off at 10.
delete_n_pattern = np.ceil(11 - np.logspace(1,0)).astype(int)

def generate_haiku():
    """Generate one 5-7-5 haiku by random walk with backtracking, and print it.

    Starting from `get_first_word()`, words are appended with `get_word()`
    until each line's syllable budget (5, 7, 5) is exactly used up. When no
    word can be found, recently chosen words are removed (how many is given by
    `delete_n_pattern`) and the search continues, possibly on an earlier line.

    The finished haiku and the lower-cased search trace are printed.

    Returns
    -------
    list of str
        The search trace: each chosen word, '-WORD' for each removed word,
        and 'i++' / 'i--' whenever the search moves to the next / previous line.

    Raises
    ------
    IndexError
        When backtracking would remove words from before the first line.
        Indexing past the end of `delete_n_pattern` after too many dead ends
        presumably surfaces as IndexError as well.
    """
    # Search trace returned to the caller.
    path = []
    w = get_first_word()
    # Flat list of the words currently in the haiku, in order.
    previous_words = [w]
    # One list of word dicts per haiku line; the first word opens line 0.
    haiku = [[w], [], []]
    path.append(w['word'])
    # Syllables still to be filled on each line.
    counts = [5 - w['syllables'], 7, 5]
    # Number of dead ends hit so far; indexes the backtracking schedule.
    delete_n = 0
    # Words removed since the last successful pick; excluded from the next pick.
    tried_words = []

    # Index of the line currently being filled; decreases when backtracking
    # crosses a line boundary.
    i = 0
    while i < len(counts):

        while counts[i] > 0:
            try:
                w = get_word(previous_words, counts[i], i, tried_words)
                path.append(w['word'])
                previous_words.append(w)
                haiku[i].append(w)
                counts[i] -= w['syllables']
                # A word was placed, so earlier exclusions no longer apply.
                tried_words = []
            except Exception as e:
                # Any failure inside get_word is treated as a dead end:
                # remove the scheduled number of most recent words.
                for j in range(delete_n_pattern[delete_n]):
                    if len(haiku[i]) == 0:
                        # Current line is empty: continue removing from the previous line.
                        i -= 1
                        path.append('i--')

                    if i == -1:
                        # Nothing left to remove; signal the caller to start over.
                        raise IndexError

                    previous = haiku[i].pop()
                    path.append('-' + previous['word'])
                    previous_words.pop()
                    # Give the removed word's syllables back to its line.
                    counts[i] += previous['syllables']
                    tried_words.append(previous['word'])
                # Each further dead end uses the next entry of the schedule.
                delete_n += 1
        # Line i is exactly full; move on to the next line.
        path.append('i++')
        i += 1


    # Words are stored upper-case: lower-case the text, then capitalize it.
    print(capitalize("\n".join([" ".join([w['word'] for w in l]) for l in haiku]).lower()))
    print([i.lower() for i in path])

    return path
    

# Generate 100 haiku, keeping the search trace of each successful one.
paths = []
for haiku_number in range(100):
    while True:
        try:
            trace = generate_haiku()
        except IndexError:
            # The search dead-ended; retry with a fresh first word.
            continue
        paths.append(trace)
        break
    # Blank line between consecutive haiku in the printed output.
    print()

#for i in paths:
#    print(i)
#generate_haiku()

#generate_haiku()

Driving improvements
to the management and staff
to ensure projects
['driving', 'improvements', 'i++', 'to', 'the', 'management', 'and', 'staff', 'i++', 'to', 'ensure', 'projects', 'i++']

Fact team manager
is expected to forge strong
working knowledge skills
['fact', 'team', 'manager', 'i++', 'is', 'expected', 'to', 'build', 'new', 'i++', 'digital', 'and', '-and', '-digital', 'i--', '-new', 'all', 'i++', 'i--', '-all', '-build', 'provide', 'i++', 'a', 'high', 'level', '-level', '-high', '-a', 'leadership', 'and', '-and', '-leadership', 'i--', '-provide', 'forge', 'strong', 'i++', 'working', 'knowledge', 'skills', 'i++']

Manage a small team
to gather requirements for
data sharing needs
['manage', 'a', 'small', 'team', 'i++', 'to', 'gather', 'requirements', 'for', 'i++', 'data', 'sharing', 'needs', 'i++']

Plan and schedule for
project management office
who will be assigned
['plan', 'and', 'schedule', 'for', 'i++', 'project', 'program', '-program', 'management', 'office', 'i++', 'who',

The pace of work with
data modeling design
patterns such as click
['the', 'build', 'it', 'back', 'case', 'i++', 'management', 'system', 'in', '-in', 'data', 'i++', 'integration', '-integration', 'i--', '-data', 'in', '-in', '-system', 'and', 'staff', 'to', 'use', 'i++', 'information', '-information', 'i--', '-use', '-to', '-staff', '-and', '-management', 'i--', '-case', '-back', '-it', '-build', 'pace', 'of', 'work', 'and', 'i++', 'manage', 'time', 'according', 'to', 'i++', 'schedules', 'and', 'leads', '-leads', '-and', '-schedules', 'i--', '-to', '-according', '-time', '-manage', 'i--', '-and', 'with', 'i++', 'data', 'modeling', 'design', 'i++', 'patterns', 'such', 'as', 'click', 'i++']

Strong presence to play
an integral role in the
build it back program
['strong', 'presence', 'to', 'play', 'i++', 'an', 'integral', 'role', 'in', 'the', 'i++', 'build', 'it', 'back', 'program', 'i++']

The project teams and
vendors to design data
models and flowcharts
['the', 'project', 'teams', 'and'

Preferred candidate
is responsible for the
build it back program
['preferred', 'candidate', 'i++', 'is', 'responsible', 'for', 'one', 'i++', 'or', 'more', 'test', 'teams', '-teams', '-test', '-more', '-or', 'i--', '-one', 'the', 'i++', 'build', 'it', 'back', 'program', 'i++']

Experience on
reporting tools including
project plans timelines
['experience', 'on', 'i++', 'reporting', 'tools', 'including', 'i++', 'project', 'plans', 'timelines', 'i++']

Preferred candidate
is responsible for the
following systems
['preferred', 'candidate', 'i++', 'should', 'possess', 'the', 'following', 'i++', 'new', 'systems', 'and', '-and', '-systems', '-new', 'criteria', '-criteria', 'i--', '-following', '-the', '-possess', '-should', 'is', 'responsible', 'for', 'the', 'i++', 'following', 'systems', 'i++']

Coordinate the
participation of all
city agencies
['coordinate', 'the', 'i++', 'participation', 'of', 'all', 'i++', 'city', 'agencies', 'i++']

Producing ad-hoc
queries and a number of
third party pro

Experience as
a research and project or
change management skills
['experience', 'in', 'i++', 'testing', 'large', '-large', '-testing', 'i--', '-in', 'as', 'i++', 'a', 'research', 'and', 'project', 'costs', 'i++', 'using', '-using', 'i--', '-costs', 'or', 'i++', 'change', 'management', 'skills', 'i++']

We are looking for
a computer systems and
network and systems
['we', 'also', '-also', 'are', 'looking', 'for', 'i++', 'a', 'computer', 'systems', 'and', 'i++', 'services', 'and', '-and', '-services', 'network', 'and', 'systems', 'i++']

Strong knowledge and or
approaches to new yorkers
and communities
['strong', 'knowledge', 'and', 'or', 'i++', 'approaches', 'to', 'problems', '-problems', 'new', 'york', 'and', 'i++', 'has', 'the', '-the', '-has', 'i--', '-and', '-york', 'yorkers', 'i++', 'and', 'communities', 'i++']

Leading the user
experience design and
construction program
['leading', 'the', 'user', 'i++', 'experience', 'design', 'and', 'i++', 'manage', 'desktops', '-desktops', 'the',

In [None]:
# Print the recorded search trace of every generated haiku, one per line.
for trace in paths:
    print(trace)

In [None]:
# Inspect every trigram row whose third word is 'COMBINED'.
model3_df[model3_df['word3'] == 'COMBINED']

In [None]:
# Adds a column to model3_df in place: end_percent divided by the sum of all
# trigram counts.
# NOTE(review): the divisor is one scalar for the whole frame, so this only
# rescales end_percent uniformly; if a per-row weighting by `count` was
# intended, confirm the formula.
model3_df['weighted_end_percent'] = model3_df['end_percent']/model3_df['count'].sum()
model3_df

In [None]:
# View the trigram table ordered by weighted_end_percent (ascending).
# Requires the cell that adds the 'weighted_end_percent' column to have run.
model3_df.sort_values('weighted_end_percent')

In [None]:
# View the bigram table ordered by occurrence count (ascending).
model2_df.sort_values('count')

In [None]:
# Print the recorded search trace of every generated haiku, one per line.
for trace in paths:
    print(trace)

In [76]:
import numpy as np
# Scratch: candidate backtracking schedule -- log-spaced values from 10**0.6
# to 10**1, halved and truncated to ints (50 values, 1 up to 5).
(np.logspace(.6,1)/2).astype(int)

array([1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 5])

In [77]:
# Scratch: reciprocals of log-spaced values from 1 to 10 (decays from 1.0 to 0.1).
1/np.logspace(0,1)

array([ 1.        ,  0.95409548,  0.91029818,  0.86851137,  0.82864277,
        0.79060432,  0.75431201,  0.71968567,  0.68664885,  0.65512856,
        0.62505519,  0.59636233,  0.5689866 ,  0.54286754,  0.51794747,
        0.49417134,  0.47148664,  0.44984327,  0.42919343,  0.40949151,
        0.39069399,  0.37275937,  0.35564803,  0.33932218,  0.32374575,
        0.30888436,  0.29470517,  0.28117687,  0.26826958,  0.25595479,
        0.24420531,  0.23299518,  0.22229965,  0.21209509,  0.20235896,
        0.19306977,  0.184207  ,  0.17575106,  0.16768329,  0.15998587,
        0.1526418 ,  0.14563485,  0.13894955,  0.13257114,  0.12648552,
        0.12067926,  0.11513954,  0.10985411,  0.10481131,  0.1       ])

In [82]:
# Scratch: preview of the expression assigned to delete_n_pattern
# (50 ints rising from 1 to 10).
np.ceil(11 - np.logspace(1,0)).astype(int)

array([ 1,  2,  2,  3,  3,  4,  4,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,
        7,  7,  7,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,
        9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])