In [1]:
import json
import random
import re
import pathlib
import pandas as pd
from sklearn.externals import joblib
import numpy as np
import copy
import time
import traceback

In [2]:
# Load the pre-built dictionary objects. Further down, dict_df is joined to the
# n-gram tables with its column 0 renamed to a word column and its column 1
# renamed to a syllable-count column.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load 'dictionary.pkl' from a source you trust.
dictionary, dict_df = joblib.load('dictionary.pkl')

In [3]:
# Read the job postings. The cells below group by 'Civil Service Title' and
# read text from the 'Job Description' and 'Preferred Skills' columns.
jobs_df = pd.read_csv('jobs.csv')
# Bare last expression so the notebook renders the frame.
jobs_df

Unnamed: 0,Job ID,Agency,Business Title,Civil Service Title,Job Description,Preferred Skills
0,86699,DEPT OF CITYWIDE ADMIN SVCS,Graphic Artist,GRAPHIC ARTIST,"Under the direct supervision, with some latitu...",2D animation skills are required but 3D animat...
1,87990,DEPARTMENT OF BUSINESS SERV.,Account Manager,CONTRACT REVIEWER (OFFICE OF L,Division of Economic & Financial Opportunity (...,•\tExcellent interpersonal and organizational ...
2,97899,DEPARTMENT OF BUSINESS SERV.,"EXECUTIVE DIRECTOR, BUSINESS DEVELOPMENT",ADMINISTRATIVE BUSINESS PROMOT,The New York City Department of Small Business...,
3,102221,DEPT OF ENVIRONMENT PROTECTION,Project Specialist,ENVIRONMENTAL ENGINEERING INTE,"Under direct supervision, perform elementary e...",
4,114352,DEPT OF ENVIRONMENT PROTECTION,Deputy Plant Chief,SENIOR STATIONARY ENGINEER (EL,"Under general direction, is in responsible cha...",
5,117261,DEPT OF ENVIRONMENT PROTECTION,CIVIL ENGINEERING INTERN,CIVIL ENGINEERING INTERN,The selected candidate will be responsible for...,
6,133921,NYC HOUSING AUTHORITY,Temporary Painter,PAINTER,Responsibilities of selected candidates will i...,
7,120749,DEPT OF ENVIRONMENT PROTECTION,"Director, Strategic Sourcing",ADMINISTRATIVE PROJECT MANAGER,The NYC Department of Environmental Protection...,- An MBA or other graduate degree potentially...
8,121583,LAW DEPARTMENT,COLLEGE AIDE,COLLEGE AIDE (ALL CITY DEPTS),Responsibilities include: Assisting with rese...,
9,124287,LAW DEPARTMENT,LAW STUDENT,STUDENT LEGAL SPECIALIST,"Under attorney supervision, the student will a...",Excellent research and writing skills.


In [4]:
# Punctuation that carries no meaning for the word model; each match becomes a space.
special_chars = re.compile(r'[,()/$\'"*]|(- )')
# Any run of whitespace (spaces, tabs, newlines), collapsed to a single space.
whitespace = re.compile(r'\s+')
# Characters treated as sentence boundaries; every one is normalised to '.'.
sentence_enders = re.compile(r'[.?!;•:]')
# The acronym "IT" as a standalone token. The word boundaries matter: a plain
# substring replace would also rewrite all-caps words such as CITY -> CI-TY.
it_acronym = re.compile(r'\bIT\b')


def clean_string(s):
    """Normalise raw posting text into upper-case, '.'-delimited sentences.

    Steps, in order:
      1. Rewrite the acronym ``IT`` / ``I.T.`` as ``I-T`` so that, after
         upper-casing, it stays distinct from the pronoun "it" (and so the
         dots in ``I.T.`` are not taken for sentence ends).
      2. Replace noise punctuation with spaces and collapse whitespace.
      3. Map every sentence-ending character to ``.``.
      4. Upper-case the result.

    Parameters
    ----------
    s : str
        Raw text of one posting field.

    Returns
    -------
    str
        Cleaned, upper-cased text; callers split it on ``.`` to get sentences.
    """
    s = it_acronym.sub('I-T', s).replace('I.T.', 'I-T')
    s = special_chars.sub(' ', s)
    s = whitespace.sub(' ', s)
    s = sentence_enders.sub('.', s)

    return s.upper()

# Build one corpus per civil-service title: a list of sentences, each sentence
# being a list of upper-case word tokens produced by clean_string().
corpi = {}

for title, group in jobs_df.groupby('Civil Service Title'):
    corpus = []
    for _, row in group.iterrows():
        for col in ['Job Description', 'Preferred Skills']:
            text = row[col]
            if not isinstance(text, str):
                # Empty CSV cells are read as NaN (a float); there is no text
                # to tokenise, and clean_string() would fail on a non-string.
                continue
            try:
                # Undo mojibake: UTF-8 bytes that were decoded as windows-1252.
                text = text.encode('windows-1252').decode('utf-8')
            except UnicodeError:
                # The text does not round-trip, so it was not mis-decoded
                # this way; keep it as read.
                pass
            text = clean_string(text)
            for sentence in text.split('.'):
                corpus.append(sentence.split())

    corpi[title] = corpus

In [5]:
def _count_bigrams(corpus):
    """Count adjacent word pairs; returns {word1: {word2: {'count', 'end', 'start'}}}.

    'start' counts how often the pair opens a sentence and 'end' how often it
    closes one.
    """
    table = {}
    for sentence in corpus:
        last_pair = len(sentence) - 2
        for i in range(len(sentence) - 1):
            first, second = sentence[i], sentence[i + 1]
            stats = table.setdefault(first, {}).setdefault(
                second, {'count': 0, 'end': 0, 'start': 0})
            stats['count'] += 1
            if i == last_pair:
                stats['end'] += 1
            if i == 0:
                stats['start'] += 1
    return table


def _count_trigrams(corpus):
    """Count adjacent word triples; returns {word1: {word2: {word3: {'count', 'end'}}}}.

    'end' counts how often the triple closes a sentence.
    """
    table = {}
    for sentence in corpus:
        last_triple = len(sentence) - 3
        for i in range(len(sentence) - 2):
            first, second, third = sentence[i], sentence[i + 1], sentence[i + 2]
            stats = table.setdefault(first, {}).setdefault(second, {}).setdefault(
                third, {'count': 0, 'end': 0})
            stats['count'] += 1
            if i == last_triple:
                stats['end'] += 1
    return table


def _add_share(frame, key, numerator, share_col):
    """Add `share_col` = sum(numerator) / sum('count') per value of `key`.

    Only the two needed columns are aggregated, once, so string columns are
    never summed.
    """
    totals = frame.groupby(key)[[numerator, 'count']].sum().reset_index()
    totals[share_col] = totals[numerator] / totals['count']
    return frame.merge(totals[[key, share_col]], on=key)


def _bigram_frame(bigrams, dict_df):
    """Flatten bigram counts into a frame with syllable counts and start/end shares.

    Both words are inner-joined against dict_df, so pairs containing a word
    missing from the dictionary are dropped before the shares are computed.
    """
    records = [
        (first, second, stats['count'], stats['end'], stats['start'])
        for first, followers in bigrams.items()
        for second, stats in followers.items()
    ]
    # Explicit column names keep the schema intact even for an empty record list.
    frame = pd.DataFrame.from_records(
        records, columns=['word1', 'word2', 'count', 'end', 'start'])
    frame = frame.merge(dict_df.rename(columns={0: 'word2', 1: 'syllables'}), on='word2')
    frame = frame.merge(dict_df.rename(columns={0: 'word1', 1: 'syllables_word1'}), on='word1')
    frame = _add_share(frame, 'word2', 'end', 'end_percent')
    return _add_share(frame, 'word1', 'start', 'start_percent')


def _trigram_frame(trigrams, dict_df):
    """Flatten trigram counts into a frame with word3 syllables and end share.

    Only word3 is inner-joined against dict_df.
    """
    records = [
        (first, second, third, stats['count'], stats['end'])
        for first, seconds in trigrams.items()
        for second, thirds in seconds.items()
        for third, stats in thirds.items()
    ]
    frame = pd.DataFrame.from_records(
        records, columns=['word1', 'word2', 'word3', 'count', 'end'])
    frame = frame.merge(dict_df.rename(columns={0: 'word3', 1: 'syllables'}), on='word3')
    return _add_share(frame, 'word3', 'end', 'end_percent')


# One (bigram frame, trigram frame) pair per civil-service title.
models = {}

for key, corpus in corpi.items():
    model2 = _count_bigrams(corpus)
    model3 = _count_trigrams(corpus)

    if not model2 or not model3:
        # No sentence is long enough to yield both pairs and triples, so
        # there is nothing to build a usable model from for this title.
        print(key, len(corpus), 'skipped: not enough text for an n-gram model')
        continue

    model2_df = _bigram_frame(model2, dict_df)
    model3_df = _trigram_frame(model3, dict_df)

    # Progress line: title, sentences, distinct first words / rows per model.
    print(key, len(corpus), len(model2), len(model2_df), len(model3), len(model3_df))

    models[key] = (model2_df, model3_df)

FORESTER 70 137 206 125 192
CASHIER 156 225 384 210 378
ASSOCIATE STAFF ANALYST 953 1589 4622 1542 5764
SECRETARY 33 80 114 69 99
ASSISTANT COMMISSIONER (BUILDI 44 261 433 249 457
PUBLIC HEALTH ADVISER 476 632 1340 598 1637
CONFIDENTIAL AGENCY INVESTIGAT 44 277 440 262 449
DIRECTOR OF PUBLIC & COMMUNITY 92 241 325 230 338
SPECIAL EXAMINER (DCAS) 151 564 1126 550 1258
ADM INSPECTOR (BUILDINGS) NM 102 316 587 301 652
PUBLIC HEALTH SANITARIAN 493 608 1144 584 1287
EMERGENCY PREPAREDNESS SPECIAL 586 879 2207 838 2639
ADMIN ARCHITECT (NON MGRL) M-1 57 284 499 268 502
MOTOR VEHICLE OPERATOR 28 166 268 155 268
ASSISTANT CIVIL ENGINEER 1062 1237 3407 1186 4243
RESEARCH PROJECTS COORDINATOR 360 658 1542 628 1759
LETTERER AND SIGN PAINTER 11 101 138 99 139
MANAGEMENT AUDITOR 37 275 470 263 490
STRATEGIC INITIATIVE SPECIALIS 325 804 1893 768 2127
WATERSHED MAINTAINER 246 448 846 423 929
IT PROJECT SPECIALIST 219 565 1200 541 1365
MECHANICAL ENGINEERING INTERN 66 269 447 257 464
 ANALYSTS 8 9 9 7 

ASSISTANT HIGHWAY TRANSPORTATI 73 368 638 354 680
ADMIN CONTRACT SPECIALIST (PYR 607 1254 3301 1190 3985
HOUSING MANAGER (HA) 98 253 420 236 424
ADMINISTRATIVE MANAGER 121 592 1137 566 1239
GRAPHIC ARTIST 124 649 1307 620 1452
*ATTORNEY AT LAW 60 117 189 112 193
ADMINISTRATIVE JOB OPPORTUNITY 125 294 524 282 571
SENIOR INTERGROUP RELATIONS OF 29 94 139 89 143
CERT IT ADMINISTRATOR (DB) 413 669 1434 635 1622
ELECTRICAL ENGINEER 428 738 1699 717 1979
INVESTMENT ANALYST 238 403 847 390 936
EXECUTIVE ASSISTANT TO THE COM 43 140 233 131 236
SR HEALTHCARE PROG PLAN ANLYST 99 286 456 275 538
SENIOR ESTIMATOR (GENERAL CONS 80 307 532 294 537
EMERGENCY PREPAREDNESS MANAGER 32 113 167 104 172
INSTRUMENTATION SPECIALIST 223 674 1334 643 1534
MEDICOLEGAL ANALYST (LAW DEPT) 7 68 79 66 79
TELECOMMUNICATIONS ASSOCIATE ( 332 806 1624 764 1846
 ANALYST (ATTORNEY) 8 8 8 6 6
CRIMINALIST ASSISTANT DIRECTOR 50 214 352 209 397
DEPUTY ASSISTANT DIRECTOR (CIV 117 344 529 322 537
CRIMINALIST 655 519 996 494 11

DIRECTOR OF CONSUMER INFORMATI 53 232 405 218 417
ENGINEERING TECHNICIAN 374 547 1101 526 1217
CITY PLANNER 735 1157 3119 1118 3888
SUPERVISOR (PEST CONTROL) 12 112 161 105 162
PARALEGAL AIDE 229 536 1106 506 1196
IT SECURITY SPECIALIST 392 659 1396 628 1555


In [12]:
def uppercase(matchobj):
    """re.sub callback: return the whole matched text upper-cased."""
    return matchobj.group(0).upper()


def capitalize(s):
    """Upper-case selected lower-case letters of `s` and return the result.

    A lower-case ASCII letter is upper-cased when it is
      * the very first character of the string (``^`` is not multiline here,
        so later lines are not affected by this rule),
      * the first letter after ``.``, ``?``, ``!`` (or ``|``, which the
        character class also contains) and optional whitespace, or
      * a letter preceded by whitespace and immediately followed by ``.``.

    The pattern is a raw string so that ``\\.``, ``\\?``, ``\\!`` and ``\\s``
    reach the regex engine without Python's invalid-escape warning.
    """
    return re.sub(r'^([a-z])|[\.|\?|\!]\s*([a-z])|\s+([a-z])(?=\.)', uppercase, s)

def get_first_word(model2_df):
    """Pick a random opening word from the bigram table.

    Eligible rows have a first word of at most 5 syllables whose
    'start_percent' is above 0.1. One eligible row is drawn uniformly.

    Returns a dict with the chosen 'word' and its 'syllables'.
    """
    short_enough = model2_df['syllables_word1'] <= 5
    often_first = model2_df['start_percent'] > .1
    candidates = model2_df[short_enough & often_first]
    chosen = candidates.sample(n=1).iloc[0]
    return {'word': chosen['word1'], 'syllables': chosen['syllables_word1']}

def get_word(previous_words, remaining, line, tried_words, model2_df, model3_df):
    """Draw the next word, weighted by how often it follows the preceding words.

    With two or more previous words the trigram table is used; if it offers
    no candidate, the function backs off to the bigram table using only the
    last word. (The back-off call previously omitted three required
    arguments and therefore always raised TypeError.)

    Parameters
    ----------
    previous_words : list of dict
        Words chosen so far, each ``{'word': ..., 'syllables': ...}``.
    remaining : int
        Syllables still free on the current line; candidates must fit.
    line : int
        Zero-based haiku line. On line 2 (the last) a word that exactly fills
        the line must also have a high enough 'end_percent'.
    tried_words : list of str
        Words excluded from the draw.
    model2_df, model3_df : pandas.DataFrame
        Bigram and trigram tables.

    Returns
    -------
    dict
        ``{'word': ..., 'syllables': ...}`` for the chosen word.

    Raises
    ------
    ValueError
        If no candidate word is available.
    IndexError
        If `previous_words` is empty.
    """
    if len(previous_words) >= 2:
        subset = model3_df[
            (model3_df['word1'] == previous_words[-2]['word']) &
            (model3_df['word2'] == previous_words[-1]['word']) &
            (model3_df['syllables'] <= remaining) &
            (~model3_df['word3'].isin(tried_words))
        ]

        if line == 2:
            # Last line: a word that uses up all remaining syllables ends the
            # haiku, so it should be one that often ends a sentence.
            subset = subset[(subset['syllables'] < remaining) | (subset['end_percent'] > .2)]

        if len(subset) == 0:
            # Back off to the bigram model, conditioning on the last word only.
            return get_word([previous_words[-1]], remaining, line,
                            tried_words, model2_df, model3_df)

        w = subset.sample(n=1, weights='count').iloc[0]

        return {'word': w['word3'], 'syllables': w['syllables']}

    subset = model2_df[
        (model2_df['word1'] == previous_words[-1]['word']) &
        (model2_df['syllables'] <= remaining) &
        (~model2_df['word2'].isin(tried_words))
    ]

    if line == 2:
        subset = subset[(subset['syllables'] < remaining) | (subset['end_percent'] > .1)]

    if len(subset) == 0:
        # Same exception type that sampling from an empty frame raises, stated
        # explicitly so the dead end is visible in the code.
        raise ValueError('no candidate word fits the remaining syllables')

    w = subset.sample(n=1, weights='count').iloc[0]

    return {'word': w['word2'], 'syllables': w['syllables']}



# Backtracking schedule: np.logspace(1, 0) yields 50 values falling from 10 to 1,
# so this is 50 integers rising from 1 to 10. Entry k is the number of words
# removed on the k-th dead end within one haiku attempt.
delete_n_pattern = np.ceil(11 - np.logspace(1,0)).astype(int)

def generate_haiku(model2_df, model3_df):
    """Generate one 5-7-5 haiku by random walk with backtracking.

    Words are drawn with get_first_word() / get_word(). Whenever get_word()
    raises, the most recent words are removed (how many is taken from
    delete_n_pattern) and the walk resumes from there.

    Returns a tuple ``(text, path)``: ``text`` is the three lines joined by
    newlines, lower-cased and passed through capitalize(); ``path`` is a
    lower-cased trace of the walk (words added, ``-word`` for removals,
    ``i++`` / ``i--`` for line changes).

    Raises IndexError when backtracking goes past the first line, or when the
    number of dead ends exceeds the length of delete_n_pattern (the index
    lookup fails). generate_haikus() treats IndexError as a failed attempt.
    """
    path = []
    w = get_first_word(model2_df)
    previous_words = [w]
    # One list of word dicts per haiku line; the opening word starts line 0.
    haiku = [[w], [], []]
    path.append(w['word'])
    # Syllables still to fill on each line; line 0 is already debited by the first word.
    counts = [5 - w['syllables'], 7, 5]
    # Number of dead ends hit so far; indexes delete_n_pattern.
    delete_n = 0
    # Words removed since the last successful draw; get_word() excludes them.
    tried_words = []
    
    i = 0
    while i < len(counts): 
        while counts[i] > 0:
            try:
                w = get_word(previous_words, counts[i], i, tried_words, model2_df, model3_df)
                path.append(w['word'])
                previous_words.append(w)
                haiku[i].append(w)
                counts[i] -= w['syllables']
                # A successful draw clears the exclusion list.
                tried_words = []
            except Exception as e:
                # Dead end: undo the last delete_n_pattern[delete_n] words.
                for j in range(delete_n_pattern[delete_n]):
                    if len(haiku[i]) == 0:
                        # Current line is empty; continue undoing on the previous line.
                        i -= 1
                        path.append('i--')
                        
                    if i == -1:
                        # Backed out past the first line: give up on this attempt.
                        raise IndexError
                        
                    previous = haiku[i].pop()
                    path.append('-' + previous['word'])
                    previous_words.pop()
                    # Give the removed word's syllables back to its line.
                    counts[i] += previous['syllables']
                    tried_words.append(previous['word'])
                delete_n += 1
        path.append('i++')
        i += 1


    #print(capitalize("\n".join([" ".join([w['word'] for w in l]) for l in haiku]).lower()))
    #print([i.lower() for i in path])
    
    #return haiku, path
    return capitalize("\n".join([" ".join([w['word'] for w in l]) for l in haiku]).lower()), [i.lower() for i in path]

def generate_haikus(key, model2_df, model3_df, n_haikus=2, max_tries=100):
    """Generate up to `n_haikus` haikus for one title, retrying failed attempts.

    Parameters
    ----------
    key : str
        Title the models belong to; copied into every result tuple.
    model2_df, model3_df : pandas.DataFrame
        Bigram and trigram tables passed through to generate_haiku().
    n_haikus : int, optional
        Number of haikus to produce (default 2).
    max_tries : int, optional
        Failed attempts tolerated per haiku (default 100). Once one haiku
        has failed more often than this, generation for the title stops.

    Returns
    -------
    list of tuple
        ``(key, haiku_text, path)`` for each haiku produced; may hold fewer
        than `n_haikus` entries (or none) if the retry limit was hit.
    """
    results = []
    for _ in range(n_haikus):
        tries = 0
        while True:
            try:
                haiku, path = generate_haiku(model2_df, model3_df)
            except IndexError:
                # generate_haiku() signals an abandoned attempt with IndexError.
                tries += 1
                if tries > max_tries:
                    return results
            else:
                results.append((key, haiku, path))
                break

    return results
    

# Word choice relies on DataFrame.sample(), which draws from NumPy's global
# random state when no random_state is given. Seeding it here makes a
# Restart & Run All reproduce the same haikus; change the seed for a new batch.
HAIKU_SEED = 42
np.random.seed(HAIKU_SEED)

# Generate haikus for every title; printing the title serves as a progress log.
results = []
for key, (model2_df, model3_df) in models.items():
    results.extend(generate_haikus(key, model2_df, model3_df))
    print(key)
    
    


ASSOCIATE FIRE PROTECTION INSP
ASSOCIATE PUBLIC HEALTH SANITA
PLUMBER
ADMINISTRATIVE PUBLIC INFORMAT
DIRECTOR OF FIELD OPERATIONS (
SUPERVISING BLASTING INSPECTOR
FORESTER
STEAM FITTER'S HELPER
ASSISTANT CORPORATION COUNSEL
PROTECTION AGENT (ACS)
ASSISTANT HIGHWAY TRANSPORTATI
COMMUNITY COORDINATOR
JR PHNURSE (SCH HLTH)>20 HR PT
CONSTRUCTION PROJECT MANAGER I
ADMINISTRATIVE DIRECTOR OF RES
HOUSING MANAGER (HA)
SENIOR STATIONARY ENGINEER (EL
ADMINISTRATIVE CLAIM EXAMINER
CITY PLANNING TECHNICIAN
COMPUTER AIDE
ADMINISTRATIVE GRAPHIC ARTIST
PEST CONTROL AIDE
CONSULTANT (PUBLIC HEALTH-SOCI
CERT IT ADMINISTRATOR (DB)
 ANALYST
QUALITY ASSURANCE SPECIALIST (
COMPUTER ASSOC (TECH SUPP)
Asst Comm-Prgm Dev Revw-HMH
CUSTOMER INFORMATION REP
CITY DEPUTY MEDICAL DIRECTOR
STAFF NURSE
PROCUREMENT ANALYST
COMPUTER PROGRAMMER ANALYST
BAKER
CEMENT MASON
TELECOMMUNICATION MANAGER
ASSISTANT COMMISSIONER (DBS)
BRIDGE OPERATOR
HEATING PLANT TECHNICIAN (HA)
CONSTRUCTION PROJECT MANAGER
SENIOR ESTIMATOR (GENE

STAFF ANALYST
ADMINISTRATIVE PROCUREMENT ANA
ASSISTANT COMMISSIONER (DEP)
SECRETARY
BUSINESS PROMOTION COORDINATOR
SENIOR FIELD SUPERVISOR (SUMME
CARETAKER (HA)
ASSOCIATE URBAN DESIGNER
COMMUNITY SERVICE AIDE
GASOLINE ROLLER ENGINEER
DIRECTOR OF NEIGHBORHOOD CONSE
COMMUNITY ASSOCIATE
CERTIFIED IT ADMINISTRATOR (LA
CLERICAL AIDE
ACCOUNTANT
CONFIDENTIAL STRATEGY PLANNER
CITY MEDICAL SPECIALIST (PART-
 UNIT HEAD
ADMINISTRATIVE  PROGRAM OFFICE
ADM ENGINEER (NON MGRL)
POLICE ADMINISTRATIVE AIDE
SPECIAL EXAMINER (DCAS)
PORT MARINE ENGINEER
SENIOR INTERGROUP RELATIONS OF
ASSOCIATE FRAUD INVESTIGATOR (
ADMINISTRATIVE QUALITY ASSURAN
CORRECTIONAL STANDARDS REVIEW
CERTIFIED IT ADMINISTRATOR LAN
ASSOCIATE INSPECTOR (CONSTRUCT
MOTOR VEHICLE SUPERVISOR
EXTERMINATOR
PROJECT MANAGER INTERN#
COMPUTER SPECIALIST (SOFTWARE)
SUPERVISOR (PEST CONTROL)
HIGH PRESSURE PLANT TENDER
LABORATORY ASSOCIATE
PUBLIC RELATIONS ADVISER
SPECIAL INVESTIGATOR
PUBLIC HEALTH ADVISER (SCHOOL
ADMINISTRATIVE DIRECTOR OF SOC
S

In [13]:
# One row per generated haiku; identical haiku texts are kept only once
# (first occurrence wins) before the table is written out.
results_df = (
    pd.DataFrame
    .from_records(results, columns=['title', 'haiku', 'path'])
    .drop_duplicates(subset=['haiku'])
)
results_df.to_csv('results_new.csv', index=False)

In [18]:
# Spot check: bigram table (tuple element 0) for one title, ordered by how
# often word1 starts a sentence, lowest first.
models['ASSOCIATE INVESTIGATOR (NOT PR'][0].sort_values('start_percent')

Unnamed: 0,word1,word2,count,end,start,syllables,syllables_word1,end_percent,start_percent
0,VOLUNTEER,AND,2,0,0,1,3,0.000000,0.000000
125,RESPONDING,TO,2,0,0,1,3,0.000000,0.000000
126,SENSITIVITY,TO,2,0,0,1,5,0.000000,0.000000
127,IS,TO,2,0,0,1,1,0.000000,0.000000
129,PUBLIC,TRANSPORTATION,2,0,0,4,2,0.000000,0.000000
130,SUPERB,INTERPERSONAL,2,0,0,5,2,0.000000,0.000000
131,A,WIDE,2,0,0,1,1,0.000000,0.000000
132,A,BROAD,2,0,0,1,1,0.000000,0.000000
133,A,LIAISON,2,0,0,3,1,0.000000,0.000000
134,MULTIPLE,STAKEHOLDERS,2,0,0,3,3,0.000000,0.000000


In [17]:
# Spot check: trigram table (tuple element 1) for the same title.
models['ASSOCIATE INVESTIGATOR (NOT PR'][1]

Unnamed: 0,word1,word2,word3,count,end,syllables,end_percent
0,VOLUNTEER,AND,YOUTH,2,0,1,0.0
1,DYCD,SUPPORT,YOUTH,2,0,1,0.0
2,TO,VULNERABLE,YOUTH,2,0,1,0.0
3,DURING,THE,YOUTH,4,0,1,0.0
4,OF,THE,YOUTH,2,0,1,0.0
5,DEPARTMENT,OF,YOUTH,2,0,1,0.0
6,POSITIVELY,IMPACT,YOUTH,2,0,1,0.0
7,PROVIDERS,AND,YOUTH,2,0,1,0.0
8,FOR,2018,YOUTH,2,0,1,0.0
9,CENTRAL,TASK,IS,2,0,1,0.0
