In [118]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import json
from geopy import geocoders
import math
import gensim
import nltk
import difflib

In [20]:
#Get SPARK going
import os
os.environ['PYSPARK_PYTHON'] = '/anaconda/bin/python'

import findspark
findspark.init()

import pyspark
conf = (pyspark.SparkConf()
    .setMaster('local')
    .setAppName('pyspark')
    .set("spark.executor.memory", "2g"))
sc = pyspark.SparkContext(conf=conf)

import sys
from pyspark.sql import SQLContext

In [2]:
#Import seasonsDict.json which contains hometown information
with open("tempdata/listAllDicts.json") as json_file:
    seasons = json.load(json_file)

In [6]:
#Make a function that get's contestant's profession for a given season
def get_profession(choose_season):
    town_dict = {}
    for idict in seasons:
        if idict["season"] == choose_season:
            if idict["elimination"] == "bachelor":
                bachtown = idict["occupation"]
            else:
                town_dict.update({idict['name']:idict['occupation']})
    return town_dict

In [109]:
#Collect professions over all seasons
all_professions = []
for season_num in [2,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19]:
    all_professions.extend(get_profession(season_num).values())

#Make profession names cleaner for processing
all_professions = map(lambda r: r.replace("&",""), all_professions)
all_professions = map(lambda r: r.replace("/"," "), all_professions)
all_professions = map(lambda r: r.replace("  "," "), all_professions)

In [110]:
#Replace words we a prior (or post-priori) know will give problems to LDA
all_professions = [u"attorney" if s=="paralegal" else s for s in all_professions]
all_professions = [u"photography" if s=="photographer" else s for s in all_professions]
all_professions = [u"gymnast" if s=="acrobat" else s for s in all_professions]
all_professions = [u"executive recruiter" if s=="IT recruiter" else s for s in all_professions]
all_professions = [u"Administrative Assistant" if s=="Assistant" else s for s in all_professions]
all_professions = [u"Cosmetics Stylist" if s=="Esthetician" else s for s in all_professions]
all_professions = [u"Songwriter singer" if s=="Singer-songwriter" else s for s in all_professions]
all_professions = [u"Medical Assistant" if s=="Medical Technician" else s for s in all_professions]
all_professions = [u"Advertising account manager" if s=="Advertising Executive" else s for s in all_professions]
all_professions = [u"gymnast" if s=="WWE Diva-in-Training" else s for s in all_professions]
all_professions = [u"doctor" if s=="physician" else s for s in all_professions]
all_professions = [u"doctor" if s=="Physician" else s for s in all_professions]

In [111]:
#Just get Nouns from professions
#Use the NLTK package to tokenize each word in each sentence
#Collect only adjectives and adverbs
all_nouns = []
for sentence in all_professions:
    stokens = nltk.word_tokenize(sentence)
    sent_noun = []
    for word, part_of_speech in nltk.pos_tag(stokens):
        if part_of_speech in ['NN', 'NNS', 'NNP', 'NNPS']:
            sent_noun.append(word)
    all_nouns.append(sent_noun)

In [112]:
#Create vocabulary using word counts
#Borrows heavily from the method of HW5

ldadatardd=sc.parallelize(all_professions).cache()
vocabtups = (ldadatardd.flatMap(lambda word: word.split(" "))
           .map(lambda word: (word, 1))
           .reduceByKey(lambda a,b: a+b)
           .map(lambda (x,y): x)
           .zipWithIndex()
           ).cache()

vocab = vocabtups.collectAsMap()
id2word = vocabtups.map(lambda (x,y): (y,x)).collectAsMap()


u'Boutique'

In [113]:
from collections import defaultdict

#Create map function that uses defaultdict
#to count how many times a word appears in sentence
def mapper_dict(row):
    d = defaultdict(int)
    for noun in row:
        d[vocab[noun]] +=1
    return d.items()

#Make corpus
prof_sentences = ldadatardd.map(lambda word: word.split(" ")).cache()
corpus = prof_sentences.map(mapper_dict).collect()

In [116]:
#Now run LDA on corpus
#We have few words so we run on the whole corpus each time
#We start with 5 different topics
lda2 = gensim.models.ldamodel.LdaModel(corpus, id2word=id2word, num_topics=10, chunksize=len(corpus), passes=20)

In [117]:
#Look at the results of LDA
for bow in corpus:
    print bow
    print lda2.get_document_topics(bow)
    print " ".join([id2word[e[0]] for e in bow])
    print "=========================================="

[(210, 1), (341, 1)]
[(0, 0.033333333391945053), (1, 0.033333333378355347), (2, 0.033336139662913149), (3, 0.033333333413611499), (4, 0.033333333386312537), (5, 0.03333333339206599), (6, 0.033333333398997071), (7, 0.69999719318800879), (8, 0.033333333393236152), (9, 0.03333333339455441)]
executive recruiter
[(52, 1), (246, 1)]
[(0, 0.033333333552362669), (1, 0.033333333501412522), (2, 0.033333333570311881), (3, 0.033333333604454451), (4, 0.033333333531448066), (5, 0.03333333354725098), (6, 0.03333333357274227), (7, 0.03333333362716949), (8, 0.69999999793059331), (9, 0.033333333562254402)]
attendant flight
[(313, 1)]
[(0, 0.050000000309972467), (1, 0.05000000023806226), (2, 0.05000000033526756), (3, 0.050000000403143931), (4, 0.05000000028029486), (5, 0.050000000302753617), (6, 0.050000000338663461), (7, 0.54999999715134984), (8, 0.050000000316779737), (9, 0.050000000323712324)]
attorney
[(242, 1), (13, 1)]
[(0, 0.033333333784314002), (1, 0.033333333679189947), (2, 0.033333333821120248)

In [120]:
#Alternative
#Cluster according to the International Standard Classification of Occupations
#https://en.wikipedia.org/wiki/International_Standard_Classification_of_Occupations
#http://www.ilo.org/public/english/bureau/stat/isco/isco08/index.htm

professions = pd.read_csv("professions.csv")
professions.head(3)


Unnamed: 0,ISCO 08 Code,Title EN
0,1,Managers
1,11,"Chief executives, senior officials and legisla..."
2,111,Legislators and senior officials


In [316]:
prof_sent = map(lambda r: r.split(" "), professions["Title EN"].tolist())
isco_code = professions["ISCO 08 Code"].tolist()
prof_list = professions["Title EN"].tolist()

def get_occupation(season_num):
    #Get names and professions
    nprof = get_profession(season_num)
    conts = nprof.keys()
    all_jobs = nprof.values()

    #Make profession names cleaner for processing
    all_jobs = map(lambda r: r.replace("&",""), all_jobs)
    all_jobs = map(lambda r: r.replace("/"," "), all_jobs)
    all_jobs = map(lambda r: r.replace("  "," "), all_jobs)

    #Replace words we a prior (or post-priori) know will give problems to cluster
    all_jobs = [u"assistant" if s=="paralegal" else s for s in all_jobs]
    all_jobs = [u"assistant" if s=="Paralegal" else s for s in all_jobs]
    all_jobs = [u"lawyer" if s=="Attorney" else s for s in all_jobs]
    all_jobs = [u"child care" if s=="Nanny" else s for s in all_jobs]
    all_jobs = [u"sales" if "merchant" in s else s for s in all_jobs]
    all_jobs = [u"dancer" if s=="Radio City Rockette" else s for s in all_jobs]
    all_jobs = [u"fashion model" if s=="Model" else s for s in all_jobs]
    all_jobs = [u"fashion model" if s=="Spokesmodel" else s for s in all_jobs]
    all_jobs = [u"Hairdressers" if s=="Hair stylist" else s for s in all_jobs]
    all_jobs = [u"lawyer" if s=="attorney" else s for s in all_jobs]
    all_jobs = [u"sports" if s=="acrobat" else s for s in all_jobs]
    all_jobs = [u"executive recruiter" if s=="IT recruiter" else s for s in all_jobs]
    all_jobs = [u"Administrative Assistant" if s=="Assistant" else s for s in all_jobs]
    all_jobs = [u"beautician" if s=="Esthetician" else s for s in all_jobs]
    all_jobs = [u"Songwriter singer" if s=="Singer-songwriter" else s for s in all_jobs]
    all_jobs = [u"Medical Assistant" if s=="Medical Technician" else s for s in all_jobs]
    all_jobs = [u"Advertising account manager" if s=="Advertising Executive" else s for s in all_jobs]
    all_jobs = [u"sports" if s=="WWE Diva-in-Training" else s for s in all_jobs]
    all_jobs = [u"doctor" if s=="physician" else s for s in all_jobs]
    all_jobs = [u"doctor" if s=="Physician" else s for s in all_jobs]
    all_jobs = [u"author" if s=="Blogger" else s for s in all_jobs]
    all_jobs = [u"chief executive" if s=="Internet entrepreneur" else s for s in all_jobs]
    all_jobs = [u"food service" if s=="VIP cocktail waitress" else s for s in all_jobs]
    all_jobs = [u"education" if ("student" in s) | ("Student" in s) else s for s in all_jobs]
    all_jobs = [u"Creative and performing artists" if "theatre" in s else s for s in all_jobs]



    all_nouns = []
    for sentence in all_jobs:
        stokens = nltk.word_tokenize(sentence)
        sent_noun = []
        for word, part_of_speech in nltk.pos_tag(stokens):
            if part_of_speech in ['NN', 'NNS', 'NNP', 'NNPS']:
                sent_noun.append(word)
        all_nouns.append(sent_noun)
    print len(all_nouns)
    print(len(all_jobs))
    
    prof_dict = {}
    all_codes = []
    all_names = []
    for iprof,sent in enumerate(all_nouns):
        for word in sent:
            for ii,profs in enumerate(prof_sent):
                wmatches = difflib.get_close_matches(word, profs, cutoff=.75)
                if wmatches:
                    #print "###########"
                    #print word
                    #print prof_list[ii]
                    #print isco_code[ii]
                    #print "###########"
                    all_names.append(prof_list[ii])
                    all_codes.append(isco_code[ii])
            
        #Get first value of codes
        fnum = map(lambda r: int(str(r)[0]), all_codes)
        counts = [fnum.count(q) for q in range(9)] #Count which code is most
        job_type = np.where(counts==np.max(counts))[0][0] + 1
        if np.sum(counts) == 0:
            job_type=999
            
        #Append jobs type
        prof_dict.update({conts[iprof] : job_type})
        all_codes=[]
        all_names=[]
        
    return prof_dict


In [308]:
"Student" in "phd Student"

True

In [270]:
words = ["Labor" ,"and" ,"delivery" ,"nurse"]
prfs = ["Pharmacists"]
dog=difflib.get_close_matches(words[0], prfs, cutoff=.9)

if dog:
    print dog

In [317]:
get_occupation(15)

30
30


{u'Alli Travis': 6,
 u'Ashley Hebert': 3,
 u'Ashley Spivey': 6,
 u'Britnee': 4,
 u'Britt Billmaier': 8,
 u"Chantal O'Brien": 4,
 u'Cristy Caserta': 3,
 u'Emily Maynard': 2,
 u'Jackie Gordon': 3,
 u'Jessica "J"': 2,
 u'Jill Ruskowski': 6,
 u'Keltie Colleen': 3,
 u'Kimberly Coon': 2,
 u'Lacey Garbelman': 2,
 u'Lauren Moore': 3,
 u'Lindsay Hill': 3,
 u'Lisa Morrisey': 2,
 u'Lisa P.': 6,
 u'Madison Garton': 6,
 u'Marissa May': 4,
 u'Meghan Merritt': 6,
 u'Melissa Schreiber': 6,
 u'Michelle Money': 6,
 u'Raichel Goodyear': 3,
 u'Rebecca': 999,
 u'Renee Halpin': 6,
 u'Sarah Ledtke': 999,
 u'Sarah Powell': 4,
 u'Shawntel Newton': 2,
 u'Stacey Quirpel': 6}

In [313]:
get_profession(15)

{u'Alli Travis': u'Apparel merchant',
 u'Ashley Hebert': u'Dental Student',
 u'Ashley Spivey': u'Nanny',
 u'Britnee': u'Paralegal',
 u'Britt Billmaier': u'Food writer',
 u"Chantal O'Brien": u'Executive Assistant',
 u'Cristy Caserta': u'Attorney',
 u'Emily Maynard': u"Children's Hospital Event Planner",
 u'Jackie Gordon': u'Artist',
 u'Jessica "J"': u'Operations Manager',
 u'Jill Ruskowski': u'Sales director',
 u'Keltie Colleen': u'Radio City Rockette',
 u'Kimberly Coon': u'Marketing Coordinator',
 u'Lacey Garbelman': u'Insurance agent',
 u'Lauren Moore': u'High School Teacher',
 u'Lindsay Hill': u'First grade teacher',
 u'Lisa Morrisey': u'Marketing Coordinator',
 u'Lisa P.': u'Sales Consultant',
 u'Madison Garton': u'Model',
 u'Marissa May': u'Sports Publicist',
 u'Meghan Merritt': u'Fashion Marketer',
 u'Melissa Schreiber': u'Waitress',
 u'Michelle Money': u'Hair stylist',
 u'Raichel Goodyear': u'Manscaper',
 u'Rebecca': u'Esthetician',
 u'Renee Halpin': u'Nanny',
 u'Sarah Ledtke': u