In [1]:
import numpy as np
import pandas as pd

from datetime import datetime as dt
import random
import re

from tqdm import tqdm

from itertools import chain
from collections import Counter

In [2]:
# Load the dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
data = pd.read_pickle("../../Data/app_opp.pickle")
# Preview four random rows; no random_state is given, so the rows shown change on every run.
data.sample(4)

Unnamed: 0,ApplicationId,OpportunityId,Title,ExternalDescription,WhatYouDid,dateAppliedAt,JobTitle,LicenseAndCertification,skill,major,degree
48797,vIMeeUaRMkSMFXFG3tmWiA==,yWgqsBogkEOuTYIsvNdlJA==,"ACNO Laredo Medical Center - Laredo, TX",<p><em>Laredo Medical Center is Laredo&rsquo;s...,\nUHS South Texas Health System McAllen Medica...,2022-10-05 07:20:01.253,Director Emergency Department,NRP,Pharmacy,Nursing,Post Graduate Studies Cardiovascular
31917,V6zWhoVMx02P9HqZkb2Q1Q==,CHEsCAwyeEmSxpNZy4Qf0A==,Teller,<table>\n<tbody>\n<tr>\n<td>\n<p><strong>Telle...,"- Present, promote and sell products/services ...",2019-08-03 07:03:55.250,Sales representative,IGETC,2nd language,International Business,Bachelors
63978,TTegDaiHQEeJBdrIOFuzIQ==,vepl7eCMr0mzIomfOnUkZQ==,Development Coach,<p><strong>Department: Player Development</str...,• Responsible for on and off-site consulting s...,2021-11-10 10:49:10.430,Consultant,FACHE,Cooking,,Certificate in Lean Healthcare Management Prin...
5938,vIS6V+bJqUCqqMZl9jHB3g==,sSrD6wXvKUSVhXKn3nHO3A==,Contract Administrator,<p><strong>Job Summary</strong></p>\n<p>BioLeg...,Assisted in drafting and negotiating of commer...,2019-05-29 01:32:28.292,Paralegal,Paralegal Certificate,Vendor agreements,Law,Certificate


In [3]:
# Column dtypes, non-null counts, and memory footprint of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   ApplicationId            70000 non-null  object        
 1   OpportunityId            70000 non-null  object        
 2   Title                    70000 non-null  object        
 3   ExternalDescription      70000 non-null  object        
 4   WhatYouDid               65755 non-null  object        
 5   dateAppliedAt            70000 non-null  datetime64[ns]
 6   JobTitle                 70000 non-null  object        
 7   LicenseAndCertification  70000 non-null  object        
 8   skill                    70000 non-null  object        
 9   major                    43059 non-null  object        
 10  degree                   70000 non-null  object        
dtypes: datetime64[ns](1), object(10)
memory usage: 5.9+ MB


---

<b>Fields related to opportunities:</b>
- OpportunityId
- Title
- ExternalDescription

<b>Fields related to applications:</b>
- ApplicationId
- dateAppliedAt
- JobTitle
- WhatYouDid (free-form text written by applicants that summarizes the work they did; they can fill it in however they like)
- LicenseAndCertification
- major
- degree
- skill

In [4]:
# Number of distinct non-null values in each column.
data.nunique()

ApplicationId               175
OpportunityId               132
Title                       124
ExternalDescription         129
WhatYouDid                  661
dateAppliedAt               175
JobTitle                    612
LicenseAndCertification     247
skill                      1126
major                        68
degree                      195
dtype: int64

In [5]:
# Confirming data structure: count the distinct applications within each
# opportunity, then add those per-opportunity counts together.
data.groupby('OpportunityId')['ApplicationId'].nunique().sum()

175

We're looking at 175 applications to 132 postings. A few postings share a Title with other postings, and even fewer share an identical description (reposts).

In [6]:
# Distinct-value counts per opportunity, with the postings that drew the most
# applications listed first.
(
    data
    .groupby('OpportunityId', as_index=False)
    .nunique()
    .sort_values(by='ApplicationId', ascending=False)
)

Unnamed: 0,OpportunityId,ApplicationId,Title,ExternalDescription,WhatYouDid,dateAppliedAt,JobTitle,LicenseAndCertification,skill,major,degree
78,dmlXNFI0MEOlW+qh07E4iQ==,17,1,1,86,17,88,30,141,16,39
123,wPH/PTSFV0SBmSzZlWnftA==,4,1,1,16,4,15,7,43,2,7
5,4xZy2W9hJkG3QJH7L/yLxw==,4,1,1,3,4,5,4,22,3,8
77,dFx5AVFlUUqQ5MW3Z0oQwg==,3,1,1,11,3,11,6,76,3,8
75,cn146CawdUKr/7Ha4/tMpA==,3,1,1,10,3,12,4,56,2,8
...,...,...,...,...,...,...,...,...,...,...,...
41,KURKW3m+Ik2y6BNjhBa2uA==,1,1,1,9,1,8,2,10,0,2
40,KHpHZBedt0m29dJZKnQd3Q==,1,1,1,1,1,3,1,2,1,1
39,JV9c2CbxEEiFZEXr07shmQ==,1,1,1,1,1,1,1,1,2,2
38,ItT0hWCka0GkeRDCHSOeEg==,1,1,1,7,1,5,2,26,0,1


The most-applied-to job received 17 applications.

In [7]:
# Distinct-value counts per (application, timestamp) pair; the five
# applications listing the most distinct job titles come first.
(
    data
    .groupby(['ApplicationId', 'dateAppliedAt'], as_index=False)
    .nunique()
    .sort_values(by='JobTitle', ascending=False)
    .head()
)

Unnamed: 0,ApplicationId,dateAppliedAt,OpportunityId,Title,ExternalDescription,WhatYouDid,JobTitle,LicenseAndCertification,skill,major,degree
97,eymRGI00kEu533ZedmHMuA==,2020-08-28 19:26:46.875,1,1,1,11,17,1,17,2,2
100,gioI79njOEmfQX+HgFHGWQ==,2019-04-09 08:57:38.679,1,1,1,12,14,1,29,2,2
94,e/TH2fLIEUuNnbV4AkcAvA==,2023-01-03 22:57:21.829,1,1,1,10,10,2,50,0,4
25,9f1jcTY//UCBvE1JT+5u3g==,2020-09-07 20:34:39.781,1,1,1,9,10,1,5,3,4
99,gh2rm3WZY0mkjpsH+z9rsQ==,2020-09-20 21:37:49.219,1,1,1,10,10,3,4,2,3


Each application went to only one job (as expected).

In [8]:
# Sample application: every row that belongs to one ApplicationId.
data[data['ApplicationId'].eq('9d0Ik6f0aUKmo6GF1lAqGw==')]

Unnamed: 0,ApplicationId,OpportunityId,Title,ExternalDescription,WhatYouDid,dateAppliedAt,JobTitle,LicenseAndCertification,skill,major,degree
45434,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,IV push certification,Nursing,LPN Certificate
45435,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,IV insertion,Nursing,LPN Certificate
45436,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,Wound Care,Nursing,LPN Certificate
45437,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,Tube feeding,Nursing,LPN Certificate
45438,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,administering meds,Nursing,LPN Certificate
45439,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,administration of meds in GI tubes,Nursing,LPN Certificate
45440,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,Urinary Catheter insurtion,Nursing,LPN Certificate
45441,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,catheter care,Nursing,LPN Certificate
45442,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,trach care,Nursing,LPN Certificate
45443,9d0Ik6f0aUKmo6GF1lAqGw==,cOibKSS1H0GnB+tTy2Sc+w==,LPN - 4 South - FT - 7:00 PM-7:00 AM,<p>A Licensed Practical Nurse is responsible f...,I started off as a Nurse Intern for 10 months ...,2019-03-29 00:16:50.357,LPN,L-AL-MVR Drivers License,colostomy/ostomy care,Nursing,LPN Certificate


The application data a candidate submits is vast, so it is distributed across many rows. Individual skills, degrees, and experiences all have their own lines. This information can be aggregated to give an overall understanding of the candidate.

In [9]:
# Looking at a randomly selected applicant's experience data.
# WhatYouDid contains missing values (see the data.info() output), so draw only
# from the non-null entries; otherwise the cell can print `nan` instead of text.
experience_texts = data.WhatYouDid.dropna()

# Positional draw via .iloc, so the lookup does not depend on the index labels.
random_index = random.randrange(len(experience_texts))
print(experience_texts.iloc[random_index])

Responsible for assisting the district curriculum and instruction department with the development,
implementation, monitoring, and evaluation of science curriculum for all high schools. Provided
campus support for administrators and teachers in generating reports and the disaggregation of student performance data for intervention and compliance with the district's accountability
performance goals. Performed other duties as assigned by the district including monitoring of state testing and intake of testing materials for verification.


### Notes: 
- ApplicationId and dateAppliedAt have the same number of unique values. That means every application was made at a different time, which is what we'd expect.
- There are multiple rows dedicated to each application as every skill, degree, certification, license and WhatYouDid gets its own row.
- Many applicants have held multiple roles (some of which had non-unique titles); others had no experience.
- Applicant info is duplicated heavily since the information is in row format. It will be aggregated on the application level.

In [10]:
# (rows, columns) of the subset whose WhatYouDid entry is missing.
data[data['WhatYouDid'].isna()].shape

(4245, 11)

In [11]:
# Distinct-value counts per application; sorting on OpportunityId would put any
# application linked to more than one posting at the top.
(
    data
    .groupby('ApplicationId', as_index=False)
    .nunique()
    .sort_values(by='OpportunityId', ascending=False)
)

Unnamed: 0,ApplicationId,OpportunityId,Title,ExternalDescription,WhatYouDid,dateAppliedAt,JobTitle,LicenseAndCertification,skill,major,degree
0,/eT9zghWr068l9Ba2k/TOg==,1,1,1,6,1,5,1,21,1,5
120,rY3Gn7dBwUOGUdZJmScz9Q==,1,1,1,7,1,5,1,17,3,5
112,m7wZYtTJN02bk3ARH/855Q==,1,1,1,3,1,3,1,14,1,1
113,m84E20F3j0Oh0PGgYUn9+w==,1,1,1,0,1,6,1,13,1,1
114,m8o0XcBAZU2mFt10Z7YfLA==,1,1,1,0,1,7,1,45,1,1
...,...,...,...,...,...,...,...,...,...,...,...
60,TSxV4DAUnUSu0qj0/LpbmQ==,1,1,1,4,1,4,1,12,1,2
61,TTEefLmJ4kKGMva97tJ8kA==,1,1,1,5,1,4,1,18,0,1
62,TTIor24kGUe3DjHpu+rSgA==,1,1,1,4,1,4,1,16,0,1
63,TTOloc6EBEGicB6K+6y6tQ==,1,1,1,1,1,1,1,1,0,1


In [12]:
# Same per-opportunity aggregation as cell In [6], shown again next to the
# per-application view: distinct-value counts per posting, most applications first.
(
    data
    .groupby('OpportunityId', as_index=False)
    .nunique()
    .sort_values(by='ApplicationId', ascending=False)
)

Unnamed: 0,OpportunityId,ApplicationId,Title,ExternalDescription,WhatYouDid,dateAppliedAt,JobTitle,LicenseAndCertification,skill,major,degree
78,dmlXNFI0MEOlW+qh07E4iQ==,17,1,1,86,17,88,30,141,16,39
123,wPH/PTSFV0SBmSzZlWnftA==,4,1,1,16,4,15,7,43,2,7
5,4xZy2W9hJkG3QJH7L/yLxw==,4,1,1,3,4,5,4,22,3,8
77,dFx5AVFlUUqQ5MW3Z0oQwg==,3,1,1,11,3,11,6,76,3,8
75,cn146CawdUKr/7Ha4/tMpA==,3,1,1,10,3,12,4,56,2,8
...,...,...,...,...,...,...,...,...,...,...,...
41,KURKW3m+Ik2y6BNjhBa2uA==,1,1,1,9,1,8,2,10,0,2
40,KHpHZBedt0m29dJZKnQd3Q==,1,1,1,1,1,3,1,2,1,1
39,JV9c2CbxEEiFZEXr07shmQ==,1,1,1,1,1,1,1,1,2,2
38,ItT0hWCka0GkeRDCHSOeEg==,1,1,1,7,1,5,2,26,0,1


### Above we see that the important figure is the number of unique ApplicationIds per opportunity; the remaining columns are either attributes of the opportunity or counts aggregated across its applicants

In [13]:
# Columns that describe the applicant / application side of each row.
applicant_columns = [
    'ApplicationId',
    'dateAppliedAt',
    'JobTitle',
    'WhatYouDid',
    'LicenseAndCertification',
    'major',
    'degree',
    'skill',
]

# Columns that describe the job posting (opportunity) side of each row.
opportunity_columns = [
    'OpportunityId',
    'Title',
    'ExternalDescription',
]

In [14]:
# Group on all opportunity fields at once and show the distinct-value counts of
# the remaining columns for two randomly chosen postings.
(
    data
    .groupby(opportunity_columns)
    .nunique()
    .sample(2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ApplicationId,WhatYouDid,dateAppliedAt,JobTitle,LicenseAndCertification,skill,major,degree
OpportunityId,Title,ExternalDescription,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
gOWTKiYjt0yTMPKvUM30lw==,Store Associate,"<p>Our store employees are the face of the ALDI shopping experience. Their hard work makes it possible to uphold our company philosophy, providing quality products at the best possible price. Their smiles and pleasant demeanors keep customers coming back time and time again. Our store employees work many roles - from store associate to cashier to stocker - while providing excellent customer service. As a store employee, you're also responsible for merchandising product, monitoring inventory and keeping the store looking its best. It's an opportunity to get more out of your career and grow in an exciting environment. <br /><br />Position Type: Part-Time <br />Starting Wage: $12.70/hour <br /><br />Duties and Responsibilities: <br />Must be able to perform duties with or without reasonable accommodation. <br /><br />&bull; Assists management in achieving store appearance and maintenance standards. <br />&bull; Identifies and rectifies hazards and/or equipment in need of maintenance to provide a safe environment. <br />&bull; Assists management in achieving payroll and total loss goals. <br />&bull; Assists customers with problems or concerns, and contacts management as appropriate regarding customers with problems or concerns. <br />&bull; Provides feedback to management on all products, inventory losses, scanning errors, and general issues that could impact productivity. <br />&bull; Processes customer purchases, performs general cleaning duties, and stocks shelves and displays neatly to maximize visibility and sales. <br />&bull; Participates in taking store inventory counts according to guidelines. <br />&bull; Complies with all established company policies and procedures. <br />&bull; Collaborates with team members and communicates relevant information to direct leader. <br />&bull; Upholds the security and confidentiality of documents and data within area of responsibility. <br />&bull; Other duties as assigned. 
<br /><br />Education and Experience: <br /><br />&bull; High School Diploma or equivalent preferred. <br />&bull; Prior work experience in a retail environment preferred. <br />&bull; A combination of education and experience providing equivalent knowledge. <br /><br />Job Qualifications: <br />Knowledge/Skills/Abilities <br /><br />&bull; Provides prompt and courteous customer service. <br />&bull; Ability to operate a cash register efficiently and accurately. <br />&bull; Ability to safely and properly operate equipment, including electric/manual hand jack, floor scrubber, and cardboard baler. <br />&bull; Ability to perform general cleaning duties to company standards. <br />&bull; Ability to interpret and apply company policies and procedures. <br />&bull; Excellent verbal and written communication skills. <br />&bull; Gives attention to detail and follows instructions. <br />&bull; Ability to work both independently and within a team environment. <br />&bull; Ability to stay organized and multi-task in a professional and efficient manner. <br />&bull; Meets any state and local requirements for handling and selling alcoholic beverages. <br /><br />Physical Demands: <br /><br />&bull; Ability to stock merchandise from store receiving to shelving. <br />&bull; Ability to place product, weighing up to 45 pounds, on shelving at various heights. <br />&bull; Regularly required to sit, stand, bend, reach, push, pull, lift, carry, and walk about the store.</p>",1,3,1,3,1,4,0,2
vepl7eCMr0mzIomfOnUkZQ==,Development Coach,"<p><strong>Department: Player Development</strong></p>\n<p><strong>Supervisors: Fundamentals Coordinator and GCL Team Manager</strong></p>\n<p><strong>Classification: Full-Time</strong></p>\n<p>&nbsp;</p>\n<p><strong>&nbsp;</strong></p>\n<p><strong>Summary</strong><strong>:</strong></p>\n<p>The Houston Astros are currently seeking a Development Coach in the Player Development Department. Serving as a uniformed coach at our Florida Complex in West Palm Beach, FL, the Development Coach will aid in the implementation of player development initiatives while handling the conventional responsibilities of a 4th coach.</p>\n<p>&nbsp;</p>\n<p><strong>Essential Duties / Responsibilities</strong><strong>:</strong></p>\n<ul>\n<li>Collaborate with field staff, rovers and front office to implement and help improve player development initiatives.</li>\n<li>Assist with the evaluation and coaching of fundamentals.</li>\n<li>Assist the field staff with conventional responsibilities of a 4th coach, such as throwing batting practice, hitting fungoes, and coaching first base.</li>\n<li>Assist the staff and players with the implementation of systems and technologies.</li>\n<li>Assist in the application of analytic tools provided by the front office.</li>\n<li>Perform other duties as assigned.</li>\n</ul>\n<p>&nbsp;</p>\n<p><strong>Requirements / Qualifications</strong><strong>:</strong></p>\n<ul>\n<li>Interest and understanding of player development and analytics.</li>\n<li>Ability to understand and advocate for changes due to new information and or tools in baseball.</li>\n<li>Ability to work and communicate well with people in a wide variety of roles including players, coaches and front office.</li>\n<li>Strong computer skills and proficient in Microsoft Office.</li>\n<li>Professional or collegiate playing experience is a plus.</li>\n<li>Fluency in Spanish is a plus.</li>\n<li>Strength &amp; Conditioning education and experience is a 
plus.</li>\n<li>Technical degree is a plus.</li>\n</ul>\n<p>&nbsp;</p>\n<p><strong>Work Environment</strong></p>\n<p>This job operates both outside in a baseball field and office setting. This role will involve being in the heat and on the individual&rsquo;s feet for long periods of time. At times the noise could become excessive.&nbsp;</p>\n<p>&nbsp;</p>\n<p><strong>Physical Demands</strong></p>\n<p>The physical demands described here are representative of those that must be met by an employee to successfully perform the essential functions of this job. This role routinely will be in an inside or outside type of environment for long periods of time, of which includes bending, throwing, and swinging a bat. &nbsp;This role will require the individual to be physically moving for long periods of time. This role will require the individual to lift and carry up to 25 pounds throughout the day. The individual must be able to visual acuity to distinguish pitches and facilitate instruction.</p>\n<p>&nbsp;</p>\n<p><strong>Position Type and Expected Hours of Work</strong></p>\n<p>Ability to work a flexible schedule, including; extended hours, evenings, weekends, and holidays.</p>\n<p>&nbsp;</p>\n<p><strong>Travel</strong></p>\n<p>Often travel to our minor league affiliates and other locations as required, to include, but not limited Round Rock, TX; Corpus Christi, TX; Fayetteville, NC; Quad Cities, IA; West Palm Beach, FL; and Troy, NY.</p>\n<p><strong>&nbsp;</strong></p>\n<p><strong>Other Duties</strong></p>\n<p>Please note this job description is not designed to cover or contain a comprehensive listing of activities, duties or responsibilities that are required of the employee for this job. 
Duties, responsibilities and activities may change at any time with or without notice.</p>\n<p>&nbsp;</p>\n<p>&nbsp;</p>\n<p><em>We are an equal opportunity employer and all qualified applicants will receive consideration for employment without regard to race, color, religion, sex, national origin, disability status, protected veteran status, or any other characteristic protected by law.</em></p>\n<p><em>EOE/M/F/Vet/Disability</em></p>",1,10,1,9,1,38,1,5


In [18]:
# Build one value-count table per categorical applicant column
# (JobTitle, LicenseAndCertification, skill, major, degree), keep the tables in
# data_dict keyed by column name, and print each table in that order.
value_count_columns = ['JobTitle', 'LicenseAndCertification', 'skill', 'major', 'degree']
data_dict = {
    name: data[name].value_counts().to_frame()
    for name in value_count_columns
}
for frequency_table in data_dict.values():
    print(frequency_table)

                                           count
JobTitle                                        
Paralegal                                   2700
Student Nurse                               1584
Science Teacher/Coach                       1530
Registered Nurse                            1246
Science Teacher/Head Boys Basketball        1020
...                                          ...
Wrapper                                        1
Finance Counselor/ Admissions Rep              1
Pharmacy Technician/ Charge Reconciliator      1
Machine operator                               1
Deboner                                        1

[612 rows x 1 columns]
                                                    count
LicenseAndCertification                                  
Registered Nurse                                     2979
RN                                                   2292
FACHE                                                2090
Nursing Professional Development                 