# Cleaning the text across the applications

In [16]:
import pandas as pd
import numpy as np


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
from tqdm import tqdm
from bs4 import BeautifulSoup

# Ignore ipykernel warning
warnings.filterwarnings('ignore', category=DeprecationWarning, module='ipykernel')

In [17]:
applications = pd.read_pickle('../../Data/consolidated_applications.pkl')

In [18]:
# # Enriched data provided through collaborator

# normalized_title_addition = pd.read_pickle('../Data/app_opp_normalized.pkl')[['ApplicationId', 'NormalizedTitle', 'Title_Normalized_and_skills']]
# normalized_title_addition.info()

# # Enrich the data using the manually normalized titles
# # applications = applications.merge(normalized_title_addition, on='ApplicationId', how='left')

In [19]:
applications.columns = ['application_id', 'job_id', 'job_title', 'job_description',
                        'applicant_background', 'applicant_titles', 'applicant_skills',
                        'applicant_major', 'applicant_licenses', 'applicant_degrees',
                        #'normalized_job_title', 'job_title_skill_extraction'
                        ]

# Remove duplicates for job_descriptions
original_descriptions = applications.job_description.unique()

In [20]:
#These are all lists of strings
applicant_cols = ['applicant_background', 'applicant_titles',
                  'applicant_skills', 'applicant_major',
                  'applicant_licenses', 'applicant_degrees']

applications['full_app'] = applications[applicant_cols].astype(str).agg('- '.join, axis=1)

applications['full_app_tokenized'] = applications['full_app'].apply(nltk.word_tokenize)

applications[applicant_cols]

Unnamed: 0,applicant_background,applicant_titles,applicant_skills,applicant_major,applicant_licenses,applicant_degrees
0,[Haven Home Health\nJan 2019 – present\nRespon...,"[RN , RN]","[Adjustments, Oasis, Cpr, Cpr/, Cpr/bls, Rn, R...","[Nursing, None]",[Rn],"[Bachelor's Degree Anticipated 2020, Associate..."
1,[Worked as frontend developer\nTitle: Ship Rec...,[Front end Developer],"[Front end developer, Reactjs, C++, Javascript...","[Computer Science, Secondary Education]",[python web development],"[M.C.A, B.C.A, HSC, SSLC]"
2,[• Coordinated Reconstructive Plastics and ENT...,"[RN Service Line Coordinator, RN Circulator, R...","[Certified nursing assistant, Nursing assistan...",[Nursing],[registered nurse],[Associates]
3,"[Oklahoma City, OK • 02/2019 - 2/2020\n•\t\tPh...","[Medical Assistant, Cashier]","[Phlebotomist, Phlebotomy, Scheduling, Stockin...",[None],[Nationally Registered Certified Medical Assis...,[Associates]
4,"[PerfectLaw® is Legal case management, time, b...",[Customer Engineer/ Analyst],"[Business Intelligence, Sql, Billing, Customer...",[Computer Information Sys.],[Microsoft Certified Prof.\Querying Microsoft ...,[Masters of Science in Management Information ...
...,...,...,...,...,...,...
170,"[None, Taught 9th-12th grade, Gifted/Talented,...","[English, Reading, Media Arts Instructor, Gift...","[Instructor, Social services, Aerospace, Aeros...","[English, Communication]","[Texas Secondary English- Lifetime, Secondary ...","[M.A., Bachelors]"
171,[• Handled in-store returns\n• Received an...,[Sales Associate],"[Sales associate, Inventory, Retail sales, Cas...",[None],"[n/a, I am not certified in anything.]",[High school diploma]
172,[Drove to various stores and gas stations in s...,"[Delivery Driver, Front Desk Clerk, Bagger]","[Friendly , Courteous, Responsible, People Ski...",[None],[CDL B],"[Diploma, none]"
173,[Oversee and provide clinical leadership and c...,"[Mental Health Coordinator, Mental Health Spec...","[Mental health, Cism, Therapy]","[Rehabilitation Counseling, Psychology]",[LPC - MH Service Provider],[Master of Arts in Rehabilitation Counseling w...


In [21]:
# We want to see which columns have have any incompatable data types

for col in applicant_cols:
    print("Column:", col)
    dtypes_of_items = set()  # Use a set to store unique data types
    for cell in applications[col]:
        for item in cell:
            if item is not None:
                dtypes_of_items.add(type(item))
            else:
                dtypes_of_items.add(type(None))
    print("Unique data types:", dtypes_of_items)

Column: applicant_background
Unique data types: {<class 'NoneType'>, <class 'str'>}
Column: applicant_titles
Unique data types: {<class 'str'>}
Column: applicant_skills
Unique data types: {<class 'str'>}
Column: applicant_major
Unique data types: {<class 'NoneType'>, <class 'str'>}
Column: applicant_licenses
Unique data types: {<class 'str'>}
Column: applicant_degrees
Unique data types: {<class 'str'>}


When cells have NoneType values in their lists, we have pre-processing errors. This will be remedied by changing it to a string saying 'Nothing'
- we do this because an empty list could cause errors

In [22]:
# Now let's replace None values in the specified columns
for col in ["applicant_background", "applicant_major"]:
    applications[col] = applications[col].apply(lambda cell: ['Nothing Here' if item is None else item for item in cell])
    applications[col] = applications[col].apply(lambda cell: [item for i, item in enumerate(cell) if i == 0 or item != 'Nothing Here'])
applications

Unnamed: 0,application_id,job_id,job_title,job_description,applicant_background,applicant_titles,applicant_skills,applicant_major,applicant_licenses,applicant_degrees,full_app,full_app_tokenized
0,/fxoH1a24kqd47O40at4Aw==,2tsc9mT5TUGegYdbZritnw==,RN,"<p><em><strong>Up to $10,000 Sign On Bonus for...",[Haven Home Health\nJan 2019 – present\nRespon...,"[RN , RN]","[Adjustments, Oasis, Cpr, Cpr/, Cpr/bls, Rn, R...",[Nursing],[Rn],"[Bachelor's Degree Anticipated 2020, Associate...",['Haven Home Health\nJan 2019 – present\nRespo...,"[[, 'Haven, Home, Health\nJan, 2019, –, presen..."
1,3fYHWWMEFkW66JG5EfbqGA==,4xZy2W9hJkG3QJH7L/yLxw==,Associate Software Engineer,<p>We are looking for Graduate hire from colle...,[Worked as frontend developer\nTitle: Ship Rec...,[Front end Developer],"[Front end developer, Reactjs, C++, Javascript...","[Computer Science, Secondary Education]",[python web development],"[M.C.A, B.C.A, HSC, SSLC]",['Worked as frontend developer\nTitle: Ship Re...,"[[, 'Worked, as, frontend, developer\nTitle, :..."
2,3jSDFoBVv0irmCyvjgNu8Q==,kKB8TZgICEmQD+v9Euh85Q==,RN-OR,"<p style=""text-align: center; color: #222222;""...",[• Coordinated Reconstructive Plastics and ENT...,"[RN Service Line Coordinator, RN Circulator, R...","[Certified nursing assistant, Nursing assistan...",[Nursing],[registered nurse],[Associates],['• Coordinated Reconstructive Plastics and EN...,"[[, '•, Coordinated, Reconstructive, Plastics,..."
3,tA6YXkZOx0iI/GhHIzy3Dw==,BksQ3ElZw0C3GWRipnCUng==,Certified Medical Assistant,"<p><span style=""text-decoration: underline;""><...","[Oklahoma City, OK • 02/2019 - 2/2020\n•\t\tPh...","[Medical Assistant, Cashier]","[Phlebotomist, Phlebotomy, Scheduling, Stockin...",[Nothing Here],[Nationally Registered Certified Medical Assis...,[Associates],"['Oklahoma City, OK • 02/2019 - 2/2020\n•\t\tP...","[[, 'Oklahoma, City, ,, OK, •, 02/2019, -, 2/2..."
4,tAHiIMFzoE6wz/cPJXkTLg==,dFx5AVFlUUqQ5MW3Z0oQwg==,BUSINESS PROCESS ANALYST,<p>Ultimate Software is looking for two Busine...,"[PerfectLaw® is Legal case management, time, b...",[Customer Engineer/ Analyst],"[Business Intelligence, Sql, Billing, Customer...",[Computer Information Sys.],[Microsoft Certified Prof.\Querying Microsoft ...,[Masters of Science in Management Information ...,"[""PerfectLaw® is Legal case management, time, ...","[[, ``, PerfectLaw®, is, Legal, case, manageme..."
...,...,...,...,...,...,...,...,...,...,...,...,...
170,EyCxJkZEVEuItmFl3Odxig==,dmlXNFI0MEOlW+qh07E4iQ==,"Job Fair, Virtual Teachers",<p><span>Edmentum is committed to making it ea...,"[Nothing Here, Taught 9th-12th grade, Gifted/T...","[English, Reading, Media Arts Instructor, Gift...","[Instructor, Social services, Aerospace, Aeros...","[English, Communication]","[Texas Secondary English- Lifetime, Secondary ...","[M.A., Bachelors]","[None\n 'Taught 9th-12th grade, Gifted/Talente...","[[, None, 'Taught, 9th-12th, grade, ,, Gifted/..."
171,XLLuux6JfUO7yhd2fYaccQ==,wvaccu8cwUibyg4hpMdcsw==,Part-Time Store Associate,<p>As a Store Associate you&rsquo;ll be respon...,[• Handled in-store returns\n• Received an...,[Sales Associate],"[Sales associate, Inventory, Retail sales, Cas...",[Nothing Here],"[n/a, I am not certified in anything.]",[High school diploma],['• Handled in-store returns\n• Received a...,"[[, '•, Handled, in-store, returns\n•, Receive..."
172,KbyabDyErUqXDSpuv7yOmw==,gOWTKiYjt0yTMPKvUM30lw==,Store Associate,<p>Our store employees are the face of the ALD...,[Drove to various stores and gas stations in s...,"[Delivery Driver, Front Desk Clerk, Bagger]","[Friendly , Courteous, Responsible, People Ski...",[Nothing Here],[CDL B],"[Diploma, none]",['Drove to various stores and gas stations in ...,"[[, 'Drove, to, various, stores, and, gas, sta..."
173,PShnnuv+dUW3J1XjUD11IA==,L/Ki23KWHkq5yzeU7ToOjQ==,"Mental Health Specialist, Correctional Services",<p><strong>MHC: Where We Face Challenges<span>...,[Oversee and provide clinical leadership and c...,"[Mental Health Coordinator, Mental Health Spec...","[Mental health, Cism, Therapy]","[Rehabilitation Counseling, Psychology]",[LPC - MH Service Provider],[Master of Arts in Rehabilitation Counseling w...,"[""Oversee and provide clinical leadership and ...","[[, ``, Oversee, and, provide, clinical, leade..."


## Preprocessing the data

In [23]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def strip_html_tags(text):
    html_pattern = re.compile('<.*?>')
    text = re.sub(html_pattern, '', text)
    return text

def preprocess_text(text):
    # Strip HTML tags and convert to lowercase
    text = strip_html_tags(text).lower()
    
    # Remove unwanted strings using compiled regex pattern
    # This changed
    unwanted_strings = ['sign on bonus', 'sign bonus',
                         'full time', 'ft', 'part time', 'pt',
                        'day shift', 'night shift', 'second shift',
                         'third shift', 'first shift', 'none', 'nbsp', 'amp']
    unwanted_pattern = re.compile('|'.join(map(re.escape, unwanted_strings)))
    text = re.sub(unwanted_pattern, '', text)

    # Tokenize the text and remove stopwords while lemmatizing
    # this changed
    tokenizer = RegexpTokenizer(r'\b\w+\b')
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text) if token not in stop_words]

    # Join the tokens back into a single string
    text = ' '.join(tokens)

    return text

# Apply pre-processing to the columns that need it
preprocessing_columns = ['job_title', 'job_description', 'applicant_background',
                         'applicant_titles', 'applicant_skills', 'applicant_major',
                         'applicant_licenses', 'applicant_degrees']

for column in preprocessing_columns:
    print(f"working on {column}")
    applications[column] = applications[column].astype('string').apply(preprocess_text)

applications = applications.astype(str)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danmarino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danmarino/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


working on job_title
working on job_description
working on applicant_background
working on applicant_titles
working on applicant_skills
working on applicant_major
working on applicant_licenses
working on applicant_degrees


In [24]:
# Re-run the test code to see if the data types have changed

for col in applicant_cols:
    print("Column:", col)
    dtypes_of_items = set()  # Use a set to store unique data types
    for cell in applications[col]:
        for item in cell:
            if item is not None:
                dtypes_of_items.add(type(item))
            else:
                dtypes_of_items.add(type(None))
    print("Unique data types:", dtypes_of_items)

Column: applicant_background
Unique data types: {<class 'str'>}
Column: applicant_titles
Unique data types: {<class 'str'>}
Column: applicant_skills
Unique data types: {<class 'str'>}
Column: applicant_major
Unique data types: {<class 'str'>}
Column: applicant_licenses
Unique data types: {<class 'str'>}
Column: applicant_degrees
Unique data types: {<class 'str'>}


### We want zero NoneType

In [25]:
applications.to_pickle('../../Data/preprocessed_applications.pkl')