# Cleaning the text across the applications

In [13]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings

# Ignore ipykernel warning
warnings.filterwarnings('ignore', category=DeprecationWarning, module='ipykernel')

In [14]:
# Load the applications table from the parquet file produced upstream
# (presumably the cleaned fourth data split, judging by the file name).
applications = pd.read_parquet(path='../../Data/split_4_cleaned.parquet')

In [15]:
# Display the column names as loaded, before they are renamed below.
applications.columns

Index(['OpportunityId', 'ApplicationId', 'ExternalBriefDescription',
       'ExternalDescription', 'Title', 'pass_first_step', 'Step_Category',
       'Applicant_Job_Titles', 'Applicant_Job_Responsibilities',
       'Applicant_Education', 'Applicant_Reported_Skills'],
      dtype='object')

In [16]:
# Rename columns by name rather than by position: a positional assignment to
# `applications.columns` silently mislabels data if the source column order
# changes, and fails with a length mismatch if this cell is re-run after
# later cells have added columns. Names that are already renamed are simply
# left untouched, so re-running this cell is safe.
column_renames = {
    'OpportunityId': 'opportunity_id',
    'ApplicationId': 'application_id',
    'ExternalBriefDescription': 'opportunity_brief_description',
    'ExternalDescription': 'opportunity_description',
    'Title': 'opportunity_title',
    'pass_first_step': 'application_pass_first_step',
    'Step_Category': 'application_step_category',
    'Applicant_Job_Titles': 'application_job_titles',
    'Applicant_Job_Responsibilities': 'application_job_responsibilities',
    'Applicant_Education': 'application_education',
    'Applicant_Reported_Skills': 'application_reported_skills',
}
applications = applications.rename(columns=column_renames)

# Keep the distinct opportunity descriptions (duplicates removed) for reference.
original_descriptions = applications.opportunity_description.unique()

In [17]:
# Per the original note, the selected applicant columns hold lists of strings.
# Identifier/label columns are never concatenated, and the two columns this
# cell creates are excluded as well so that re-running the cell does not fold
# `application_concat` into itself.
cols_not_to_concat = ['application_id', 'application_pass_first_step', 'application_step_category']
derived_application_cols = ['application_concat', 'application_full_tokenized']
application_cols = [
    col for col in applications.columns
    if col.startswith('application_')
    and col not in cols_not_to_concat
    and col not in derived_application_cols
]
opportunity_cols = [
    col for col in applications.columns
    if col.startswith('opportunity_') and col != 'opportunity_id'
]

# Join the string form of every applicant text column into one field,
# separated by '--'.
applications['application_concat'] = applications[application_cols].astype(str).agg('--'.join, axis=1)

# Tokenized version of the concatenated applicant text.
applications['application_full_tokenized'] = applications['application_concat'].apply(nltk.word_tokenize)

In [18]:
# Track the two derived columns alongside the original applicant columns.
# The membership check keeps the list free of duplicates when this cell is
# executed more than once.
for derived_col in ("application_concat", "application_full_tokenized"):
    if derived_col not in application_cols:
        application_cols.append(derived_col)

In [19]:
# Convert every object-dtype column to pandas' dedicated "string" dtype;
# columns of any other dtype are left as they are.
object_to_string = {
    name: "string"
    for name, dtype in applications.dtypes.items()
    if dtype == object
}
applications = applications.astype(object_to_string)

In [20]:
# Summarize dtypes and non-null counts to confirm the string conversion above.
applications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22053 entries, 66159 to 88211
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   opportunity_id                    22053 non-null  string
 1   application_id                    22053 non-null  string
 2   opportunity_brief_description     22053 non-null  string
 3   opportunity_description           22053 non-null  string
 4   opportunity_title                 22053 non-null  string
 5   application_pass_first_step       22053 non-null  bool  
 6   application_step_category         22053 non-null  int64 
 7   application_job_titles            22053 non-null  string
 8   application_job_responsibilities  22053 non-null  string
 9   application_education             22053 non-null  string
 10  application_reported_skills       22053 non-null  string
 11  application_concat                22053 non-null  string
 12  application_fu

## Preprocessing the data

In [21]:
# Display the current column names (renamed columns plus the derived ones).
applications.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'application_concat', 'application_full_tokenized'],
      dtype='object')

In [22]:
# Free-text columns that are run through `preprocess_text`.
columns_to_preprocess = [
    # opportunity (job posting) text
    'opportunity_brief_description',
    'opportunity_description',
    'opportunity_title',
    # applicant text
    'application_job_titles',
    'application_job_responsibilities',
    # derived applicant text
    'application_concat',
    'application_full_tokenized',
]

In [23]:
# NLTK setup for the preprocessing step: imports, corpus downloads, and the
# shared tokenizer / stop-word set / lemmatizer used by `preprocess_text`.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Fetch the corpora needed for stop-word removal and lemmatization.
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Tokens are maximal runs of word characters; punctuation is discarded.
tokenizer = RegexpTokenizer(r'\b\w+\b')

# Compiled once at definition time instead of on every call. DOTALL lets '.'
# match newlines, so a tag whose attributes wrap onto another line is removed too.
_HTML_TAG_PATTERN = re.compile(r'<.*?>', re.DOTALL)


def strip_html_tags(text):
    """Return `text` with every `<...>` span removed.

    Args:
        text: The string to clean.

    Returns:
        The input with each shortest `<` ... `>` span deleted, including
        spans that cross line breaks. Text outside the tags is unchanged.
    """
    return _HTML_TAG_PATTERN.sub('', text)

# Boilerplate phrases and leftover fragments dropped from the text before
# tokenizing ('nbsp' / 'amp' are what remains of HTML entities).
_UNWANTED_STRINGS = ['sign on bonus', 'sign bonus',
                     'full time', 'part time',
                     'day shift', 'night shift', 'second shift',
                     'third shift', 'first shift', 'none', 'nbsp', 'amp']

# Built once rather than per call. The \b anchors restrict matches to whole
# words/phrases, so 'amp' is no longer cut out of 'example' or 'camp', nor
# 'none' out of 'nonexistent'.
_UNWANTED_PATTERN = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, _UNWANTED_STRINGS)) + r')\b'
)


def preprocess_text(text):
    """Normalize one free-text value for modelling.

    Steps, in order: strip HTML tags, lowercase, remove the unwanted
    phrases as whole words, tokenize, drop English stop words, lemmatize
    the remaining tokens, and join them with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    # Strip HTML tags and convert to lowercase
    text = strip_html_tags(text).lower()

    # Remove unwanted phrases (whole-word matches only)
    text = _UNWANTED_PATTERN.sub('', text)

    # Tokenize, discard stop words, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(text)
        if token not in stop_words
    ]

    # Join the tokens back into a single string
    return ' '.join(tokens)

# Clean each selected text column in turn, logging progress per column.
for text_column in columns_to_preprocess:
    print(f"working on {text_column}")
    cleaned_values = applications[text_column].apply(preprocess_text)
    applications[text_column] = cleaned_values

# Cast the whole frame to plain Python strings; this also turns the boolean
# and integer columns into their string form.
applications = applications.astype(str)

working on opportunity_brief_description


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danmarino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danmarino/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


working on opportunity_description
working on opportunity_title
working on application_job_titles
working on application_job_responsibilities
working on application_concat
working on application_full_tokenized


In [24]:
# Re-run the type check to see whether the data types have changed.
# A string cell is recorded as a single value: iterating over it would only
# visit its characters, which are always `str` and so prove nothing. Cells
# that are real containers are still inspected item by item, and scalar cells
# (for example None) are recorded directly instead of raising on iteration.
for col in application_cols:
    print("Column:", col)
    dtypes_of_items = set()  # Use a set to store unique data types
    for cell in applications[col]:
        if isinstance(cell, (str, bytes)) or not hasattr(cell, '__iter__'):
            dtypes_of_items.add(type(cell))
        else:
            dtypes_of_items.update(type(item) for item in cell)
    print("Unique data types:", dtypes_of_items)

Column: application_job_titles
Unique data types: {<class 'str'>}
Column: application_job_responsibilities
Unique data types: {<class 'str'>}
Column: application_education
Unique data types: {<class 'str'>}
Column: application_reported_skills
Unique data types: {<class 'str'>}
Column: application_concat
Unique data types: {<class 'str'>}
Column: application_full_tokenized
Unique data types: {<class 'str'>}


### Everything is a string, which is essential for compatibility with the model

In [25]:
# Show three random rows as a quick check of the preprocessed text.
# No random_state is passed, so the rows differ from run to run.
applications.sample(3)

Unnamed: 0,opportunity_id,application_id,opportunity_brief_description,opportunity_description,opportunity_title,application_pass_first_step,application_step_category,application_job_titles,application_job_responsibilities,application_education,application_reported_skills,application_concat,application_full_tokenized
82227,1C4XzVi21UWFqTKe+nZ7Sw==,Fx2VS8W+2U6hpAX/a8ja9Q==,15 00 per hour orkin purpose help protect worl...,15 00 per hour orkin purpose help protect worl...,service technician train,False,0,student loan advocate lead generator appointme...,assist student grievance regarding student loa...,Associates,Microsoft Office++Customer Service++Financial ...,student loan advocate lead generator appointme...,student loan advocate lead generator appointme...
85580,+0cRKl9xvkGTQq4D8IwnAQ==,b3CzgKctxECMigmjIvtW+A==,first year earnings opportunity 40 000 50 000 ...,first year earnings opportunity 40 000 50 000 ...,service technician train,True,0,production,make refrigerator,,,production make refrigerator,production make refrigerator
69959,CfU7r5cLfEua4pLMBVkS5Q==,dNp19WQwz0G3jYvHsusHNQ==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,service technician train,False,0,general worker,operated water extraction carpet cleaning truck,1 year,,general worker operated water extraction carpe...,general worker operated water extraction carpe...


### We want zero NoneType

In [26]:
# Write the preprocessed frame out as parquet.
applications.to_parquet(path='../../Data/split_4_preprocessed.parquet')