# Cleaning the text across the applications

In [1]:
import pandas as pd
import numpy as np


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
from tqdm import tqdm
from bs4 import BeautifulSoup

# Suppress DeprecationWarning messages whose originating module name matches 'ipykernel';
# all other warnings are still shown.
warnings.filterwarnings('ignore', category=DeprecationWarning, module='ipykernel')

In [2]:
# Load the input split. The path is relative, so it resolves against the
# notebook's current working directory.
applications = pd.read_parquet('../../Data/split_4_cleaned.parquet')

In [3]:
# Show the column labels exactly as they come out of the parquet file
applications.columns

Index(['OpportunityId', 'ApplicationId', 'ExternalBriefDescription',
       'ExternalDescription', 'Title', 'pass_first_step', 'Step_Category',
       'Applicant_Job_Titles', 'Applicant_Job_Responsibilities',
       'Applicant_Education', 'Applicant_Reported_Skills'],
      dtype='object')

In [4]:
# Rename columns by name instead of assigning a positional list to `.columns`:
# a positional assignment silently attaches the wrong label to a column if the
# parquet's column order ever changes. Labels that are absent (for example when
# this cell is re-run after the rename) are simply left as they are.
column_renames = {
    'OpportunityId': 'opportunity_id',
    'ApplicationId': 'application_id',
    'ExternalBriefDescription': 'opportunity_brief_description',
    'ExternalDescription': 'opportunity_description',
    'Title': 'opportunity_title',
    'pass_first_step': 'application_pass_first_step',
    'Step_Category': 'application_step_category',
    'Applicant_Job_Titles': 'application_job_titles',
    'Applicant_Job_Responsibilities': 'application_job_responsibilities',
    'Applicant_Education': 'application_education',
    'Applicant_Reported_Skills': 'application_reported_skills',
}
applications = applications.rename(columns=column_renames)
# Distinct opportunity descriptions, kept in a separate array
# (no rows are removed from `applications` itself)
original_descriptions = applications.opportunity_description.unique()

In [5]:
# Column groups for each side of the match, leaving out the identifier columns.
# NOTE(review): application_cols also picks up application_pass_first_step and
# application_step_category, so their values are written into application_concat.
# If either of them is a prediction target, this leaks it into the text feature — confirm.
application_cols = [col for col in applications.columns if 'application_' in col and col != 'application_id']
opportunity_cols = [col for col in applications.columns if 'opportunity_' in col and col != 'opportunity_id']

# One '--'-separated string per row, built from every application column
applications['application_concat'] = applications[application_cols].astype(str).agg('--'.join,axis=1)

# nltk.word_tokenize relies on the Punkt tokenizer models, and the only nltk.download
# calls in this notebook fetch 'stopwords' and 'wordnet'. Fetch Punkt here so the cell
# also runs on a fresh environment. Newer NLTK releases look up 'punkt_tab', older
# ones 'punkt', so request both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

applications['application_full_tokenized'] = applications['application_concat'].apply(nltk.word_tokenize)

In [6]:
# Add the derived columns to the application column list. The membership test
# keeps a re-run of this cell from appending the same name a second time.
for derived_col in ("application_concat", "application_full_tokenized"):
    if derived_col not in application_cols:
        application_cols.append(derived_col)

In [7]:
# Cast every object-dtype column to pandas' "string" dtype; other dtypes are left alone
for name, dtype in applications.dtypes.items():
    if dtype == object:
        applications[name] = applications[name].astype("string")

In [8]:
# Print dtypes and non-null counts to confirm the object -> string conversion
applications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22053 entries, 66159 to 88211
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   opportunity_id                    22053 non-null  string
 1   application_id                    22053 non-null  string
 2   opportunity_brief_description     22053 non-null  string
 3   opportunity_description           22053 non-null  string
 4   opportunity_title                 22053 non-null  string
 5   application_pass_first_step       22053 non-null  bool  
 6   application_step_category         22053 non-null  int64 
 7   application_job_titles            22053 non-null  string
 8   application_job_responsibilities  22053 non-null  string
 9   application_education             22053 non-null  string
 10  application_reported_skills       22053 non-null  string
 11  application_concat                22053 non-null  string
 12  application_fu

In [9]:
# # Now let's replace None values in the specified columns
# for col in ["applicant_background", "applicant_major"]:
#     applications[col] = applications[col].apply(lambda cell: ['Nothing Here' if item is None else item for item in cell])
#     applications[col] = applications[col].apply(lambda cell: [item for i, item in enumerate(cell) if i == 0 or item != 'Nothing Here'])
# applications

## Preprocessing the data

In [10]:
# List the columns available at this point, including the derived
# application_concat and application_full_tokenized columns
applications.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'application_concat', 'application_full_tokenized'],
      dtype='object')

In [11]:
# Columns that are run through preprocess_text
columns_to_preprocess = [
    'opportunity_brief_description',
    'opportunity_description',
    'opportunity_title',
    'application_job_titles',
    'application_job_responsibilities',
    'application_concat',
    'application_full_tokenized',
]

In [12]:
import nltk
# Fetch the corpora read by stopwords.words() and WordNetLemmatizer below
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): re, nltk, stopwords and WordNetLemmatizer are already imported in the
# first cell; only RegexpTokenizer is new here.
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Module-level helpers used by preprocess_text.
# The tokenizer yields runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\b\w+\b')
# English stopwords held in a set for membership tests
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Compiled once instead of on every call. DOTALL lets a tag whose attributes
# wrap onto another line still match.
_HTML_TAG_PATTERN = re.compile(r'<.*?>', re.DOTALL)

def strip_html_tags(text):
    """Replace every ``<...>`` span in ``text`` with a single space.

    A space (rather than the empty string) is substituted so that words
    separated only by markup, e.g. ``</p><p>``, are not glued together.

    Parameters
    ----------
    text : str
        Text that may contain HTML tags.

    Returns
    -------
    str
        ``text`` with each tag replaced by one space. Text without tags is
        returned unchanged.
    """
    return _HTML_TAG_PATTERN.sub(' ', text)

# Boilerplate phrases and HTML-entity residue to drop from the text
_UNWANTED_STRINGS = ['sign on bonus', 'sign bonus',
                     'full time', 'ft', 'part time', 'pt',
                     'day shift', 'night shift', 'second shift',
                     'third shift', 'first shift', 'none', 'nbsp', 'amp']
# The \b anchors restrict matches to whole words/phrases. Without them the short
# entries are cut out of longer words ('software' -> 'soware', 'example' -> 'exle').
# Compiled once rather than on every call.
_UNWANTED_PATTERN = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, _UNWANTED_STRINGS)) + r')\b'
)

def preprocess_text(text):
    """Normalise one text value into a space-joined string of lemmatized tokens.

    Steps, in order: strip HTML tags, lowercase, remove the unwanted phrases
    as whole words, tokenize with the module-level ``tokenizer``, drop tokens
    found in ``stop_words``, and lemmatize the rest with ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw text of a single cell.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces (empty string
        when no token survives).
    """
    # Strip HTML tags and convert to lowercase
    text = strip_html_tags(text).lower()

    # Substitute a space, not '', so the words on either side of a removed
    # phrase stay separate tokens
    text = _UNWANTED_PATTERN.sub(' ', text)

    # Tokenize, drop stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text) if token not in stop_words]

    # Join the tokens back into a single string
    return ' '.join(tokens)

# Clean each selected column in turn, printing a progress line before it starts
for column in columns_to_preprocess:
    print(f"working on {column}")
    as_string = applications[column].astype('string')
    applications[column] = as_string.apply(preprocess_text)

# Finally cast the whole frame, columns that were not preprocessed included, to str
applications = applications.astype(str)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danmarino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danmarino/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


working on opportunity_brief_description
working on opportunity_description
working on opportunity_title
working on application_job_titles
working on application_job_responsibilities
working on application_concat
working on application_full_tokenized


In [13]:
# Check which Python types occur in each application column.
# The frame was cast with astype(str), so every cell is one string. Inspect the
# type of the cell itself: iterating over a cell only walks its characters and
# could never report anything other than str.
for col in application_cols:
    print("Column:", col)
    dtypes_of_items = set(applications[col].map(type))  # unique cell types in this column
    print("Unique data types:", dtypes_of_items)

Column: application_pass_first_step
Unique data types: {<class 'str'>}
Column: application_step_category
Unique data types: {<class 'str'>}
Column: application_job_titles
Unique data types: {<class 'str'>}
Column: application_job_responsibilities
Unique data types: {<class 'str'>}
Column: application_education
Unique data types: {<class 'str'>}
Column: application_reported_skills
Unique data types: {<class 'str'>}
Column: application_concat
Unique data types: {<class 'str'>}
Column: application_full_tokenized
Unique data types: {<class 'str'>}


### Everything is a string, which is essential for compatibility with the model

In [14]:
# Spot-check three random rows. No random_state is passed, so the rows shown
# differ from run to run.
applications.sample(3)

Unnamed: 0,opportunity_id,application_id,opportunity_brief_description,opportunity_description,opportunity_title,application_pass_first_step,application_step_category,application_job_titles,application_job_responsibilities,application_education,application_reported_skills,application_concat,application_full_tokenized
76141,DZ+lnarRi0i8A8bNuWVXoQ==,sEAd1SqMPU+gPLu4OloYag==,15 00 per hour orkin purpose help protect worl...,15 00 per hour orkin purpose help protect worl...,service technician train,True,0,charger fork driver picker laborer machine ope...,li load charging chamber use scrap pusher molt...,Some college,Fanuc++Vacuum++Box truck++Fork lift++Pallet ja...,true 0 charger fork driver picker laborer mach...,true 0 charger fork driver picker laborer mach...
80495,32SypfnHHku3j/lEyNszSg==,K2AwCoivvUumXnyQBEAK2w==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control route service specialist,True,0,delivery driver,worked company contracted amazon deliver packa...,High School,Customer Service,true 0 delivery driver worked company contract...,true 0 delivery driver worked company contract...
79446,A6TNgNp5OkOaKht4leFEJA==,YbYao5x7j0ix/XgaSL8kgQ==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,service technician train,False,0,delivery specialist administrative photographe...,sale license administrative duty using salesfo...,Diploma,Customer service++Receptionist++Retail sales++...,false 0 delivery specialist administrative pho...,false 0 delivery specialist administrative pho...


### We want zero NoneType

In [16]:
# Write the preprocessed frame (every column cast to str) next to the input split.
# The path is relative to the notebook's working directory.
applications.to_parquet('../../Data/split_4_preprocessed.parquet')