**This notebook is for testing data cleaning methods. It starts by loading a dataset.**

Try first with small subsets of the datasets to find a working method of cleaning the full datasets. The function below already selects a subset.

In [None]:
import pandas as pd
import numpy as np
import random
import csv
import nltk
import spacy
import en_core_web_sm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Check library versions: print the installed version of each library the
# notebook relies on, so the outputs below can be tied to a known environment.
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"nltk version: {nltk.__version__}")
print(f"spacy version: {spacy.__version__}")
print(f"en_core_web_sm version: {en_core_web_sm.__version__}")

pandas version: 2.2.2
numpy version: 1.26.4
nltk version: 3.9.1
spacy version: 3.7.5
en_core_web_sm version: 3.7.1


In [None]:
def load_random_subset(file_path, sample_size, chunksize=138, encoding='utf-8', random_state=None):
    """
    Load a random sample of rows from a CSV file without holding the whole file in memory.

    The file is scanned once with the ``csv`` module to print the header and a
    short preview and to count the data rows. It is then read a second time
    with pandas, chunk by chunk, keeping only the randomly chosen rows.

    Args:
        file_path (str): Path to the CSV file.
        sample_size (int): Number of rows to sample. Reduced (with a warning)
            to the number of rows in the file when it is larger.
        chunksize (int): Number of rows pandas reads per chunk.
        encoding (str): Text encoding of the file. Undecodable bytes are
            replaced instead of raising, in both passes over the file.
        random_state (int, optional): Seed for a reproducible sample. When
            None, NumPy's global random state is used.

    Returns:
        pd.DataFrame: The sampled rows in file order, carrying the row index
        pandas assigned while reading. Empty (header columns only) when there
        is nothing to sample.

    Raises:
        ValueError: If sample_size is negative.
    """
    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    # Get total number of rows and preview content
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)  # Read the header
        print(f"CSV Header: {header}")

        # Preview first few rows
        print("Preview of first 5 rows:")
        for _ in range(5):
            try:
                row = next(csv_reader)
                print(row)
            except StopIteration:
                break

        # Count total rows
        f.seek(0)  # Move file pointer back to the start
        csv_reader = csv.reader(f)  # Create a new csv_reader
        next(csv_reader)  # Skip the header
        # Blank lines come back from csv.reader as empty lists but are skipped
        # by pandas, so they must not be counted as data rows.
        total_rows = sum(1 for row in csv_reader if row)

    print(f"Total rows in file: {total_rows}")

    if total_rows < sample_size:
        print(f"Warning: Sample size ({sample_size}) is larger than total rows ({total_rows}). Adjusting sample size.")
        sample_size = total_rows

    # Nothing to sample: return an empty frame that still has the CSV's columns
    if sample_size == 0:
        print("Final number of rows selected: 0")
        return pd.read_csv(file_path, nrows=0, encoding=encoding, encoding_errors='replace')

    # Generate random 1-based row indices for selection
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    random_indices = np.sort(rng.choice(np.arange(1, total_rows + 1), size=sample_size, replace=False))
    print(f"Randomly selected row indices: {random_indices}")

    # Initialize variables
    current_index = 0  # number of data rows consumed by the chunks read so far
    selected_rows = []

    # Read the CSV in chunks; the context manager closes the file even when
    # the loop stops before the end of the file.
    with pd.read_csv(file_path, chunksize=chunksize, encoding=encoding,
                     encoding_errors='replace', on_bad_lines='skip') as chunk_reader:
        for chunk in chunk_reader:
            print(f"Processing chunk from row {current_index + 1} to {current_index + len(chunk)}")

            # Find which rows from this chunk we want, as 0-based positions inside the chunk
            in_chunk = (random_indices > current_index) & (random_indices <= current_index + len(chunk))
            chunk_indices = random_indices[in_chunk] - current_index - 1

            # Add selected rows to our list
            selected_chunk = chunk.iloc[chunk_indices]
            selected_rows.append(selected_chunk)
            print(f"Selected {len(selected_chunk)} rows from this chunk")

            # Move our current_index
            current_index += len(chunk)

            # Stop as soon as the chunk holding the last wanted row has been read
            if current_index >= random_indices[-1]:
                break

    # Combine all selected rows into a single DataFrame
    result = pd.concat(selected_rows)
    if len(result) < sample_size:
        # pandas drops malformed lines (on_bad_lines='skip') that the row count above still included
        print(f"Warning: only {len(result)} of {sample_size} requested rows could be read.")
    print(f"Final number of rows selected: {len(result)}")
    return result

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the resume CSV (under the Google Drive mount point) and the
# number of rows to draw from it; both are passed to load_random_subset.
file_path = '/content/drive/MyDrive/ITAI 2277/AI Resume Prescreener/Resumes1.csv'
sample_size = 20

In [None]:
# Draw the random sample. load_random_subset also prints the CSV header, a
# short preview, the total row count and per-chunk progress as it runs.
sampled_data = load_random_subset(file_path, sample_size, encoding='utf-8')

CSV Header: ['ID', 'Resume_str', 'Resume_html', 'Category']
Preview of first 5 rows:
['16852973', "         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex  

In [None]:
print("First 5 Sample Selections:")
sampled_data.head()

First 5 Sample Selections:


Unnamed: 0,ID,Resume_str,Resume_html,Category
23,12786012,HR COORDINATOR Summary Cert...,"<div class=""fontsize fontface vmargins hmargin...",HR
47,26202430,HR CONSULTANT Summary Sub...,"<div class=""fontsize fontface vmargins hmargin...",HR
150,23951429,GRAPHIC DESIGNER Summary Ve...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER
183,19195747,KICHEN/BATH DESIGNER & SALES Pr...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER
195,26503829,SENIOR TECHNICAL DESIGNER Summa...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER


In [None]:
# Loading a small english language model to be used by SpaCy
nlp = en_core_web_sm.load()

In [None]:
# Initializing NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

Start trying cleaning methods below here. The sampled subset of the data is available in the variable `sampled_data`.

In [None]:
# Count missing values in each column of the sample
sampled_data.isna().sum()

Unnamed: 0,0
ID,0
Resume_str,0
Resume_html,0
Category,0


In [None]:
# Count fully duplicated rows (every repeat after the first occurrence is flagged).
# Summing the boolean mask gives the count; summing the duplicated rows themselves
# would add up / concatenate their contents instead.
sampled_data.duplicated().sum()

Unnamed: 0,0
ID,0
Resume_str,0
Resume_html,0
Category,0


In [None]:
def preprocess_text_spacy(text, nlp):
    """
    Preprocess text using SpaCy

    The text is lowercased, tokenized and lemmatized; stop words and tokens
    that are not purely alphabetic (numbers, punctuation, whitespace) are dropped.

    Args:
        text (str): Input text to preprocess. Missing values (None, NaN,
            pd.NA) produce an empty string; any other non-string value is
            converted with str() first.
        nlp: Loaded SpaCy model

    Returns:
        str: Preprocessed text (kept lemmas joined by single spaces)
    """
    if not isinstance(text, str):
        # A missing cell in a DataFrame column arrives as None/NaN, which has no .lower()
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    # Process the text with SpaCy
    doc = nlp(text.lower())

    # Tokenization and lemmatization
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Join tokens back into a string
    return ' '.join(tokens)

In [None]:
def preprocess_resume_data(df, text_column='Resume_str', nlp=None):
    """
    Preprocess resume text data

    Args:
        df (pd.DataFrame): Input DataFrame containing resume data
        text_column (str): Name of column containing text to process
        nlp: Optional already-loaded SpaCy model. When None, 'en_core_web_sm'
            is loaded here; pass an existing model to avoid reloading it on
            every call.

    Returns:
        pd.DataFrame: Copy of the input with an additional 'processed_text'
        column; the input DataFrame is left unmodified

    Raises:
        KeyError: If text_column is not a column of df
    """
    # Fail before the (slow) model load if the column is missing
    if text_column not in df.columns:
        raise KeyError(f"Column {text_column!r} not found in DataFrame")

    # Load SpaCy model only when the caller did not supply one
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Create a copy of the DataFrame so the caller's data is untouched
    processed_df = df.copy()

    # Add new column with processed text
    processed_df['processed_text'] = processed_df[text_column].apply(
        lambda x: preprocess_text_spacy(x, nlp)
    )

    return processed_df

In [None]:
# Run the SpaCy preprocessing on the sample; the result is a copy of
# sampled_data with an added 'processed_text' column.
processed_data = preprocess_resume_data(sampled_data)

In [None]:
print(processed_data.head())

           ID                                         Resume_str  \
23   12786012           HR COORDINATOR       Summary     Cert...   
47   26202430           HR CONSULTANT       Summary       Sub...   
150  23951429           GRAPHIC DESIGNER       Summary     Ve...   
183  19195747           KICHEN/BATH DESIGNER & SALES       Pr...   
195  26503829           SENIOR TECHNICAL DESIGNER       Summa...   

                                           Resume_html  Category  \
23   <div class="fontsize fontface vmargins hmargin...        HR   
47   <div class="fontsize fontface vmargins hmargin...        HR   
150  <div class="fontsize fontface vmargins hmargin...  DESIGNER   
183  <div class="fontsize fontface vmargins hmargin...  DESIGNER   
195  <div class="fontsize fontface vmargins hmargin...  DESIGNER   

                                        processed_text  
23   hr coordinator summary certify human resource ...  
47   hr consultant summary subject matter expert hr...  
150  graphi

In [None]:
processed_data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,processed_text
23,12786012,HR COORDINATOR Summary Cert...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr coordinator summary certify human resource ...
47,26202430,HR CONSULTANT Summary Sub...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr consultant summary subject matter expert hr...
150,23951429,GRAPHIC DESIGNER Summary Ve...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER,graphic designer summary versatile professiona...
183,19195747,KICHEN/BATH DESIGNER & SALES Pr...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER,kichen bath designer sale professional summary...
195,26503829,SENIOR TECHNICAL DESIGNER Summa...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER,senior technical designer summary product deve...


In [None]:
# Look at the raw (unprocessed) resume text of the first sampled rows
sampled_data['Resume_str'].head()

Unnamed: 0,Resume_str
23,HR COORDINATOR Summary Cert...
47,HR CONSULTANT Summary Sub...
150,GRAPHIC DESIGNER Summary Ve...
183,KICHEN/BATH DESIGNER & SALES Pr...
195,SENIOR TECHNICAL DESIGNER Summa...


In [None]:
# Set pandas display options to show more text.
# These are global settings: they stay in effect for every cell run after this one.
pd.set_option('display.max_colwidth', None)  # Remove column width restriction
pd.set_option('display.max_rows', None)      # Show all rows
pd.set_option('display.max_columns', None)   # Show all columns

In [None]:
# Display comparison of original and processed text for first few resumes
N_EXAMPLES = 3        # Change number to show more/fewer examples
PREVIEW_CHARS = 1000  # Increase number to show more text

# Cap at the number of available rows so a small sample cannot index past the end
for idx in range(min(N_EXAMPLES, len(processed_data))):
    original_text = processed_data['Resume_str'].iloc[idx]
    cleaned_text = processed_data['processed_text'].iloc[idx]

    print(f"\nResume #{idx + 1}")
    print("-" * 80)
    print("\nORIGINAL TEXT:")
    print(original_text[:PREVIEW_CHARS])
    print("\nPROCESSED TEXT:")
    print(cleaned_text[:PREVIEW_CHARS])
    print("\n" + "=" * 80)


Resume #1
--------------------------------------------------------------------------------

ORIGINAL TEXT:
         HR COORDINATOR       Summary     Certified Human Resources Professional with extensive employee relations experience in a full range of functions, as well as success in simultaneously managing multiple projects.        Highlights          Employee relations  Compensation administration  Personnel records maintenance  New hire orientation  Hiring and retention  Training and development  Compensation/payroll  Staffing and recruiting professional  Off-boarding  Interviewing expertise  Performance management strategies      Benefits administrator  Employment law knowledge  HRIS applications proficient  Employee handbook development  New employee orientations  Human resources audits  Maintains confidentiality  HR policies and procedures expertise  Human resources management  Excellent interpersonal and coaching skills  Certified Professional Human Resource Management         