In [4]:
# Step 1: Data Collection and Preprocessing

# Required Libraries
import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Function to read PDF files and convert to text
def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Reading is best-effort: if the file cannot be opened or parsed, the
    error is printed and whatever text was extracted before the failure
    is returned (an empty string if nothing was extracted).

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        The text of all successfully extracted pages, joined with newlines.
    """
    page_texts = []
    try:
        with open(file_path, 'rb') as file:
            reader = PdfReader(file)
            for page in reader.pages:
                # A page that yields no text is treated as empty instead of
                # raising and discarding the remaining pages.
                page_texts.append(page.extract_text() or '')
    except Exception as e:
        # Deliberately broad: a malformed PDF should be reported and skipped,
        # not abort the processing of the whole corpus.
        print(f"Error reading {file_path}: {e}")
    # The newline separator keeps the last word of one page from fusing with
    # the first word of the next page.
    return '\n'.join(page_texts)

# Function to clean and preprocess text data
def clean_text(text):
    """Normalize raw resume text to lowercase alphanumeric words.

    The text is lowercased, every run of whitespace (including newlines and
    tabs) becomes a single space, and every character other than ``a-z``,
    ``0-9`` and space is removed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    text = text.lower()
    # Turn all whitespace into plain spaces first so that the character
    # filter below does not delete newlines/tabs and glue words together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z0-9 ]', '', text)  # Remove special characters
    # Removing a standalone symbol ("a - b") leaves two adjacent spaces, so
    # collapse runs of spaces again after filtering.
    text = re.sub(r' +', ' ', text)
    return text.strip()

# Function to recursively find all files with a given extension in a directory
def find_all_files(directory, extension):
    """Recursively collect the files under a directory with a given extension.

    Matching is case-insensitive on both the file name and the extension.

    Args:
        directory: Root directory to walk.
        extension: File name suffix to match, e.g. ``'.pdf'``.

    Returns:
        A sorted list of paths (each joined onto ``directory``) of the
        matching files. The list is empty when nothing matches.
    """
    suffix = extension.lower()
    files = []
    for root, _dirs, files_list in os.walk(directory):
        files.extend(
            os.path.join(root, name)
            for name in files_list
            if name.lower().endswith(suffix)
        )
    # os.walk yields entries in arbitrary, filesystem-dependent order; sorting
    # makes the result (and any seeded shuffle/split built on it) reproducible.
    files.sort()
    return files

# Root directory containing folders with job categories and resumes
root_dir = 'data'

# List to hold the preprocessed data (one dict per resume)
data = []

# Find all PDF and text files in the root directory and its subdirectories
pdf_files = find_all_files(root_dir, '.pdf')
text_files = find_all_files(root_dir, '.txt')

# Process PDF resumes
for file_path in tqdm(pdf_files, desc="Processing PDF resumes"):
    file_name = os.path.basename(file_path)
    # The name of the folder that directly contains the file is used as the label
    job_category = os.path.basename(os.path.dirname(file_path))
    text = read_pdf(file_path)
    cleaned_text = clean_text(text)
    data.append({'file_name': file_name, 'text': cleaned_text, 'format': 'pdf', 'job_category': job_category})

# Process text resumes
for file_path in tqdm(text_files, desc="Processing text resumes"):
    file_name = os.path.basename(file_path)
    job_category = os.path.basename(os.path.dirname(file_path))
    try:
        # errors='replace' keeps a single non-UTF-8 file from aborting the run;
        # the replacement characters are stripped later by clean_text.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            text = file.read()
    except OSError as e:
        # Same best-effort policy as read_pdf: report the failure and keep an
        # empty-text row for the file.
        print(f"Error reading {file_path}: {e}")
        text = ''
    cleaned_text = clean_text(text)
    data.append({'file_name': file_name, 'text': cleaned_text, 'format': 'text', 'job_category': job_category})

# Converting list to DataFrame
df = pd.DataFrame(data)

# Save the preprocessed data to a CSV file
df.to_csv('preprocessed_resumes.csv', index=False)

# Split the dataset into training, validation, and test sets:
# 70% train, then the remaining 30% is halved into 15% validation / 15% test
train, temp = train_test_split(df, test_size=0.3, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

# Save the split datasets
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)
test.to_csv('test.csv', index=False)

print("Data collection and preprocessing complete.")


Processing PDF resumes: 100%|██████████████████████████████████████████████████████| 2484/2484 [07:43<00:00,  5.36it/s]
Processing text resumes: 0it [00:00, ?it/s]


Data collection and preprocessing complete.
