**This notebook is for testing methods to load the datasets that we will be using.**

In [None]:
import pandas as pd
import numpy as np
import random
import csv

#### Before running this notebook, create a shortcut to the shared "AI Resume Prescreener" folder in your personal Google Drive so that the dataset files can be accessed once the drive is mounted. First create a new folder in your personal drive named "ITAI 2277", then create a shortcut to the shared folder and place it inside that new folder. This ensures that the file paths written in the code are correct.

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# The dataset path used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Seed every random number generator used in this notebook so the sampled rows
# are reproducible across runs. The row sampling relies on NumPy's global
# generator (np.random.choice), so seeding the stdlib `random` module alone
# has no effect on it.
# Set SEED = None to draw a different sample on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
def load_random_subset(file_path, sample_size, chunksize=138, encoding='utf-8'):
    """Return a random sample of rows from a CSV file, reading it in chunks.

    The file is first scanned with the ``csv`` module to print the header and
    a preview of up to five rows, and to count the data rows. ``sample_size``
    distinct 1-based row numbers are then drawn with ``np.random.choice``
    (NumPy's global generator), and only those rows are kept while pandas
    streams the file ``chunksize`` rows at a time.

    Args:
        file_path: Path to the CSV file. The first row is treated as the header.
        sample_size: Number of rows to sample. If it exceeds the number of
            data rows it is reduced to that number (a warning is printed).
        chunksize: Number of rows pandas reads per chunk.
        encoding: Text encoding of the file. Bytes that cannot be decoded are
            replaced instead of raising, both in the scan and in the pandas read.

    Returns:
        A DataFrame with the sampled rows in file order. When there is nothing
        to sample (``sample_size`` of 0 or a header-only file) an empty
        DataFrame with the file's columns is returned.

    Raises:
        ValueError: If ``sample_size`` is negative or the file has no header row.
    """
    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    def _empty_frame():
        # Header-only read: yields a zero-row frame with the file's columns.
        return pd.read_csv(file_path, nrows=0, encoding=encoding, encoding_errors='replace')

    # Get total number of rows and preview content.
    # newline='' is what the csv module expects so that it can handle line
    # endings (including newlines embedded in quoted fields) itself.
    with open(file_path, 'r', encoding=encoding, errors='replace', newline='') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader, None)  # Read the header
        if header is None:
            raise ValueError(f"CSV file has no header row: {file_path}")
        print(f"CSV Header: {header}")

        # Preview first few rows (fewer if the file is short)
        print("Preview of first 5 rows:")
        for _, row in zip(range(5), csv_reader):
            print(row)

        # Count total rows
        f.seek(0)  # Move file pointer back to the start
        csv_reader = csv.reader(f)  # Create a new csv_reader
        next(csv_reader)  # Skip the header
        # csv.reader yields [] for blank lines, which pandas skips by default;
        # leave them out so the row numbers line up with what pandas reads.
        total_rows = sum(1 for row in csv_reader if row)

    print(f"Total rows in file: {total_rows}")

    if total_rows < sample_size:
        print(f"Warning: Sample size ({sample_size}) is larger than total rows ({total_rows}). Adjusting sample size.")
        sample_size = total_rows

    # Nothing to draw: avoid indexing into an empty selection below.
    if sample_size == 0:
        print("Final number of rows selected: 0")
        return _empty_frame()

    # Generate sorted, distinct, 1-based row numbers for selection
    random_indices = np.sort(
        np.random.choice(np.arange(1, total_rows + 1), size=sample_size, replace=False)
    )
    print(f"Randomly selected row indices: {random_indices}")
    last_wanted = random_indices[-1]

    # Number of data rows consumed so far
    current_index = 0
    selected_rows = []

    # Read the CSV in chunks; the context manager closes the file even when
    # the loop stops early.
    with pd.read_csv(file_path, chunksize=chunksize, encoding=encoding,
                     encoding_errors='replace', on_bad_lines='skip') as reader:
        for chunk in reader:
            chunk_end = current_index + len(chunk)
            print(f"Processing chunk from row {current_index + 1} to {chunk_end}")

            # Row numbers that fall inside this chunk, converted to 0-based
            # positions relative to the start of the chunk
            in_chunk = (random_indices > current_index) & (random_indices <= chunk_end)
            chunk_positions = random_indices[in_chunk] - current_index - 1

            # Add selected rows to our list
            selected_chunk = chunk.iloc[chunk_positions]
            selected_rows.append(selected_chunk)
            print(f"Selected {len(selected_chunk)} rows from this chunk")

            # Move our current_index
            current_index = chunk_end

            # Stop as soon as the last wanted row has been read
            if current_index >= last_wanted:
                break

    # Combine all selected rows into a single DataFrame
    result = pd.concat(selected_rows) if selected_rows else _empty_frame()
    if len(result) < sample_size:
        # The count above comes from the csv module; pandas may have read fewer
        # rows, e.g. because on_bad_lines='skip' dropped malformed lines.
        print(f"Warning: only {len(result)} of {sample_size} requested rows could be read by pandas.")
    print(f"Final number of rows selected: {len(result)}")
    return result

In [None]:
# Alternative configuration (disabled): use a dataset that has been uploaded to the
# session files, or a local file when running in a local environment.
# Not recommended in a hosted session because the file has to be uploaded again
# each time a new session is started.
# The assignments below are wrapped in a string literal so they are not executed;
# remove the triple quotes to use them.
'''
file_path = 'Resume(1snehaan).csv'
sample_size = 20
'''

In [None]:
# Sampling configuration: how many rows to draw, and where the dataset lives
# on the mounted drive.
sample_size = 20
file_path = "/content/drive/MyDrive/ITAI 2277/AI Resume Prescreener/Resumes1.csv"

In [None]:
# Draw the sample and show its first rows. Any failure is reported as a short
# message instead of a traceback.
try:
    sampled_data = load_random_subset(
        file_path,
        sample_size,
        encoding='utf-8',
    )
    print("First 5 Sample Selections:")
    print(sampled_data.head())
except Exception as err:
    print(f"An error occurred: {err}")

CSV Header: ['ID', 'Resume_str', 'Resume_html', 'Category']
Preview of first 5 rows:
['16852973', "         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex  