
# AI Assisted Recruitment September Tasks for Resume data

This notebook focuses only on your provided Resume dataset and mirrors common lab steps for data understanding and preparation.

**Outputs created**
- Resume_clean.csv
- Resume_train.csv
- Resume_test.csv
- Resume_sample.csv
- label_map.json (only when a `Category` column exists)
- eda_resume_length.png
- eda_category_distribution.png
- resume_cleaning_report.json



## Setup

Pick one method to bring the CSV into Colab and set the paths.


In [None]:
# manual csv upload
#from google.colab import files
#uploaded = files.upload()
#INPUT_CSV = "Resume.csv"

In [None]:
# Defaults for Colab runtime
# INPUT_CSV: path of the resume CSV; Part A checks that it exists and loads it.
INPUT_CSV = "/content/Resume.csv"
# OUT_DIR: directory that Parts C and D write the plots, CSVs and JSON files to.
OUT_DIR = "/content"

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline



## Imports


In [None]:

import os
import re
import json
from typing import List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the dataset from the path configured in the setup cell, so this quick
# preview reads the same file that Part A validates and loads.
df = pd.read_csv(INPUT_CSV, encoding="utf-8")

# Preview first rows
print("Preview of dataset:")
display(df.head())

# Shape of the dataset
print("\nShape of dataset:", df.shape)

# Column info
print("\nColumn information:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Check for missing values
print("\nMissing values per column:")
print(df.isna().sum())

# Look at duplicate rows
print("\nNumber of duplicate rows:", df.duplicated().sum())

# Quick look at category distribution if present
if "Category" in df.columns:
    print("\nCategory distribution:")
    print(df["Category"].value_counts())

Preview of dataset:


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR



Shape of dataset: (2484, 4)

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB
None

Missing values per column:
ID             0
Resume_str     0
Resume_html    0
Category       0
dtype: int64

Number of duplicate rows: 0

Category distribution:
Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
ADVOCATE                  118
CHEF                      118
ENGINEERING               118
ACCOUNTANT                118
FINANCE                   118
FITNESS                   117
AVIATION                  117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT 


## Helper functions


In [None]:

# Lower-case words that normalize_tokens() is given as its stop-word set.
# Kept as a plain `set` of strings; the grouping below is for readability only.
BASIC_STOPWORDS = set(
    (
        # articles, conjunctions and prepositions
        "a an the and or but so if as than then "
        "for with from in on at by to of into over about within per via "
        "through across under up down out off due "
        # pronouns and determiners
        "that this it its we our you your they their he she him her them "
        "i me my mine us "
        # auxiliary and modal verbs
        "are was were been being will would shall should can could is am be "
        "may must "
        # generic filler words
        "not no yes using use used etc such include including new make made "
        "also strong "
        # resume boilerplate vocabulary
        "skills skill responsible responsibility responsibilities experience "
        "experienced years year "
        "work working company companies project projects objective summary "
        "professional proficient "
        "knowledge excellent good great highly ability abilities team teams "
        "member members detail "
        "details detailed analysis analyze analyst manager management lead led "
        "leadership support "
        "supported supporting develop developed development design designed "
        "implement implemented "
        "implementation maintain maintained maintenance deliver delivered "
        "delivery result results"
    ).split()
)

def regex_tokenize(text: str) -> List[str]:
    """Return the maximal runs of ASCII letters/digits found in ``text``.

    Every other character acts as a separator, so the result never contains
    empty strings. Case is left untouched.
    """
    return re.findall(r"[A-Za-z0-9]+", text)

def clean_html(raw_html: str) -> str:
    """Replace every ``<...>`` tag in ``raw_html`` with a single space.

    Missing values (anything ``pd.isna`` reports as missing) yield ``""``.
    Text between tags is returned unchanged; whitespace is not collapsed.
    """
    tag_pattern = r"<[^>]+>"
    return "" if pd.isna(raw_html) else re.sub(tag_pattern, " ", raw_html)

def basic_text_clean(text: str) -> str:
    """Normalize raw resume text into lower-case, single-spaced ASCII.

    Steps, in order:
      1. non-breaking spaces become regular spaces,
      2. URLs (``http...`` / ``www....``) and e-mail addresses are removed,
      3. non-ASCII characters are dropped,
      4. every character other than letters, digits, whitespace and ``. , ! ? -``
         is replaced by a space,
      5. the text is lower-cased and runs of whitespace collapse to one space.

    Parameters
    ----------
    text : str
        Raw text. Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    str
        The cleaned text, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # The patterns below are raw strings, so a single backslash is the regex
    # escape. The earlier doubled form (r"\\s+") matched a literal backslash
    # followed by "s", so URLs/e-mails were kept and whitespace never collapsed.
    text = text.replace("\xa0", " ")  # the actual NBSP character, not the 4-char text "\xa0"
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", " ", text)
    text = text.encode("ascii", "ignore").decode()
    text = re.sub(r"[^A-Za-z0-9\s\.\,\!\?\-]", " ", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

def normalize_tokens(text: str, stop_words: set) -> List[str]:
    """Tokenize ``text`` and drop stop words and very short tokens.

    Tokens come from ``regex_tokenize``. A token is kept only when it is longer
    than two characters and is not in ``stop_words``. The comparison is exact
    (case-sensitive); input order is preserved.
    """
    kept: List[str] = []
    for token in regex_tokenize(text):
        if len(token) <= 2 or token in stop_words:
            continue
        kept.append(token)
    return kept

def eda_plots(df: pd.DataFrame, out_dir: str, text_col: str, label_col: str = None) -> None:
    """Write the EDA figures for ``df`` into ``out_dir`` (created if needed).

    Always saves ``eda_resume_length.png``: a 40-bin histogram of the number of
    whitespace-separated words per row of ``text_col`` (missing text counts as
    empty). When ``label_col`` is given and is a column of ``df``, also saves
    ``eda_category_distribution.png``: a horizontal bar chart of label counts,
    sorted ascending. Each figure is closed after it is saved.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Figure 1: distribution of per-row word counts.
    word_counts = df[text_col].fillna("").str.split().map(len)
    length_fig, length_ax = plt.subplots()
    word_counts.hist(bins=40, ax=length_ax)
    length_ax.set_title("Resume token length distribution")
    length_ax.set_xlabel("tokens")
    length_ax.set_ylabel("count")
    length_fig.tight_layout()
    length_fig.savefig(os.path.join(out_dir, "eda_resume_length.png"))
    plt.close(length_fig)

    # Figure 2 is optional: skip it when there is no usable label column.
    if not label_col or label_col not in df.columns:
        return

    label_counts = df[label_col].value_counts().sort_values(ascending=True)
    label_fig, label_ax = plt.subplots()
    label_counts.plot(kind="barh", ax=label_ax)
    label_ax.set_title("Category distribution")
    label_ax.set_xlabel("count")
    label_ax.set_ylabel("category")
    label_fig.tight_layout()
    label_fig.savefig(os.path.join(out_dir, "eda_category_distribution.png"))
    plt.close(label_fig)

def stratified_split_if_possible(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into train and test parts, stratifying on ``Category`` when possible.

    If ``df`` has a ``Category`` column with no missing values, scikit-learn's
    ``train_test_split`` is used with ``stratify=df["Category"]`` and its result
    is returned as-is. If that attempt raises for any reason, the reason is
    printed and the function falls back to the plain split below.

    Fallback: shuffle all rows with ``random_state``, reset the index, and cut
    at ``int(len(df) * (1 - test_size))``; the first part is train, the rest is
    test. The caller's frame is not modified.
    """
    try:
        has_complete_labels = "Category" in df.columns and df["Category"].notna().all()
        if has_complete_labels:
            from sklearn.model_selection import train_test_split as sk_split
            return sk_split(df, test_size=test_size, random_state=random_state, stratify=df["Category"])
    except Exception as e:
        print("Stratified split failed, falling back to random split. Reason:", e)

    shuffled = df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    n_train = int(len(shuffled) * (1.0 - test_size))
    train_part = shuffled.iloc[:n_train].copy()
    test_part = shuffled.iloc[n_train:].copy()
    return train_part, test_part



## Part A data understanding

Load data and preview structure. Create a compact data dictionary. Check missingness and duplicates.


In [None]:

# Fail early, with a pointer to the setting to change, when the CSV is missing.
assert os.path.exists(INPUT_CSV), f"File not found at {INPUT_CSV}. Update INPUT_CSV above."
raw = pd.read_csv(INPUT_CSV, encoding="utf-8", engine="python")

display(raw.head())
print("shape:", raw.shape)
print("columns:", list(raw.columns))

# Compact data dictionary: one row per column with its dtype, null counts and
# the first non-missing value as an example ("" when the column is all-missing).
dd = pd.DataFrame(
    {
        "column": raw.columns,
        "dtype": raw.dtypes.astype(str).to_list(),
        "non_null": raw.notna().sum().astype(int).to_list(),
        "nulls": raw.isna().sum().astype(int).to_list(),
        "example": [
            "" if raw[name].isna().all() else str(raw[name].dropna().iloc[0])
            for name in raw.columns
        ],
    }
)
dd


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


shape: (2484, 4)
columns: ['ID', 'Resume_str', 'Resume_html', 'Category']


Unnamed: 0,column,dtype,non_null,nulls,example
0,ID,int64,2484,0,16852973
1,Resume_str,object,2484,0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...
2,Resume_html,object,2484,0,"<div class=""fontsize fontface vmargins hmargin..."
3,Category,object,2484,0,HR


In [None]:

# Share of missing values per column, as a percentage, largest first.
missing_fraction = raw.isna().mean().sort_values(ascending=False)
missing_pct = missing_fraction * 100.0

# Exact duplicate rows across all columns.
dup_rows = raw.duplicated().sum()

# Duplicates judged on the first available text column only (0 if none exists).
text_cols = [c for c in ["Resume_str", "Resume_html", "Resume", "resume", "text"] if c in raw.columns]
dup_by_text = raw.duplicated(subset=[text_cols[0]]).sum() if text_cols else 0

print("total duplicate rows:", dup_rows)
print("duplicate rows by text col:", dup_by_text)
missing_pct.to_frame("missing_percent")


total duplicate rows: 0
duplicate rows by text col: 2


Unnamed: 0,missing_percent
ID,0.0
Resume_str,0.0
Resume_html,0.0
Category,0.0



## Part B data preparation

Pick the text column, strip HTML if needed, normalize the text, tokenize it, compute the token length, and encode the labels when a `Category` column exists.


In [None]:

# Pick the first resume text column that exists, in order of preference.
text_col = next(
    (cand for cand in ["Resume_str", "Resume_html", "Resume", "resume", "text"] if cand in raw.columns),
    None,
)
assert text_col is not None, "Could not find a resume text column"

df = raw.copy()
if text_col == "Resume_html":
    # clean_html() already maps missing values to "".
    df["resume_text"] = df["Resume_html"].map(clean_html)
else:
    # Fill missing values BEFORE casting: astype(str) turns NaN into the literal
    # text "nan", which the empty-text filter below would never remove.
    df["resume_text"] = df[text_col].fillna("").astype(str)

# Drop rows whose raw text is an exact duplicate of an earlier row.
before = len(df)
df = df.drop_duplicates(subset=["resume_text"]).reset_index(drop=True)
removed_dups = before - len(df)

# Drop rows whose text is empty or whitespace-only, and keep count of them so
# the row totals printed below add up.
before_empty = len(df)
df = df[df["resume_text"].str.strip().ne("")].reset_index(drop=True)
removed_empty = before_empty - len(df)

df["clean_text"] = df["resume_text"].map(basic_text_clean)

# Work on a copy of the stop-word set so the module-level constant stays intact.
stop_words = set(BASIC_STOPWORDS)
df["tokens"] = df["clean_text"].map(lambda t: normalize_tokens(t, stop_words))

df["token_len"] = df["tokens"].map(len)

# Encode categories as integers, numbered in sorted (alphabetical) order.
label_map = None
if "Category" in df.columns:
    # NOTE(review): astype(str) turns a missing Category into the literal label
    # "nan"; revisit if the column can contain nulls.
    df["Category"] = df["Category"].astype(str).str.strip()
    categories = sorted(df["Category"].unique())
    label_map = {c: i for i, c in enumerate(categories)}
    df["label"] = df["Category"].map(label_map)

display(df.head())
print("rows after cleaning:", len(df))
print("duplicates removed:", removed_dups)
print("empty texts removed:", removed_empty)


Unnamed: 0,ID,Resume_str,Resume_html,Category,resume_text,clean_text,tokens,token_len,label
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,hr administrator marketing associate hr admin...,"[administrator, marketing, associate, administ...",464,19
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,"HR SPECIALIST, US HR OPERATIONS ...","hr specialist, us hr operations summary ...","[specialist, operations, versatile, media, bac...",486,19
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,HR DIRECTOR Summary Over 2...,hr director summary over 20 years e...,"[director, recruiting, plus, human, resources,...",647,19
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,HR SPECIALIST Summary Dedica...,"hr specialist summary dedicated, driv...","[specialist, dedicated, driven, dynamic, custo...",231,19
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,HR MANAGER Skill Highlights ...,hr manager skill highlights ...,"[highlights, department, startup, three, organ...",770,19


rows after cleaning: 2481
duplicates removed: 2



## Part C exploratory plots


In [None]:

# Plot the category distribution only when the dataset actually has labels.
label_column = "Category" if "Category" in df.columns else None
eda_plots(df, OUT_DIR, text_col="clean_text", label_col=label_column)
print("Saved plots to:", OUT_DIR)


Saved plots to: /content



## Part D save outputs
Create cleaned data, sample, stratified train and test splits, and a compact JSON report.


In [None]:

# Columns to export, in output order; ID and the label columns only if present.
id_cols = ["ID"] if "ID" in df.columns else []
label_cols = ["Category", "label"] if "Category" in df.columns else []
core_cols = id_cols + label_cols + ["clean_text", "token_len", "tokens"]
df_out = df[core_cols]

# Every output file lives directly in OUT_DIR.
clean_path, train_path, test_path, sample_path = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("Resume_clean.csv", "Resume_train.csv", "Resume_test.csv", "Resume_sample.csv")
)
label_map_path = os.path.join(OUT_DIR, "label_map.json")
report_path = os.path.join(OUT_DIR, "resume_cleaning_report.json")

# Full cleaned table plus a small fixed-seed sample (at most 25 rows).
df_out.to_csv(clean_path, index=False)
sample_size = min(25, len(df_out))
df_out.sample(n=sample_size, random_state=42).to_csv(sample_path, index=False)

# 80/20 split, stratified on Category when the helper is able to.
train_df, test_df = stratified_split_if_possible(df_out, test_size=0.2, random_state=42)
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

saved_paths = [clean_path, train_path, test_path, sample_path]

# The label map exists only when the dataset had a Category column.
if label_map is not None:
    with open(label_map_path, "w", encoding="utf-8") as f:
        json.dump(label_map, f, indent=2)
    saved_paths.append(label_map_path)

# Compact summary of the cleaning run.
report = {
    "rows_after_cleaning": int(len(df_out)),
    "duplicates_removed": int(removed_dups),
    "avg_token_length": float(df_out["token_len"].mean()),
    "median_token_length": float(df_out["token_len"].median()),
    "label_count": int(len(label_map)) if label_map is not None else None
}
with open(report_path, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)
saved_paths.append(report_path)

print("Saved:")
for saved_path in saved_paths:
    print(saved_path)


Saved:
/content/Resume_clean.csv
/content/Resume_train.csv
/content/Resume_test.csv
/content/Resume_sample.csv
/content/label_map.json
/content/resume_cleaning_report.json



## Part E checklist

- Data loaded and previewed  
- Data dictionary created  
- Duplicates removed and missingness reviewed  
- Text normalized and tokenized  
- Basic class balance check  
- Clean CSV and splits saved  
- EDA images generated  
- Report JSON generated  
