In [1]:
import pandas as pd
from FeatureDataClass import IndustryData

# Load the primary Resume -> Industry dataset returned by IndustryData().preprocess_data()
ind = IndustryData().preprocess_data()

# Absolute number of rows per Industry label
industry_counts = ind['Industry'].value_counts()
print("Raw counts:", industry_counts, sep="\n")

# Share of rows per Industry label, scaled from a fraction to a percentage
industry_percentages = ind['Industry'].value_counts(normalize=True) * 100
print("\nPercentage distribution:", industry_percentages.round(2), sep="\n")

# Last expression of the cell, so the notebook renders the frame itself
ind

Raw counts:
Industry
Education                    410
Mechanical Engineer          384
Electrical Engineering       384
Consultant                   368
Civil Engineer               364
Sales                        364
Management                   361
Human Resources              360
Digital Media                358
Accountant                   350
Java Developer               348
Building and Construction    345
Operations Manager           345
Architecture                 344
Testing                      344
Business Analyst             340
Aviation                     340
Finance                      339
SQL Developer                338
Public Relations             337
Health and Fitness           332
Arts                         332
Network Security Engineer    330
DotNet Developer             329
Apparel                      320
Banking                      314
Automobile                   313
Web Designing                309
SAP Developer                304
Data Science          

Unnamed: 0,Industry,resume_text
0,Accountant,education omba executive leadership university...
1,Accountant,howard gerrard accountant deyjobcom birmingham...
2,Accountant,kevin frank senior accountant inforesumekraftc...
3,Accountant,place birth nationality olivia ogilvy accounta...
4,Accountant,stephen greet cpa senior accountant 9 year exp...
...,...,...
13384,Web Designing,jessica claire montgomery street san francisco...
13385,Web Designing,jessica claire montgomery street san francisco...
13386,Web Designing,summary jessica claire 100 montgomery st 10th ...
13387,Web Designing,jessica claire montgomery street san francisco...


It seems like there are a lot of IT-related jobs which aren't labelled IT.

(Information Technology)
React Developer
Python Developer
DevOps
ETL Developer
Web Designing
DotNet Developer
Network Security Engineer
SQL Developer
Java Developer

We could potentially group these into distinct classes, as it could be hard for the agent to pick up on these niche industry domains (which are more job titles than industries).


I'm proposing we perform the following splits:

Under Information Technology, we will now include:
DevOps
SAP Developer
Database
SQL Developer
ETL Developer
Web Designing

We will also create a new class, Software Engineering, which will include:
Java Developer
DotNet Developer
Network Security Engineer
React Developer
Python Developer

Creating a new class


In [6]:
# Fine-grained labels that get folded into each broader class
it_roles = ['DevOps', 'SQL Developer', 'ETL Developer', 'Web Designing', 'Database', 'SAP Developer']
software_eng_roles = ['Java Developer', 'DotNet Developer', 'Network Security Engineer',
                      'React Developer', 'Python Developer']

# Broad label -> labels it absorbs; dict order fixes the order in which the rewrites are applied
relabel_plan = {
    'Information Technology': it_roles,
    'Software Engineering': software_eng_roles,
}

# Overwrite the Industry value in place for every row whose current label belongs to a group
for broad_label, fine_labels in relabel_plan.items():
    in_group = ind['Industry'].isin(fine_labels)
    ind.loc[in_group, 'Industry'] = broad_label

# Report the label distribution after relabelling, as counts and as percentages
updated_labels = ind['Industry']
print("Updated Industry Distribution:", updated_labels.value_counts(), sep="\n")
print("\nPercentages:", updated_labels.value_counts(normalize=True) * 100, sep="\n")


Updated Industry Distribution:
Industry
Information Technology       2065
Software Engineering         1437
Education                     410
Mechanical Engineer           384
Electrical Engineering        384
Consultant                    368
Civil Engineer                364
Sales                         364
Management                    361
Human Resources               360
Digital Media                 358
Accountant                    350
Operations Manager            345
Building and Construction     345
Testing                       344
Architecture                  344
Aviation                      340
Business Analyst              340
Finance                       339
Public Relations              337
Health and Fitness            332
Arts                          332
Apparel                       320
Banking                       314
Automobile                    313
Data Science                  299
Agriculture                   293
Advocate                      291
PMO     

In [7]:
# Row masks for the two merged classes
industry_labels = ind['Industry']
is_it = industry_labels == 'Information Technology'
is_sw = industry_labels == 'Software Engineering'

# Seeded down-sample of each merged class; every other class is kept whole
it_rows = ind[is_it].sample(n=650, random_state=42)
sw_rows = ind[is_sw].sample(n=600, random_state=42)
other_rows = ind[~(is_it | is_sw)]

# Stack the three pieces, shuffle with a fixed seed, and renumber the index from 0
# NOTE(review): duplicate resume_text rows are only dropped in a later cell, so the
# 650/600 targets can end up smaller after that step - confirm this ordering is intended.
df_balanced = (
    pd.concat([other_rows, it_rows, sw_rows])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report the balanced label distribution, as counts and as percentages
balanced_labels = df_balanced['Industry']
print("New Industry Distribution:", balanced_labels.value_counts(), sep="\n")
print("\nPercentages:", balanced_labels.value_counts(normalize=True) * 100, sep="\n")

New Industry Distribution:
Industry
Information Technology       650
Software Engineering         600
Education                    410
Mechanical Engineer          384
Electrical Engineering       384
Consultant                   368
Civil Engineer               364
Sales                        364
Management                   361
Human Resources              360
Digital Media                358
Accountant                   350
Building and Construction    345
Operations Manager           345
Testing                      344
Architecture                 344
Business Analyst             340
Aviation                     340
Finance                      339
Public Relations             337
Arts                         332
Health and Fitness           332
Apparel                      320
Banking                      314
Automobile                   313
Data Science                 299
Agriculture                  293
Advocate                     291
PMO                          286
Designi

In [8]:
# Shuffle the rows so we don't have Industries clustered together.
# The frame is first put back into index order so the seeded shuffle always starts from the
# same arrangement. The cell assigns back to the name it reads, so without this every re-run
# would permute the already-shuffled frame again and change the row order that later cells
# (duplicate removal, seeded sampling) depend on. When the incoming index is already
# 0..n-1 in order, as after a reset_index, sort_index() leaves the rows where they are.
df_balanced = df_balanced.sort_index().sample(frac=1, random_state=42)
df_balanced

Unnamed: 0,Industry,resume_text
9359,BPO,links jessica claire montgomery street san fra...
483,Human Resources,jessica claire 9 resumesampleexamplecom 555 43...
8412,Consultant,roger kuo 201 sansome st 202 san francisco ca ...
8188,Health and Fitness,summary skills jessica claire 100 montgomery s...
7691,PMO,jessica claire 100 montgomery st 10th floor 55...
...,...,...
5734,Consultant,jessica claire resumesampleexamplecom 555 4321...
5191,Agriculture,kiel koelpin 6993 jacobson gardens philadelphi...
5390,Accountant,jessica claire 100 montgomery st 10th floor 55...
860,Human Resources,jessica claire 100 montgomery st 10th floor 55...


I'm also seeing duplicate entries, so let's remove rows that have duplicate resume_text values.

In [9]:
# Keep one row per distinct resume_text (drop_duplicates keeps the first occurrence by default),
# then shuffle the survivors with the fixed seed
deduplicated = df_balanced.drop_duplicates(subset=['resume_text'])
df_balanced = deduplicated.sample(frac=1, random_state=42)

# Report the label distribution after duplicate removal, as counts and as percentages
deduped_labels = df_balanced['Industry']
print("New Industry Distribution:", deduped_labels.value_counts(), sep="\n")
print("\nPercentages:", deduped_labels.value_counts(normalize=True) * 100, sep="\n")

New Industry Distribution:
Industry
Information Technology       618
Software Engineering         559
Education                    388
Electrical Engineering       361
Consultant                   344
Sales                        343
Digital Media                339
Accountant                   336
Mechanical Engineer          336
Building and Construction    335
Finance                      332
Operations Manager           329
Aviation                     327
Testing                      323
Management                   319
Apparel                      315
Business Analyst             310
Public Relations             309
Civil Engineer               307
Architecture                 305
Human Resources              304
Automobile                   303
Health and Fitness           295
Banking                      292
Advocate                     280
Data Science                 274
Arts                         248
PMO                          247
Designing                    246
Agricul

In [19]:
# Draw a fixed-seed sample of 150 rows and attach an experience_level column holding None
df_save = df_balanced.sample(n=150, random_state=42).assign(experience_level=None)

# Write the sample to disk; to_csv is called with its defaults, so the index is written too
df_save.to_csv('local_datasets/ResumeFeatures.csv')