In [38]:
import pandas as pd

In [39]:
df = pd.read_csv("data/jobs_dataset_with_features.csv")

In [40]:
df.head(2)

Unnamed: 0,Role,Features
0,Social Media Manager,5 to 15 Years Digital Marketing Specialist M.T...
1,Frontend Web Developer,"2 to 12 Years Web Developer BCA HTML, CSS, Jav..."


In [41]:
df.columns

Index(['Role', 'Features'], dtype='object')

In [42]:
for value in df['Role'].value_counts():
    print(value)

20580
17470
14036
13945
13935
13757
10659
10541
10512
10496
10482
10407
10406
10404
10362
10361
10308
10308
10305
7063
7052
7028
7027
7021
7020
7016
7014
7004
7003
7002
7000
6988
6983
6979
6975
6974
6965
6959
6955
6941
6939
6935
6932
6929
6926
6918
6913
6906
6903
6902
6898
6897
6878
6857
6852
6850
6839
6830
6803
6799
6743
3645
3607
3602
3600
3598
3594
3589
3584
3583
3579
3576
3575
3572
3570
3569
3569
3563
3563
3562
3559
3556
3556
3555
3554
3554
3552
3552
3550
3549
3548
3548
3547
3545
3545
3544
3542
3541
3541
3540
3540
3540
3540
3540
3538
3536
3536
3535
3534
3533
3531
3530
3530
3530
3529
3529
3527
3526
3526
3525
3525
3524
3523
3523
3522
3522
3521
3521
3521
3520
3520
3520
3518
3518
3518
3518
3518
3517
3516
3516
3516
3515
3515
3515
3514
3513
3513
3513
3513
3512
3512
3512
3512
3511
3510
3509
3509
3509
3508
3508
3508
3508
3507
3507
3505
3505
3504
3504
3503
3503
3499
3499
3499
3498
3498
3498
3497
3497
3496
3496
3495
3494
3494
3493
3493
3492
3492
3492
3492
3492
3492
3492
3492
3492
3491
3491
3

In [43]:
df.shape

(1615940, 2)

In [44]:
df.head()

Unnamed: 0,Role,Features
0,Social Media Manager,5 to 15 Years Digital Marketing Specialist M.T...
1,Frontend Web Developer,"2 to 12 Years Web Developer BCA HTML, CSS, Jav..."
2,Quality Control Manager,0 to 12 Years Operations Manager PhD Quality c...
3,Wireless Network Engineer,4 to 11 Years Network Engineer PhD Wireless ne...
4,Conference Manager,1 to 12 Years Event Manager MBA Event planning...


In [45]:
# Dropping classes with less than 6500 instances
min_count = 6500
role_counts = df['Role'].value_counts()
dropped_classes = role_counts[role_counts < min_count].index
df = df[~df['Role'].isin(dropped_classes)].reset_index(drop=True)

# Checking the updated role counts
df['Role'].value_counts()

Interaction Designer          20580
Network Administrator         17470
User Interface Designer       14036
Social Media Manager          13945
User Experience Designer      13935
                              ...  
Benefits Coordinator           6839
Research Analyst               6830
Administrative Coordinator     6803
IT Support Specialist          6799
UI/UX Designer                 6743
Name: Role, Length: 61, dtype: int64

In [46]:
from sklearn.utils import resample

# Define the maximum count among all categories
max_count = df['Role'].value_counts().max()

# Resample each category to match the maximum count
balanced_data = []
for category in df['Role'].unique():
    category_data = df[df['Role'] == category]
    if len(category_data) < max_count:
        # Perform oversampling for categories with fewer samples
        balanced_category_data = resample(category_data, replace=True, n_samples=max_count, random_state=42)
    else:
        # Perform undersampling for categories with more samples
        balanced_category_data = resample(category_data, replace=False, n_samples=max_count, random_state=42)
    balanced_data.append(balanced_category_data)

# Concatenate the balanced data for all categories
filtered_df = pd.concat(balanced_data)

In [47]:
len(filtered_df['Role'].value_counts())

61

In [48]:
filtered_df.shape

(1255380, 2)

In [49]:
# df = filtered_df.sample(n=10000)

In [50]:
filtered_df.head()

Unnamed: 0,Role,Features
267322,Social Media Manager,5 to 11 Years Digital Marketing Specialist BCA...
31847,Social Media Manager,5 to 13 Years Digital Marketing Specialist MCA...
199738,Social Media Manager,3 to 14 Years Digital Marketing Specialist PhD...
500922,Social Media Manager,4 to 8 Years Digital Marketing Specialist BCA ...
192634,Social Media Manager,0 to 15 Years Digital Marketing Specialist BBA...


# TFIDF

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Splitting the data into features (X) and target (y)
X = filtered_df['Features']
y = filtered_df['Role']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [52]:
# RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_tfidf)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


# Recommendation

In [53]:
# Clean resume
import re
def cleanResume(txt):
    cleanText = re.sub('http\S+\s', ' ', txt)
    cleanText = re.sub('RT|cc', ' ', cleanText)
    cleanText = re.sub('#\S+\s', ' ', cleanText)
    cleanText = re.sub('@\S+', '  ', cleanText)  
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText) 
    cleanText = re.sub('\s+', ' ', cleanText)
    return cleanText


# Prediction and Category Name
def job_recommendation(resume_text):
    resume_text= cleanResume(resume_text)
    resume_tfidf = tfidf_vectorizer.transform([resume_text])
    predicted_category = rf_classifier.predict(resume_tfidf)[0]
    return predicted_category

In [54]:
# Example Usage
resume_file = """Objective:
A creative and detail-oriented Designer with a passion for visual communication and brand identity seeking opportunities to leverage design skills in a dynamic and collaborative environment.

Education:
- Bachelor of Fine Arts in Graphic Design, XYZ College, GPA: 3.7/4.0
- Diploma in Web Design, ABC Institute, GPA: 3.9/4.0

Skills:
- Proficient in Adobe Creative Suite (Photoshop, Illustrator, InDesign)
- Strong understanding of typography, layout, and color theory
- Experience in both print and digital design
- Ability to conceptualize and execute design projects from concept to completion
- Excellent attention to detail and time management skills

Experience:
Graphic Designer | XYZ Design Studio
- Created visually appealing graphics for various marketing materials, including brochures, flyers, and social media posts
- Collaborated with clients to understand their design needs and deliver creative solutions that align with their brand identity
- Worked closely with the marketing team to ensure consistency in brand messaging across all platforms

Freelance Designer
- Designed logos, branding materials, and website layouts for small businesses and startups
- Managed multiple projects simultaneously while meeting tight deadlines and maintaining quality standards
- Established and maintained strong client relationships through clear communication and exceptional service

Projects:
- Rebranding Campaign for XYZ Company: Led a team to redesign the company's logo, website, and marketing collateral, resulting in a 30% increase in brand recognition
- Packaging Design for ABC Product Launch: Developed eye-catching packaging designs for a new product line, contributing to a successful launch and positive customer feedback

Certifications:
- Adobe Certified Expert (ACE) in Adobe Illustrator
- Responsive Web Design Certification from Udemy

Languages:
- English (Native)
- Spanish (Intermediate)
"""
predicted_category = job_recommendation(resume_file)
print("Predicted Category:", predicted_category)

Predicted Category: User Interface Designer


In [55]:
# Example Usage
resume_file = """Objective:
Dedicated and results-oriented Banking professional with a strong background in financial analysis and customer service seeking opportunities to contribute to a reputable financial institution. Eager to leverage expertise in risk management, investment strategies, and relationship building to drive business growth and client satisfaction.

Education:
- Bachelor of Business Administration in Finance, XYZ University, GPA: 3.8/4.0
- Certified Financial Analyst (CFA) Level I Candidate

Skills:
- Proficient in financial modeling and analysis using Excel, Bloomberg Terminal, and other financial software
- Extensive knowledge of banking products and services, including loans, mortgages, and investment products
- Strong understanding of regulatory compliance and risk management practices in the banking industry
- Excellent communication and interpersonal skills, with a focus on building rapport with clients and colleagues
- Ability to work efficiently under pressure and adapt to changing market conditions

Experience:
Financial Analyst | ABC Bank
- Conducted financial analysis and risk assessment for corporate clients, including credit analysis, financial statement analysis, and cash flow modeling
- Developed customized financial solutions to meet clients' needs and objectives, resulting in increased revenue and client retention
- Collaborated with cross-functional teams to identify new business opportunities and optimize existing processes

Customer Service Representative | DEF Bank
- Provided exceptional customer service to bank clients, addressing inquiries, resolving issues, and promoting banking products and services
- Processed transactions accurately and efficiently, including deposits, withdrawals, and account transfers
- Educated customers on various banking products and services, helping them make informed financial decisions

Internship | GHI Investments
- Assisted portfolio managers with investment research and analysis, including industry and company-specific research, financial modeling, and performance analysis
- Prepared investment presentations and reports for clients, highlighting investment opportunities and performance metrics
- Conducted market research and analysis to identify trends and opportunities in the financial markets

Certifications:
- Certified Financial Planner (CFP)
- Series 7 and Series 63 Securities Licenses

Languages:
- English (Native)
- Spanish (Proficient)

"""
predicted_category = job_recommendation(resume_file)
print("Predicted Category:", predicted_category)

Predicted Category: Financial Analyst


In [57]:
import pickle
pickle.dump(rf_classifier,open('models/rf_classifier_job_recommendation6500.pkl','wb'))
pickle.dump(tfidf_vectorizer,open('models/tfidf_vectorizer_job_recommendation6500.pkl','wb'))