In [43]:
import pandas as pd
import numpy as np
import nltk
import json
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
nltk.download('all', force=True)
nltk.data.path.append("/Users/laurenceliao/nltk_data")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/laurenceliao/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/laurenceliao/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/laurenceliao/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/laurenceliao/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/laurenceliao/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /U

In [291]:
tech_data = pd.read_csv("tech_jobs_trainingset.csv")

tech_data

Unnamed: 0,advertiserurl,jobdescription,jobtitle,skills,tag,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,https://www.dice.com/jobs/detail/AUTOMATION-TE...,Looking for Selenium engineers...must have sol...,AUTOMATION TEST ENGINEER,SEE BELOW,automation,,Buckets:,tag: \n-automation\n-it\n-backend_dev...
1,https://www.dice.com/jobs/detail/Information-S...,The University of Chicago has a rapidly growin...,Information Security Engineer,"linux/unix, network monitoring, incident respo...",it,,DevOps\nPM (project management)\nTesting\nmain...,qa
2,https://www.dice.com/jobs/detail/Business-Solu...,"GalaxE.SolutionsEvery day, our solutions affec...",Business Solutions Architect,"Enterprise Solutions Architecture, business in...",architect,,,bi
3,https://www.dice.com/jobs/detail/Java-Develope...,Java DeveloperFull-time/direct-hireBolingbrook...,"Java Developer (mid level)- FT- GREAT culture,...",Please see job description,backend_dev,,,architect
4,https://www.dice.com/jobs/detail/DevOps-Engine...,Midtown based high tech firm has an immediate ...,DevOps Engineer,"Configuration Management, Developer, Linux, Ma...",devops,,,ui
...,...,...,...,...,...,...,...,...
998,https://www.dice.com/jobs/detail/Software-Arch...,Global financial firm whose sole business is i...,Software Architect for Elite Hedge Fund,"Software Architect, .Net, Cloud",architect,,,
999,https://www.dice.com/jobs/detail/Principal-Eng...,"Principal EngineerLocation: Boston, MA About Y...",Principal Engineer,"Agile, Analysis, Apache, Architecture, CASE, D...",,,,
1000,https://www.dice.com/jobs/detail/Oracle-Financ...,Position: Oracle Financial Specialist Locati...,Oracle Financial Specialist,"Oracle Financial,EBS,Oracle EBS,Oracle",,,,
1001,https://www.dice.com/jobs/detail/Strategy-Cons...,Are you passionate about delivering goals to y...,Strategy Consulting - Human Capital - HR Share...,"Consulting, Development, Excel, HTML, HTTP, Ma...",,,,


In [292]:
# Data Creation
tech_training = pd.DataFrame(tech_data[['jobtitle', 'jobdescription', 'tag']])

tech_training = tech_training.dropna()
tech_training = tech_training.reindex()


In [293]:
# Data Preprocessing
tech_training = tech_training.applymap(lambda x: x.lower() if isinstance(x, str) else x)
tech_training

old_word = r'\bit\b'

# This is to avoid the stopwords function from removing It since it reads it as "it"
tech_training['tag'] = tech_training['tag'].str.replace(old_word, 'information_technology', regex=True)



In [294]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Removes special characters
def remove_special_characters(text):
    pattern = r'[^a-zA-Z\s]'  
    cleaned_text = re.sub(pattern, '', text)
    
    return cleaned_text

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# remove whitespace
def remove_whitespace(text):
    return " ".join(text.split())


# remove keyfault stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text

# Stemming 

stemmer = PorterStemmer()

def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems

# Lemmatization
lemmatizer = WordNetLemmatizer

def lemma_words(text):
    word_tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    return lemmatized_words

tech_training = tech_training.applymap(lambda x: remove_special_characters(x) if isinstance(x, str) else x)
tech_training = tech_training.applymap(lambda x: remove_punctuation (x) if isinstance(x, str) else x)
tech_training = tech_training.applymap(lambda x: remove_whitespace(x) if isinstance(x, str) else x)
tech_training = tech_training.applymap(lambda x: remove_stopwords(x) if isinstance(x, str) else x)
tech_training = tech_training.applymap(lambda x: stem_words(x) if isinstance(x, str) else x)
tech_training = tech_training.applymap(lambda x: lemma_words(x) if isinstance(x, str) else x)




In [295]:
# Combining both job title and job description
tech_training['combined'] = tech_training['jobtitle'] + tech_training['jobdescription']
X = tech_training['combined'].apply(lambda x:x[0])




y = tech_training['tag'].apply(lambda x: x[0])


# Train Test Split (80% training data, 20% testing data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [300]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize 
tfidf_vectorizer = TfidfVectorizer()

# Fit transform the vectorizer

X_train_Tfidf_df = tfidf_vectorizer.fit_transform(X_train).toarray()

X_train_Tfidf_df = pd.DataFrame(X_train_Tfidf_df)

X_test_Tfidf_df = tfidf_vectorizer.transform(X_test).toarray()

X_test_Tfidf_df = pd.DataFrame(X_test_Tfidf_df)

#tfidf_vectorizer.get_feature_names_out()
X_train_Tfidf_df




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,123,124,125,126,127,128,129,130,131,132
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [297]:
model = GaussianNB()

model.fit(X_train_Tfidf_df, y_train)


GaussianNB()


In [298]:
y_pred = model.predict(X_test_Tfidf_df)

In [299]:
accuracy_score(y_test, y_pred)


0.2755102040816326

In [308]:
if y_pred.ndim == 1:
    y_pred = pd.Series(y_pred, index=y_test.index)
else:
    y_pred = pd.DataFrame(y_pred, index=y_test.index)

# Create DataFrame for checking
check = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
check

Unnamed: 0,Actual,Predicted
223,webdev,webdev
764,security,bi
491,software,backenddev
584,informationtechnology,networks
816,software,embedded
...,...,...
730,software,artchitect
153,databases,quality
855,databases,ui
546,backenddev,networks
