In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,normalize
from sklearn.feature_selection import SelectFromModel
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data

In [100]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [74]:
df = pd.read_excel('combined_data.xlsx')

In [75]:
df = pd.read_csv('combined_csv_files.csv',sep=',')

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299729 entries, 0 to 299728
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   job_url           299729 non-null  object 
 1   site              299729 non-null  object 
 2   title             299729 non-null  object 
 3   company           299728 non-null  object 
 4   company_url       214473 non-null  object 
 5   location          293647 non-null  object 
 6   job_type          41840 non-null   object 
 7   date_posted       290623 non-null  object 
 8   interval          58583 non-null   object 
 9   min_amount        73970 non-null   float64
 10  max_amount        73970 non-null   float64
 11  currency          74753 non-null   object 
 12  is_remote         61292 non-null   object 
 13  num_urgent_words  143664 non-null  float64
 14  benefits          126224 non-null  object 
 15  emails            10747 non-null   object 
 16  description       14

In [78]:
df.count()

job_url             299729
site                299729
title               299729
company             299728
company_url         214473
location            293647
job_type             41840
date_posted         290623
interval             58583
min_amount           73970
max_amount           73970
currency             74753
is_remote            61292
num_urgent_words    143664
benefits            126224
emails               10747
description         143664
dtype: int64

In [79]:
df.describe()

Unnamed: 0,min_amount,max_amount,num_urgent_words
count,73970.0,73970.0,143664.0
mean,44931.790164,58411.25,0.039648
std,46913.513532,65210.55,0.218394
min,0.0,-1.0,0.0
25%,70.0,85.0,0.0
50%,40000.0,50000.0,0.0
75%,65779.54,85000.0,0.0
max,500000.0,1400000.0,5.0


In [80]:
df.isnull().sum()

job_url                  0
site                     0
title                    0
company                  1
company_url          85256
location              6082
job_type            257889
date_posted           9106
interval            241146
min_amount          225759
max_amount          225759
currency            224976
is_remote           238437
num_urgent_words    156065
benefits            173505
emails              288982
description         156065
dtype: int64

In [81]:
filtered_df = df[df['description'].notnull()]

In [82]:
filtered_df.isna().sum()

job_url                  0
site                     0
title                    0
company                  0
company_url          85254
location                 0
job_type            101833
date_posted              0
interval             85082
min_amount           85866
max_amount           85866
currency             85083
is_remote            82382
num_urgent_words         0
benefits            143664
emails              132917
description              0
dtype: int64

In [83]:
numerical_cols = ['min_amount', 'max_amount', 'num_urgent_words']
for col in numerical_cols:
    filtered_df[col].fillna(filtered_df[col].median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col].fillna(filtered_df[col].median(), inplace=True)


In [84]:
# Step 2: Exclude sparse columns ('emails', 'benefits')
columns_to_exclude = ['emails', 'benefits','interval','currency','company_url']
combined_data_cleaned = filtered_df.drop(columns=columns_to_exclude)

In [85]:
combined_data_cleaned.isna().sum()

job_url                  0
site                     0
title                    0
company                  0
location                 0
job_type            101833
date_posted              0
min_amount               0
max_amount               0
is_remote            82382
num_urgent_words         0
description              0
dtype: int64

In [86]:
combined_data_cleaned['job_type'].fillna('No job_type', inplace=True)
combined_data_cleaned['is_remote'].fillna('False', inplace=True)

In [87]:
combined_data_cleaned.n().sum()

job_url             0
site                0
title               0
company             0
location            0
job_type            0
date_posted         0
min_amount          0
max_amount          0
is_remote           0
num_urgent_words    0
description         0
dtype: int64

In [88]:
combined_data_cleaned.count()

job_url             143664
site                143664
title               143664
company             143664
location            143664
job_type            143664
date_posted         143664
min_amount          143664
max_amount          143664
is_remote           143664
num_urgent_words    143664
description         143664
dtype: int64

In [89]:
jobs_df = combined_data_cleaned

# Simplified text preprocessing


In [90]:
def preprocess_text_simple(text):
    # Lowercasing
    text = text.lower()
    # Removing non-alphabet characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize text
    words = text.split()
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply simplified preprocessing to the job description


In [91]:
jobs_df['description_processed'] = jobs_df['description'].apply(preprocess_text_simple)

# Vectorize the text using TF-IDF


In [92]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(jobs_df['description_processed'])

# Normalize the feature vectors to unit norm


In [93]:
X_normalized = normalize(X)

# Apply K-means clustering


In [101]:
# Apply K-means clustering
from sklearn.metrics import silhouette_score

# Example: Trying different numbers of clusters and computing silhouette scores
scores = []
range_n_clusters = list(range(2, 11))  # Example: from 2 to 10 clusters

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_normalized)
    silhouette_avg = silhouette_score(X_normalized, kmeans.labels_)
    scores.append(silhouette_avg)
    print(f"Clusters: {n_clusters}, Silhouette Score: {silhouette_avg}")

  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 2, Silhouette Score: 0.014056621772652681


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 3, Silhouette Score: 0.018153825340085944


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 4, Silhouette Score: 0.022667436014716456


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 5, Silhouette Score: 0.015798890573895323


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 6, Silhouette Score: 0.02214560075129343


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 7, Silhouette Score: 0.02736099979931349


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 8, Silhouette Score: 0.029328230601322666


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 9, Silhouette Score: 0.03550524959015533


  super()._check_params_vs_input(X, default_n_init=10)


Clusters: 10, Silhouette Score: 0.035913790302533305


In [94]:
num_clusters = 8  # This is an arbitrary choice; adjust based on exploratory analysis
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_normalized)

  super()._check_params_vs_input(X, default_n_init=10)


# Assign cluster labels to each job posting


In [95]:
jobs_df['cluster'] = kmeans.labels_

# Prepare to explore the clusters


In [96]:
# Example: Analyze a specific cluster
suspect_cluster_index = 0  # Assuming cluster 0 is suspicious based on your analysis
suspect_jobs = jobs_df[jobs_df['cluster'] == suspect_cluster_index]

print("Suspect Cluster Job Titles:")
print(suspect_jobs['title'].value_counts().head(10))

Suspect Cluster Job Titles:
title
Medical Assistant Certified                     486
Equipment Maintenance Tech Support Assistant    448
Project Engineer                                324
Cash Reconciliation Analyst                     302
Engineer, Projects                              244
Operations Administrator                        243
Controls Engineer - $85K to $100K salary        215
BAS Applications Engineer                       200
MRI Technologist                                193
WA - MRI Technologist - $120 - Days             189
Name: count, dtype: int64


## Evaluate Cluster Quality


In [97]:
def print_top_terms_per_cluster(tfidf_vectorizer, kmeans_model, n_terms=10):
    # Get the feature names from the TF-IDF vectorizer
    terms = tfidf_vectorizer.get_feature_names_out()
    # Get the centroids of the clusters
    centroids = kmeans_model.cluster_centers_
    
    for i, centroid in enumerate(centroids):
        print(f"Cluster {i}:")
        # Sort the features (terms) in the centroid in descending order of importance
        top_terms_indices = centroid.argsort()[-n_terms:][::-1]
        top_terms = [terms[ind] for ind in top_terms_indices]
        print(", ".join(top_terms))

# Assuming you have your tfidf_vectorizer and kmeans model ready
print_top_terms_per_cluster(tfidf_vectorizer, kmeans, n_terms=10)

Cluster 0:
project, work, engineering, equipment, design, ability, required, experience, customer, system
Cluster 1:
equipment, sse, pb, learning, handling, fee, operate, material, asset, paid
Cluster 2:
network, system, computer, cyber, course, information, defense, experience, engineering, status
Cluster 3:
software, engineering, engineer, computer, experience, development, bachelor, electrical, java, system
Cluster 4:
system, experience, quality, management, solution, network, administer, design, administrator, software
Cluster 5:
security, information, experience, network, system, risk, cyber, year, incident, cybersecurity
Cluster 6:
data, business, analytics, le, analyst, ai, database, sql, analysis, experience
Cluster 7:
rpa, automation, rate, hiring, ndash, soil, protection, process, pay, capture
Cluster 8:
team, experience, working, work, business, service, role, support, development, skill
Cluster 9:
developer, software, web, team, experience, workforce, role, job, development

# Evaluate Topic Relevance


In [98]:
from sklearn.decomposition import LatentDirichletAllocation

# Assuming you've already vectorized your text data with TF-IDF and want to apply LDA on it
lda = LatentDirichletAllocation(n_components=num_clusters, random_state=42)
lda.fit(X_normalized)

def print_top_words_per_topic(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        print(" ".join(top_features))

print_top_words_per_topic(lda, tfidf_vectorizer.get_feature_names_out(), 10)

Topic 0:
software developer automation experience test team development solution web technology
Topic 1:
software developer office le looking work role working team type
Topic 2:
data pb power sse business bi analytics system asset performs
Topic 3:
security brbr data game business financial work threat intelligence system
Topic 4:
software experience ppmiddot regarding solution year knowledge client security information
Topic 5:
experience software system server handson management devops best administrator qa
Topic 6:
equipment system engineering data electrical design software perform operation engineer
Topic 7:
security network system information computer status experience course cyber employee
Topic 8:
security engineering experience information year management system risk support incident
Topic 9:
team work ncr working br support business experience role service


### check score correlation 

To do 

#### check data from kaggle to test check desc => 1000 +scraping data to eval model score 

## Modeling 

In [68]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Example pipeline setup (simplified)
text_features = 'description_processed'  # Assuming this is your text column
numeric_features = ['min_amount', 'max_amount', 'num_urgent_words']  # Example numeric feature
categorical_features = ['location', 'job_type']  # Example categorical features

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), text_features),
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])


In [69]:
# Example model (this could be a clusterer or classifier, depending on the stage)
model = KMeans(n_clusters=5, random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Fit the pipeline with your data
pipeline.fit(jobs_df)



  super()._check_params_vs_input(X, default_n_init=10)


In [71]:
# Predict or transform new data
# Example new job postings
new_jobs = [
    {"description": "Senior Data Scientist, NYC, high salary.", "salary_range": 150000, "location": "NYC", "job_type": "Full-time"},
    {"description": "Entry-level Software Developer, remote work available.", "salary_range": 80000, "location": "Remote", "job_type": "Full-time"}
]

In [72]:
new_data = pd.DataFrame(new_jobs)
