## Part A: Data Preprocessing, Model Training, and Storing the Model in an HDF5 File

### Data Preprocessing: 
Preprocess the text columns (title, description, skills_desc) in the dataset.
### TF-IDF Vectorization: 
Perform TF-IDF vectorization on the preprocessed text data.
### Cosine Similarity Calculation: 
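Fit a nearest-neighbours model that uses the cosine metric on the TF-IDF vectors, then store the pickled model and the fitted TF-IDF vectorizer in an HDF5 file (written with h5py), so they can be used later for making recommendations.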

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import numpy as np
import nltk
import pickle
import h5py
import os

In [None]:
# Fetch the NLTK resources used by the preprocessing code below:
# 'wordnet' for WordNetLemmatizer and 'stopwords' for stopwords.words("english").
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Module-level state shared by the Part A training code.

# TF-IDF vectorizer; process_and_store_model() fits it in place and then pickles it.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)  # Limit the number of features
# Placeholder for a full cosine-similarity matrix. In the code visible here it is
# only referenced by the commented-out legacy implementation.
cosine_sim_matrix = None

In [None]:
# Shared helpers for preprocess_text(): one lemmatizer instance and the English
# stop-word list, held as a set so membership tests are O(1).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


In [None]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    The text is lowercased, every character matching the regex ``\\W`` (anything
    that is not a letter, digit or underscore) is replaced by a space, and the
    result is split on whitespace. Tokens found in the module-level
    ``stop_words`` set are dropped; the rest are passed through the module-level
    ``lemmatizer`` and joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if no tokens survive.
    """
    lowered = text.lower()
    cleaned = re.sub(r'\W', ' ', lowered)

    kept_tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

In [None]:
# def process_and_store_model(data_path="C:/Users/DELL/Linkedin-Job-Market-Analysis-using-ML/LinkedIn Scraper/job postings 2023 24/postings.csv"):
#     # Load data
#     data = pd.read_csv(data_path)
#     new_directory = "D:/ARJYAHI/Models"
#     os.makedirs(new_directory, exist_ok=True)
    
#     # Combine the relevant text columns into a single string for each job
#     data["combined_text"] = data["title"].fillna('') + ' ' + data["description"].fillna('') + ' ' + data["skills_desc"].fillna('')
#     data["combined_text"] = data["combined_text"].apply(preprocess_text)
    
#     # Perform TF-IDF vectorization
#     tfidf_matrix = tfidf_vectorizer.fit_transform(data["combined_text"])
    
#     # Calculate cosine similarity between all jobs and keep it sparse
#     global cosine_sim_matrix
#     cosine_sim_matrix = cosine_similarity(tfidf_matrix, dense_output=False)  # Keep it sparse
    
#     # Store the TF-IDF vectorizer and the sparse cosine similarity matrix in an HDF5 file
#     h5_file_path = os.path.join(new_directory, 'model_data.h5')
#     with h5py.File(h5_file_path, 'w') as h5f:
#         # Save the TF-IDF matrix (sparse matrix) as a dense dataset
#         h5f.create_dataset('tfidf_matrix', data=tfidf_matrix.toarray(), compression='gzip')
        
#         # Save the cosine similarity matrix as a dense dataset
#         h5f.create_dataset('cosine_sim_matrix', data=cosine_sim_matrix, compression='gzip')
        
#         # Save the pickled TF-IDF vectorizer in the HDF5 file as a serialized object
#         tfidf_vectorizer_pickle = pickle.dumps(tfidf_vectorizer)
#         h5f.create_dataset('tfidf_vectorizer', data=np.void(tfidf_vectorizer_pickle))

#     print("Model and cosine similarity matrix have been stored successfully in HDF5 format.")

def process_and_store_model(
    data_path='C:/Users/DELL/Linkedin-Job-Market-Analysis-using-ML/LinkedIn Scraper/job postings 2023 24/postings.csv',
    model_directory="D:/ARJYAHI/Models",
    n_neighbors=10,
):
    """Fit the recommendation model on the job postings and save it to disk.

    Reads the postings CSV, concatenates the ``title``, ``description`` and
    ``skills_desc`` columns (missing values become empty strings), cleans the
    text with ``preprocess_text`` and fits the module-level ``tfidf_vectorizer``
    on it. A ``NearestNeighbors`` model using the cosine metric is then fitted
    on the TF-IDF matrix. Both fitted objects are pickled and written as opaque
    byte blobs to ``model_data.h5`` inside ``model_directory``.

    Side effects:
        * Fits the module-level ``tfidf_vectorizer`` in place.
        * Creates ``model_directory`` if needed and overwrites any existing
          ``model_data.h5`` in it (the file is opened in ``'w'`` mode).

    Args:
        data_path: Path to the job postings CSV file.
        model_directory: Directory in which ``model_data.h5`` is written.
        n_neighbors: Default number of neighbours stored on the fitted
            ``NearestNeighbors`` model.

    Raises:
        ValueError: If ``n_neighbors`` is smaller than 1.
    """
    # Fail fast on a bad argument, before the expensive load / fit work.
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be a positive integer.")

    # Load job postings data
    data = pd.read_csv(data_path)

    # Combine the relevant text columns into a single string for each job
    text_columns = ("title", "description", "skills_desc")
    data["combined_text"] = (
        data[text_columns[0]].fillna('')
        + ' ' + data[text_columns[1]].fillna('')
        + ' ' + data[text_columns[2]].fillna('')
    )
    data["combined_text"] = data["combined_text"].apply(preprocess_text)

    # Perform TF-IDF vectorization (fits the shared module-level vectorizer)
    tfidf_matrix = tfidf_vectorizer.fit_transform(data["combined_text"])

    # Exact (brute-force) nearest-neighbour search with the cosine metric.
    # Row i of the fitted model corresponds to row i of the CSV, so the same
    # file must be used later to map neighbour indices back to postings.
    nbrs = NearestNeighbors(
        n_neighbors=n_neighbors, metric='cosine', algorithm='brute'
    ).fit(tfidf_matrix)

    # Save the model and TF-IDF vectorizer to an HDF5 file
    os.makedirs(model_directory, exist_ok=True)
    h5_file_path = os.path.join(model_directory, 'model_data.h5')

    with h5py.File(h5_file_path, 'w') as h5f:
        # np.void wraps the pickled bytes so h5py stores them as an opaque blob.
        h5f.create_dataset('tfidf_vectorizer', data=np.void(pickle.dumps(tfidf_vectorizer)))
        h5f.create_dataset('nbrs', data=np.void(pickle.dumps(nbrs)))

In [None]:
# Run the Part A pipeline with the function's default arguments: read the
# postings CSV, fit the vectorizer and neighbours model, and write model_data.h5.
process_and_store_model()

## Part B: Loading the Files and Using the Model for Recommendations
In Part B, we load the files and use them to make recommendations based on user input.

In [None]:
import os
import pickle
import re

import h5py
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Initialize the lemmatizer and the English stop-word set used by preprocess_text().
# The stop words are held as a set so membership tests are O(1).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [None]:
# Define a function to preprocess and lemmatize text
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    The text is lowercased, every character matching the regex ``\\W`` (anything
    that is not a letter, digit or underscore) is replaced by a space, and the
    result is split on whitespace. Tokens found in the module-level
    ``stop_words`` set are dropped; the rest are passed through the module-level
    ``lemmatizer`` and joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if no tokens survive.
    """
    lowered = text.lower()
    cleaned = re.sub(r'\W', ' ', lowered)

    kept_tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

In [None]:
# def load_model_and_recommend(title=None, skills=None, top_n=10):
#     # Define the model directory and load the HDF5 file
#     model_directory = "D:/ARJYAHI/Models"
#     h5_file_path = os.path.join(model_directory, 'model_data.h5')
    
#     # Open the HDF5 file and load the TF-IDF vectorizer and cosine similarity matrix
#     with h5py.File(h5_file_path, 'r') as h5f:
#         # Load the TF-IDF vectorizer from the serialized pickle data
#         tfidf_vectorizer_pickle = h5f['tfidf_vectorizer'][()]
#         tfidf_vectorizer = pickle.loads(tfidf_vectorizer_pickle.tobytes())
        
#         # Load the cosine similarity matrix
#         cosine_sim_matrix = h5f['cosine_sim_matrix'][:]

#     # Ensure at least one input is provided
#     if not title and not skills:
#         raise ValueError("Please provide at least a title or skills to get recommendations.")
    
#     # Combine title and skills, if provided, into one input text
#     input_text = ''
#     if title:
#         input_text += title
#     if skills:
#         input_text += ' ' + skills
#     input_text = preprocess_text(input_text)  # Only preprocess the user input here
    
#     # Transform the input text using the loaded TF-IDF vectorizer
#     input_tfidf = tfidf_vectorizer.transform([input_text])
    
#     # Calculate cosine similarity between the input and all job postings
#     cosine_sim = cosine_similarity(input_tfidf, cosine_sim_matrix)
    
#     # Load the job data (same dataset as during training)
#     data = pd.read_csv("C:/Users/DELL/Linkedin-Job-Market-Analysis-using-ML/LinkedIn Scraper/job postings 2023 24/postings.csv")
    
#     # Get similarity scores for the input
#     sim_scores = list(enumerate(cosine_sim[0]))
#     # Sort by similarity score in descending order
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
#     # Get the indices of the top_n most similar jobs
#     sim_indices = [i[0] for i in sim_scores[:top_n]]
    
#     # Return the top_n similar jobs
#     return data.iloc[sim_indices][["job_id", "company_name", "title", "description", "skills_desc", "location"]]

def load_model_and_recommend(
    title=None,
    skills=None,
    top_n=10,
    data_path="C:/Users/DELL/Linkedin-Job-Market-Analysis-using-ML/LinkedIn Scraper/job postings 2023 24/postings.csv",
    model_directory="D:/ARJYAHI/Models",
):
    """Return the job postings most similar to a title and/or skills query.

    Loads the pickled TF-IDF vectorizer and ``NearestNeighbors`` model from
    ``model_data.h5`` in ``model_directory``, vectorizes the cleaned query text
    and looks up its nearest neighbours. The neighbour indices are then used to
    select rows, by position, from the postings CSV.

    Args:
        title: Job title to search for, or None.
        skills: Skills text to search for, or None.
        top_n: Maximum number of postings to return. If the fitted model holds
            fewer postings than this, all of them are returned.
        data_path: Path to the job postings CSV. Rows are selected by position,
            so this must be the same file the model was fitted on.
        model_directory: Directory containing ``model_data.h5``.

    Returns:
        A DataFrame with the columns ``job_id``, ``company_name``, ``title``,
        ``description``, ``skills_desc`` and ``location`` for the selected
        postings, in the order returned by the neighbour search.

    Raises:
        ValueError: If neither ``title`` nor ``skills`` contains any text, or
            if ``top_n`` is smaller than 1.
    """
    # Validate before touching the disk. An empty query would otherwise be
    # vectorized as-is and the lookup would return rows unrelated to any input.
    query_parts = [part for part in (title, skills) if part and part.strip()]
    if not query_parts:
        raise ValueError("Please provide at least a title or skills to get recommendations.")
    if top_n < 1:
        raise ValueError("top_n must be a positive integer.")

    h5_file_path = os.path.join(model_directory, 'model_data.h5')

    with h5py.File(h5_file_path, 'r') as h5f:
        # NOTE(security): pickle.loads can execute arbitrary code. Only load
        # model files that were produced by a trusted run of the training step.
        tfidf_vectorizer = pickle.loads(h5f['tfidf_vectorizer'][()].tobytes())
        nbrs = pickle.loads(h5f['nbrs'][()].tobytes())

    # Combine title and skills into input text and preprocess it the same way
    # the training text was preprocessed.
    input_text = preprocess_text(' '.join(query_parts))

    # Transform the input text using the loaded TF-IDF vectorizer
    input_tfidf = tfidf_vectorizer.transform([input_text])

    # kneighbors() rejects requests for more neighbours than fitted samples,
    # so cap the request at the size of the fitted corpus.
    neighbor_count = min(top_n, nbrs.n_samples_fit_)
    _, indices = nbrs.kneighbors(input_tfidf, n_neighbors=neighbor_count)

    # Load the job postings dataset and pick the matching rows by position
    data = pd.read_csv(data_path)
    return data.iloc[indices[0]][["job_id", "company_name", "title", "description", "skills_desc", "location"]]


In [None]:
# Example query: look up postings similar to a job title and print them.
title_input = "Data Scientist"  # Job title to match; set to None to search by skills only
skills_input = None  # Skills text to match; set to a string to include skills in the query
similar_jobs = load_model_and_recommend(title=title_input, skills=skills_input)  # top_n keeps its default of 10
print(similar_jobs)