In [None]:
import xml.etree.ElementTree as ET
import json

def parse_xml(xml_file):
    """
    Parse a job-feed XML file into a list of dictionaries, one per <job>.

    Only <job> elements that are direct children of the document root are
    read. Every returned dictionary carries the same keys, in the order
    listed in ``job_fields`` below.

    Args:
        - xml_file: File name or file object, as accepted by ``ET.parse``.

    Returns:
        - list[dict]: One dictionary per job. A value is ``None`` when the
          corresponding child element is empty or absent from the job.
    """
    job_fields = (
        "title",
        "company",
        "posted_date",
        "job_reference",
        "req_number",
        "url",
        "body",
        "city",
        "state",
        "country",
        "location",
        "function",
        "logo",
        "jobtype",
        "education",
        "experience",
        "salary",
        "requiredlanguages",
        "requiredskills",
    )

    root = ET.parse(xml_file).getroot()

    jobs_list = []
    for job in root.findall('job'):
        job_dict = {}
        for field in job_fields:
            node = job.find(field)
            # Compare against None explicitly: an Element without children is
            # falsy, and a missing tag must not raise AttributeError on `.text`.
            # `.text` (rather than findtext) keeps empty elements as None.
            job_dict[field] = node.text if node is not None else None
        jobs_list.append(job_dict)

    return jobs_list

# Convert the sample XML feed into a JSON file that the later cells load.
xml_file = 'job_sample.xml'
jobs_list = parse_xml(xml_file)

# Serialize with a 4-space indent so the file stays readable by hand.
json_output = json.dumps(jobs_list, indent=4)
with open('job_openings.json', 'w') as json_file:
    json_file.write(json_output)

In [10]:
import pandas as pd

# Load the job records from 'job_openings.json' (relative to the working directory).
df = pd.read_json('job_openings.json')

# Number of rows in the DataFrame.
print(len(df))

# Preview the first rows.
print(df.head())

1000
                                      title         company posted_date  \
0                             Shift Manager   Buffalo Wings     3/10/23   
1                       Access Provisioning             CTG      9/7/22   
2                             Shift Manager   Buffalo Wings     3/10/23   
3                                Dishwasher  Cracker Barrel     4/28/22   
4  DIRECTOR OF LABOR RELATIONS (NJ, PA, NY)   Compass Group     1/28/23   

   job_reference  req_number  \
0       10794429        3036   
1        8625904        3493   
2       10798555        3036   
3        6989039        4946   
4       10251570        3398   

                                                 url  \
0  https://www.localjobs.com/job/kissimmee-fl-shi...   
1  https://www.localjobs.com/job/san-diego-ca-acc...   
2  https://www.localjobs.com/job/bristol-va-shift...   
3  https://www.localjobs.com/job/kissimmee-fl-dis...   
4  https://www.localjobs.com/job/charlotte-nc-dir...   

              

In [9]:
# Count the null entries in each column of the job DataFrame.
missing_values = df.isna().sum()

# Show the per-column counts.
print(missing_values)

title                   0
company                 0
posted_date             0
job_reference           0
req_number              0
url                     0
body                    0
city                   27
state                  13
country                 4
location                0
function                5
logo                    0
jobtype                 0
education            1000
experience           1000
salary               1000
requiredlanguages    1000
requiredskills       1000
dtype: int64


In [None]:

from langchain.embeddings import HuggingFaceEmbeddings
import json
from langchain.vectorstores import FAISS
import faiss
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# NOTE(review): the conversion cell writes 'job_openings.json' into the working
# directory, while this cell reads it from the parent directory — confirm the path.
with open('../job_openings.json', 'r') as f:
    data = json.load(f)

embeddings = HuggingFaceEmbeddings()

# Serialize every job record back to a JSON string; each string is one
# document handed to the embedding model.
strings = [json.dumps(obj) for obj in data]

doc_result = embeddings.embed_documents(strings)



In [12]:
import numpy as np

# FAISS' Python API works on 2-D float32 NumPy arrays, whereas embed_documents
# is documented to return a list of float lists. Convert once here and leave
# `doc_result` itself untouched, since later cells pickle it as-is.
doc_matrix = np.ascontiguousarray(doc_result, dtype=np.float32)

# The index dimensionality is the length of one embedding vector.
index = faiss.index_factory(doc_matrix.shape[1], "Flat")
index.train(doc_matrix)  # kept from the original; presumably a no-op for "Flat" — TODO confirm
index.add(doc_matrix)

In [14]:
import numpy as np
query_result = embeddings.embed_query("software engineer")

# embed_query is documented to return a plain list of floats, which has no
# .reshape(); build the (1, dim) float32 matrix with NumPy instead. This also
# works unchanged if an ndarray is returned.
query_vector = np.asarray(query_result, dtype=np.float32).reshape(1, -1)

# Retrieve the 5 nearest stored vectors: distances and their row positions.
distances, neighbors = index.search(query_vector, k=5)

In [16]:
# Print the serialized job record for each hit of the first (only) query.
for neighbor in neighbors[0]:
    # FAISS fills unused result slots with label -1 when fewer than k vectors
    # are available; strings[-1] would silently print the last record instead.
    if neighbor < 0:
        continue
    print(strings[neighbor])

{"title": "Application Developer - Junior", "company": "CTG", "posted_date": "4/5/23", "job_reference": "11052592", "req_number": "3493", "url": "https://www.localjobs.com/job/indianapolis-in-application-developer-junior", "body": "<p>Application Developer - Junior</p>\n<p>United States</p>\n<p>Information Technology</p>\n<p>Apr 05, 2023Post Date</p>\n<p>23200577Requisition #</p>\n<p>Apply for JobShare this JobSign Up for Job Alerts</p>\n<p>Also known as: System Engineer, Programmer, System Analyst</p>\n<p>TYPICAL WORK PERFORMED:</p>\n<p>> Designs or assists in the design of applications including mobile applications</p>\n<p>> Creates, and/or modifies existing applications</p>\n<p>> Plans, prepares & analyzes unit tests to detect technical or logic errors</p>\n<p>> Develops test data. Tests modules and analyzes results. Verifies validity of new or modified systems.</p>\n<p>> Analyzes tasks and provides accurate estimates for level of effort required to complete</p>\n<p>> Assists, revie

In [None]:
from faiss import write_index, read_index
# Save the FAISS index to disk, then read it back from the same file.
index_path = "large.index"
write_index(index, index_path)
index = read_index(index_path)

In [18]:
import pickle
import os

from langchain.embeddings.base import Embeddings

from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file


def save_embeddings(
    embeddings: Embeddings,
    saving_embeddings_file_name: str = os.getenv("SAVING_EMBEDDINGS_FILE_NAME"),
    saving_embeddings_directory: str = os.getenv("SAVING_EMBEDDINGS_DIRECTORY"),
) -> None:
    """
    Pickle embeddings to ``<directory>/<file name>.pkl``.

    The directory is resolved relative to the current working directory and
    is created (including parents) when it does not exist yet.

    Args:
        - embeddings (Embeddings): The object to save. It is passed straight to
          ``pickle.dump``, so any picklable object (e.g. a list of vectors) works.
        - saving_embeddings_file_name (str): File name without the ".pkl" suffix.
          Defaults to the SAVING_EMBEDDINGS_FILE_NAME environment variable as
          read when this function is defined.
        - saving_embeddings_directory (str): Directory to save into. Defaults to
          the SAVING_EMBEDDINGS_DIRECTORY environment variable as read when this
          function is defined.

    Returns:
        - None

    Raises:
        - ValueError: If the file name or directory is None, i.e. it was not
          passed and the matching environment variable was unset.
    """

    # The defaults come from os.getenv, which yields None for unset variables;
    # fail with a clear message instead of an opaque TypeError from os.path.join.
    if saving_embeddings_file_name is None:
        raise ValueError(
            "saving_embeddings_file_name is not set; pass it explicitly or "
            "define SAVING_EMBEDDINGS_FILE_NAME"
        )
    if saving_embeddings_directory is None:
        raise ValueError(
            "saving_embeddings_directory is not set; pass it explicitly or "
            "define SAVING_EMBEDDINGS_DIRECTORY"
        )

    directory = os.path.join(os.getcwd(), saving_embeddings_directory)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, saving_embeddings_file_name + ".pkl")

    # Save embeddings to binary file
    with open(file_path, "wb") as f:
        pickle.dump(embeddings, f)

In [20]:
# Pickle the document vectors to ../embeddings/saved_embeddings.pkl
# (save_embeddings resolves the directory against the working directory).
save_embeddings(
    doc_result,
    saving_embeddings_file_name="saved_embeddings",
    saving_embeddings_directory="../embeddings",
)

In [23]:
def load_embeddings(embeddings_path) -> Embeddings:
    """
    Read a pickled embeddings object back from disk.

    Args:
        - embeddings_path (str): Path to the pickle file holding the embeddings.

    Returns:
        - Embeddings: Whatever object was stored in the pickle file.
    """

    # NOTE: unpickling can execute arbitrary code — only load files you trust.
    with open(embeddings_path, "rb") as pickle_file:
        return pickle.load(pickle_file)

In [24]:
# Reload the pickle written by the save_embeddings call above.
embeddings_pickle_path = "../embeddings/saved_embeddings.pkl"
loaded_embeddings = load_embeddings(embeddings_pickle_path)