In [3]:
# Required Dependencies:

# pip install chromadb
# pip install langchain_groq
# pip install langchain_community

In [5]:
# Importing required modules
from langchain_groq import ChatGroq
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [7]:
# Extracts the Groq API Key from '.env.local' file

import os                                                                                                                                                                                                          
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
load_dotenv(Path(".env.local"))
KEY = os.getenv("GROQ_API_KEY")

## Testing the ChatGroq

In [10]:
# Testing if ChatGroq is running correctly

llm = ChatGroq(
    model= 'llama-3.3-70b-versatile',
    temperature= 0,
    groq_api_key= KEY,
    max_tokens= None,
    timeout= None,
    max_retries= 2,
    # Other Parameters...
)

answer = llm.invoke("Who is the richest man in the world of all time?")
print(answer.content)
print("\n ***Preambled Answer***\n")
answer = llm.invoke("Who is the richest man in the world of all time?, no preamble")
print(answer.content)

The richest man in the world of all time is a matter of debate among historians and economists, as the concept of wealth and its measurement have changed over time. However, according to various sources, including Forbes and other financial publications, the richest person in history is often considered to be:

1. **Mansa Musa I** (1280-1337): The king of the Mali Empire, who ruled over a vast territory in West Africa during the 14th century. His wealth is estimated to be around $400 billion in today's dollars, making him the richest person in history.

Mansa Musa's wealth came from his control of the trans-Saharan trade, which included gold, salt, and other valuable commodities. He was known for his extravagant spending and generosity, and his pilgrimage to Mecca in 1324 was said to have been so lavish that it caused a shortage of gold in Egypt.

Other contenders for the title of richest person in history include:

* **John D. Rockefeller** (1839-1937): The American oil industry magna

In [12]:
# Importing Chromadb

import chromadb
client = chromadb.Client()
collection = client.create_collection(name = "my_collection")

In [13]:
collection.add(
    
    documents = [
        "I like drinking water everyday",
        "I like my apple iphone",
        "Do you like apple?"
    ],

    ids = ["id1", "id2", "id3"],

    metadatas = [
        {"url": "source_to_id1"},
        {"url": "source_to_id2"},
        {"url": "source_to_id3"},
    ]
)

In [14]:
print(collection.get())

{'ids': ['id1', 'id2', 'id3'], 'embeddings': None, 'documents': ['I like drinking water everyday', 'I like my apple iphone', 'Do you like apple?'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'url': 'source_to_id1'}, {'url': 'source_to_id2'}, {'url': 'source_to_id3'}]}


In [18]:
print(collection.get(ids = ["id2"]))

{'ids': ['id2'], 'embeddings': None, 'documents': ['I like my apple iphone'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'url': 'source_to_id2'}]}


In [20]:
collection.query(query_texts = ["I like technology"], n_results = 2)

{'ids': [['id2', 'id3']],
 'embeddings': None,
 'documents': [['I like my apple iphone', 'Do you like apple?']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'source_to_id2'}, {'url': 'source_to_id3'}]],
 'distances': [[1.0667046308517456, 1.2515640258789062]]}

In [22]:
collection.query(query_texts = ["Do you eat healthy?"], n_results = 1)

{'ids': [['id1']],
 'embeddings': None,
 'documents': [['I like drinking water everyday']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'source_to_id1'}]],
 'distances': [[1.3786277770996094]]}

## Extracting Data From Job Description

In [25]:
# Using the WebBaseLoader to scrape content of a webpage. Using a demo job description link for testing purposes
loader = WebBaseLoader("https://jobs.foundever.com/job/Canada-IT-Site-System-Supt-Admin-Onsite-in-London%2C-ON-Cana/1286688600/?")

# Stores the loaded data of the webpage
data = loader.load().pop().page_content

In [27]:
# Defining the LLM that is going to be used for data extraction

llm = ChatGroq(
    model= 'llama-3.3-70b-versatile',
    temperature= 0,
    groq_api_key= KEY,
    max_tokens= None,
    timeout= None,
    max_retries= 2,
    # Other Parameters...
)

In [29]:
# Using prompt on the model

extract_from_prompt = PromptTemplate.from_template(
    """
    I will give you scraped text from job postings. Your job is to extract the details and requirements in a JSON format containing the following keywords: 'role', 'experience', 'skills', 'description'
    Only return the JSON. No preamble please.
    Here is the scraped text: {page_data}
    """
)

chain_extract = extract_from_prompt | llm
answer = chain_extract.invoke(input= {'page_data': data})
print(answer.content)

```json
{
    "role": "IT Site System Supt Admin",
    "experience": "5 years of experience in similar capacity preferred",
    "skills": [
        "In-depth knowledge of Windows Operating Systems (PC and Server) and Microsoft Office",
        "Extensive LAN and WAN networking skills",
        "Tape backup and restore procedures",
        "Excellent troubleshooting and customer service skills",
        "Ability to multi-task and take initiatives with minimum direction",
        "Ability to manage and direct others; experience with large enterprise environments"
    ],
    "description": "Technical manager for the site and fully responsible for its functionality, maintenance, software applications, hardware and the security and recovery of all data."
}
```


In [31]:
# Converting the output into JSON using parser

parser = JsonOutputParser()
json_answer = parser.parse(answer.content)
json_answer

{'role': 'IT Site System Supt Admin',
 'experience': '5 years of experience in similar capacity preferred',
 'skills': ['In-depth knowledge of Windows Operating Systems (PC and Server) and Microsoft Office',
  'Extensive LAN and WAN networking skills',
  'Tape backup and restore procedures',
  'Excellent troubleshooting and customer service skills',
  'Ability to multi-task and take initiatives with minimum direction',
  'Ability to manage and direct others; experience with large enterprise environments'],
 'description': 'Technical manager for the site and fully responsible for its functionality, maintenance, software applications, hardware and the security and recovery of all data.'}

In [33]:
# Checking the type of output for confirmation
print(type(json_answer))

<class 'dict'>


## Importing Portfolio

In [36]:
import csv

In [50]:
# Function reads and formats the data in portfolio so that the LLM can understand

def read_file(path):
    current = []
    with open(path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for line in csv_reader:
            # Formatting data
            skills = tuple(line[:-1])
            link = line[-1]
            current.append((skills, link))
        return current

In [64]:
# Sample Usage

path = 'demo_portfolio.csv'
data = read_file(path)


for skills, link in data:
    print(skills, link)

('Python', ' SQL', ' Pandas')  https://github.com/user/project1
('SQL', ' Python', ' Airflow')  https://github.com/user/project2
('PySpark', ' Spark SQL', ' Delta Lake')  https://github.com/user/project3
('Machine Learning', ' Deep Learning', ' TensorFlow')  https://github.com/user/project4
('Data Engineering', ' ETL', ' ELT')  https://github.com/user/project5
('Cloud Platforms (AWS', ' GCP', ' Azure)')  https://github.com/user/project6
('Data Warehousing', ' Data Modeling', ' DBT')  https://github.com/user/project7
('Data Visualization', ' Power BI', ' Tableau')  https://github.com/user/project8
('MLOps', ' MLflow', ' Kubeflow')  https://github.com/user/project9
('Natural Language Processing (NLP)', ' NLTK', ' spaCy')  https://github.com/user/project10
('Computer Vision', ' OpenCV', ' TensorFlow')  https://github.com/user/project11
('Time Series Analysis', ' Forecasting', ' Prophet')  https://github.com/user/project12
('Data Cleaning', ' Data Wrangling', ' Pandas')  https://github.com