In [3]:
# Required Dependencies:

# pip install chromadb
# pip install langchain_groq
# pip install langchain_community

In [5]:
# Importing required modules
from langchain_groq import ChatGroq
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import csv
import uuid
import chromadb
import json

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
# Extracts the Groq API Key from '.env.local' file

import os                                                                                                                                                                                                          
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
load_dotenv(Path(".env.local"))
KEY = os.getenv("GROQ_API_KEY")

## Testing the ChatGroq

In [10]:
# Testing if ChatGroq is running correctly

llm = ChatGroq(
    # model= 'llama-3.3-70b-versatile',
    model= 'gemma2-9b-it',
    temperature= 0,
    groq_api_key= KEY,
    max_tokens= None,
    timeout= None,
    max_retries= 2,
    # Other Parameters...
)

answer = llm.invoke("Who is the richest man in the world of all time?")
print(answer.content)
print("\n ***Preambled Answer***\n")
answer = llm.invoke("Who is the richest man in the world of all time?, no preamble")
print(answer.content)

It's impossible to say definitively who the richest person in history is. 

Here's why:

* **Lack of Consistent Data:**  Reliable wealth records simply don't exist for most of history. We can't accurately compare the wealth of someone who lived in ancient Rome to someone who lived in the 20th century.
* **Different Economies:**  The value of money changes drastically over time due to inflation, currency fluctuations, and changes in economic systems. 
* **Defining "Wealth":**  Wealth isn't just about money. It can include land, resources, assets, and even influence and power.  

**Some contenders often mentioned in discussions of the richest people in history include:**

* **Mansa Musa:** The 14th-century King of Mali, whose wealth from gold and salt mines was legendary.
* **Augustus Caesar:** The first Roman Emperor, who controlled vast territories and resources.
* **Jacob Fugger:** A 16th-century German merchant and banker who financed the Holy Roman Empire.
* **John D. Rockefeller:**

In [12]:
# Chromadb Setup

client = chromadb.Client()
collection = client.create_collection(name = "my_collection")

In [14]:
collection.add(
    
    documents = [
        "I like drinking water everyday",
        "I like my apple iphone",
        "Do you like apple?"
    ],

    ids = ["id1", "id2", "id3"],

    metadatas = [
        {"url": "source_to_id1"},
        {"url": "source_to_id2"},
        {"url": "source_to_id3"},
    ]
)

In [15]:
print(collection.get())

{'ids': ['id1', 'id2', 'id3'], 'embeddings': None, 'documents': ['I like drinking water everyday', 'I like my apple iphone', 'Do you like apple?'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'url': 'source_to_id1'}, {'url': 'source_to_id2'}, {'url': 'source_to_id3'}]}


In [18]:
print(collection.get(ids = ["id2"]))

{'ids': ['id2'], 'embeddings': None, 'documents': ['I like my apple iphone'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'url': 'source_to_id2'}]}


In [20]:
collection.query(query_texts = ["I like technology"], n_results = 2)

{'ids': [['id2', 'id3']],
 'embeddings': None,
 'documents': [['I like my apple iphone', 'Do you like apple?']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'source_to_id2'}, {'url': 'source_to_id3'}]],
 'distances': [[1.0667046308517456, 1.2515640258789062]]}

In [22]:
collection.query(query_texts = ["Do you eat healthy?"], n_results = 1)

{'ids': [['id1']],
 'embeddings': None,
 'documents': [['I like drinking water everyday']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'source_to_id1'}]],
 'distances': [[1.3786277770996094]]}

## Extracting Data From Job Description

In [25]:
# Using the WebBaseLoader to scrape content of a webpage. Using a demo job description link for testing purposes
loader = WebBaseLoader("https://amazon.jobs/en/jobs/2729754/cloud-support-intern")

# Stores the loaded data of the webpage
data = loader.load().pop().page_content

In [27]:
# Defining the LLM that is going to be used for data extraction

llm = ChatGroq(
    model= 'llama-3.3-70b-versatile',
    # model= 'gemma2-9b-it',
    temperature= 0.5,
    groq_api_key= KEY,
    max_tokens= None,
    timeout= None,
    max_retries= 2,
    # Other Parameters...
)

In [29]:
# Using prompt on the model

extract_from_prompt = PromptTemplate.from_template(
    """
    I will give you scraped text from job postings. Your job is to extract the details and requirements in a JSON format containing the following keywords: 'role', 'experience', 'skills', 'description'
    Only return the JSON. No preamble please.
    Here is the scraped text: {page_data}
    """
)

chain_extract = extract_from_prompt | llm
answer = chain_extract.invoke(input= {'page_data': data})
print(answer.content)
print(type(answer.content))

```
{
  "role": "Cloud Support Intern",
  "experience": "Internship",
  "skills": [
    "Cloud computing",
    "Database, Big Data, Analytics",
    "Networking (DNS, TCP/IP, HTTP, VLAN, etc.)",
    "OS (Linux and/or Windows Servers)",
    "Virtualization (VMware, Xen, Hypervisor)",
    "Security concepts / best practices",
    "Storage and Content Delivery",
    "Deployment",
    "Developer & Mobile Services (Serverless, Web Mobile, IoT)",
    "Programming / scripting experience (Java, Perl, Ruby, C#, and/or PHP)",
    "Korean/English Business Communication Skills"
  ],
  "description": "Amazon Web Services (AWS) is seeking a Cloud Support Intern to join their team in Korea. The successful candidate will have a strong focus on customer support, technical troubleshooting, and cloud computing. They will work with leading companies and internal development teams to resolve complex technical issues and drive customer interactions. The intern will also have the opportunity to learn and deve

### Cleaning the JSON (For Gemini)

In [96]:
raw_output = answer.content

# Remove triple backticks and 'json' if present
cleaned_output = raw_output.strip('`json').strip('`')

# Load the string as a dictionary
data = json.loads(cleaned_output)

# Clean up the fields
for key in ['role', 'experience', 'skills', 'description']:
    if isinstance(data.get(key), str):
        data[key] = data[key].replace('\n', ' ')
    elif isinstance(data.get(key), list):
        data[key] = [item.replace('\n', ' ') for item in data[key]]

# Now 'data' is your cleaned JSON object
cleaned_answer = json.dumps(data, indent=2)
print(cleaned_answer)
print(type(cleaned_answer))

{
  "role": "Data Engineer/Scientist",
  "experience": "Not specified",
  "skills": [
    "Python",
    "SQL",
    "Pandas",
    "Airflow",
    "PySpark",
    "Spark SQL",
    "Delta Lake",
    "Machine Learning",
    "Deep Learning",
    "TensorFlow",
    "Data Engineering",
    "ETL",
    "ELT",
    "Cloud Platforms (AWS, GCP, Azure)",
    "Data Warehousing",
    "Data Modeling",
    "DBT",
    "Data Visualization",
    "Power BI",
    "Tableau",
    "MLOps",
    "MLflow",
    "Kubeflow",
    "Natural Language Processing (NLP)",
    "NLTK",
    "spaCy",
    "Computer Vision",
    "OpenCV",
    "Time Series Analysis",
    "Forecasting",
    "Prophet",
    "Data Cleaning",
    "Data Wrangling",
    "Feature Engineering",
    "Scikit-learn",
    "Statistical Analysis",
    "Hypothesis Testing",
    "Data Ethics",
    "Privacy",
    "GDPR",
    "Big Data",
    "Hadoop",
    "Spark",
    "Data Governance",
    "Data Quality",
    "Metadata Management",
    "Data Security",
    "Encryption

In [31]:
# Converting the output into JSON using parser

parser = JsonOutputParser()
json_answer = parser.parse(answer.content)
# json_answer = parser.parse(cleaned_answer)
json_answer

{'role': 'Cloud Support Intern',
 'experience': 'Internship',
 'skills': ['Cloud computing',
  'Database, Big Data, Analytics',
  'Networking (DNS, TCP/IP, HTTP, VLAN, etc.)',
  'OS (Linux and/or Windows Servers)',
  'Virtualization (VMware, Xen, Hypervisor)',
  'Security concepts / best practices',
  'Storage and Content Delivery',
  'Deployment',
  'Developer & Mobile Services (Serverless, Web Mobile, IoT)',
  'Programming / scripting experience (Java, Perl, Ruby, C#, and/or PHP)',
  'Korean/English Business Communication Skills'],
 'description': 'Amazon Web Services (AWS) is seeking a Cloud Support Intern to join their team in Korea. The successful candidate will have a strong focus on customer support, technical troubleshooting, and cloud computing. They will work with leading companies and internal development teams to resolve complex technical issues and drive customer interactions. The intern will also have the opportunity to learn and develop with guidance from their manager, 

In [33]:
# Checking the type of output for confirmation
print(type(json_answer))

<class 'dict'>


#### Importing Portfolio

In [36]:
# Function reads and formats the data in portfolio so that the LLM can understand

def read_file(path):
    current = []
    with open(path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for line in csv_reader:
            # Formatting data
            skills = tuple(line[:-1])
            link = line[-1]
            current.append((skills, link))
        return current

In [38]:
# Sample Usage

path = 'demo_portfolio.csv'
data = read_file(path)


for skills, link in data:
    print(skills, link)

('Python', ' SQL', ' Pandas')  https://github.com/user/project1
('SQL', ' Python', ' Airflow')  https://github.com/user/project2
('PySpark', ' Spark SQL', ' Delta Lake')  https://github.com/user/project3
('Machine Learning', ' Deep Learning', ' TensorFlow')  https://github.com/user/project4
('Data Engineering', ' ETL', ' ELT')  https://github.com/user/project5
('Cloud Platforms (AWS', ' GCP', ' Azure)')  https://github.com/user/project6
('Data Warehousing', ' Data Modeling', ' DBT')  https://github.com/user/project7
('Data Visualization', ' Power BI', ' Tableau')  https://github.com/user/project8
('MLOps', ' MLflow', ' Kubeflow')  https://github.com/user/project9
('Natural Language Processing (NLP)', ' NLTK', ' spaCy')  https://github.com/user/project10
('Computer Vision', ' OpenCV', ' TensorFlow')  https://github.com/user/project11
('Time Series Analysis', ' Forecasting', ' Prophet')  https://github.com/user/project12
('Data Cleaning', ' Data Wrangling', ' Pandas')  https://github.com

#### Inserting data into Vector Database

In [77]:
# Chromadb Setup

client = chromadb.PersistentClient('vectorstore')
collection = client.get_or_create_collection(name= 'portfolio_links')

if not collection.count():
    for skills, links in data:
        collection.add(
            documents= str(skills),
            metadatas= {'portfolio_url': links},
            ids= [str(uuid.uuid4())]
        )

In [79]:
# Displays the first skills required in the sample job description

json_answer['skills'][0]

'Cloud computing'

In [81]:
# Queries the portfolio for a relevant project based on the first required skill as above

urls = collection.query(query_texts= json_answer['skills'][0], n_results= 2)
urls

{'ids': [['ef2c6826-4e72-4eab-ae48-bf665a2014b0',
   'e4f53b41-83e4-4e27-ae4a-8572cff2d108']],
 'embeddings': None,
 'documents': [["('Cloud Platforms (AWS', ' GCP', ' Azure)')",
   "('Cloud Data Engineering', ' AWS Glue', ' Databricks')"]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'portfolio_url': ' https://github.com/user/project6'},
   {'portfolio_url': ' https://github.com/user/project20'}]],
 'distances': [[0.8320648074150085, 0.970432698726654]]}

## Setting up prompt for writing the email

In [88]:
# Assigning variable for the dob description in JSON

job_desc = json_answer['description']

# Prompt for the email

email_prompt = PromptTemplate.from_template(
    """
    I will give you a role and a task that you have to perform in that specific role.
    Your role: Your name is Fahim. You are an incredible business development officer who who knows how to get clients. You work for ABC Consulting Firm, your firm works with all kinds of IT clients and provide solutions in the domain of Data Science and AI.
    CoverLetter AI focuses on efficient tailored solutions for all clients keeping costs down. 
    Your Job: Your Job is to write tailored emails to clients regarding the Job openings that they have advertised. Try to pitch your clients with an email hook that opens a conversation about a possibility of working with them and why they should work with you (advantages and how you are better). Add the most relevant portfolio URLs from
    the following (shared below) to showcase that we have the right expertise to get the job done.
    Begin with a compelling subject line that references the job title and signals relevance. In the body of the email, include direct reference the job title and requirements to demonstrate your understanding of their needs, and personalize the message by mentioning the company or specific details from the job posting. 
    Briefly explain how ABC Consulting Firm’s expertise aligns with the role, using the most relevant portfolio URLs provided as specific examples of past success. 
    Clearly state ABC Consulting Firm’s unique value proposition and what sets your firm apart from others in the industry, demonstrating measurable value your firm can bring to their organization, such as results, efficiencies, or cost savings. 
    Keep the email professional, convincing and direct maintaining a professional, confident, and approachable tone. Include a clear call to action inviting the HR manager to discuss further or schedule a meeting.
    Proofread for correct grammar and spelling, limit the number of links to the most relevant one or two portfolio URLs to avoid spam filters, and avoid excessive formatting or attachments to ensure deliverability. Optionally, you may politely mention that you will follow up if you do not receive a response within a week.
    
    I will provide you with email and phone number. Email: abc@def.com, Phone: +12345678. Only include them at the end after the initials.
    I will now provide you with the Job description and the portfolio URLs:
    JOB DESCRIPTION: {job_description}
    ------
    PORTFOLIO URLS: {portfolio_urls}
    """
)

# Extracting the answer
chain_email = email_prompt | llm
answer = chain_email.invoke({'job_description': job_desc, 'portfolio_urls': urls})

In [89]:
# Email that has been generated

email = answer.content
print(email)

Subject: Application for Cloud Support Intern Role at Amazon Web Services (AWS) in Korea

Dear Hiring Manager,

I came across the Cloud Support Intern job posting at Amazon Web Services (AWS) in Korea and was impressed by the opportunity to work with a leading company in cloud computing. As a Business Development Officer at ABC Consulting Firm, I believe our expertise in Data Science and AI can bring significant value to your team.

The job requirements for a strong focus on customer support, technical troubleshooting, and cloud computing align perfectly with our portfolio. For instance, our experience in Cloud Platforms (AWS, GCP, Azure) and Cloud Data Engineering (AWS Glue, Databricks) can be seen in our successful projects, such as the ones showcased at https://github.com/user/project6. This project demonstrates our ability to design and implement scalable cloud solutions, which can be beneficial in resolving complex technical issues and driving customer interactions.

At ABC Consul