In [1]:
# Working with new dataset 

In [36]:
import pandas as pd
df = pd.read_csv(r'../../higher_ed_employee_salaries.csv')
data = df.sample(700).to_dict('records') # Get only 700 records. More records will make it slower to index
len(data)

700

In [37]:
df.head()

Unnamed: 0,Name,School,Job Description,Department,Earnings,Year
0,Don Potter,University of Akron,Assistant Lecturer,Social Work,2472.0,2019
1,Emily Potter,The Ohio State University,Administrative Assistant 3,Arts and Sciences | Chemistry and Biochemistry...,48538.02,2022
2,Carol Jean Potter,The Ohio State University,Associate Professor-Clinical,Pediatrics,22722.8,2013
3,Kim Potter,The Ohio State University,"Manager 4, Compliance",Legal Affairs | Compliance,170143.44,2022
4,Graham Potter,Miami University,Building and Grounds Assistant,"Assoc VP Housing,Dining,Rec,Bus Svc",3075.2,2012


Index(['Name', 'School', 'Job Description', 'Department', 'Earnings', 'Year'], dtype='object')

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934348 entries, 0 to 934347
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Name             934348 non-null  object 
 1   School           934348 non-null  object 
 2   Job Description  907680 non-null  object 
 3   Department       873896 non-null  object 
 4   Earnings         924673 non-null  float64
 5   Year             934348 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 42.8+ MB


In [107]:
df['Earnings'] = df['Earnings'].astype('str')
df['Year'] = df['Year'].astype('str')

In [108]:
df = df.dropna()

In [109]:
df['Name_School_Job_Desc_Department'] = df['Name'] + ' ' +  df['Department'] + ' ' +  df['School'] + ' ' + df['Job Description'] + ' salary: ' + df['Earnings']

In [110]:
df['Name_School_Job_Desc_Department'].head()

0    Don Potter Social Work University of Akron Ass...
1    Emily Potter Arts and Sciences | Chemistry and...
2    Carol Jean Potter Pediatrics The Ohio State Un...
3    Kim Potter Legal Affairs | Compliance The Ohio...
4    Graham Potter Assoc VP Housing,Dining,Rec,Bus ...
Name: Name_School_Job_Desc_Department, dtype: object

In [111]:
data = df.sample(700).to_dict('records')

In [112]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [113]:
encoder = SentenceTransformer('all-MiniLM-L6-v2') # Model to create embeddings

In [114]:
# create the vector database client
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

In [115]:
# Create collection to store wines
qdrant.recreate_collection(
    collection_name="top_salaries",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

True

In [116]:
# vectorize!
qdrant.upload_records(
    collection_name="top_salaries",
    records=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc['Name_School_Job_Desc_Department']).tolist(),
            payload=doc,
        ) for idx, doc in enumerate(data) # data is the variable holding all the wines
    ]
)

In [117]:
user_prompt = "Suggest me where to work to be well paid"

In [118]:
# Search time for awesome wines!

hits = qdrant.search(
    collection_name="top_salaries",
    query_vector=encoder.encode(user_prompt).tolist(),
    limit=3
)
for hit in hits:
  print(hit.payload, "score:", hit.score)

{'Name': 'Ebed M. Sulbaran Pena', 'School': 'Kent State University', 'Job Description': 'Project Director', 'Department': 'Teaching, Learning, & Curr Studies', 'Earnings': '2019', 'Year': '2019', 'Name_School_Job_Desc_Department': 'Ebed M. Sulbaran Pena Teaching, Learning, & Curr Studies Kent State University Project Director salary: 2019'} score: 0.3671825907738306
{'Name': 'Shailaja Paik', 'School': 'University Of Cincinnati', 'Job Description': 'Asst Professor - Prov', 'Department': 'A&S History', 'Earnings': '2013', 'Year': '2013', 'Name_School_Job_Desc_Department': 'Shailaja Paik A&S History University Of Cincinnati Asst Professor - Prov salary: 2013'} score: 0.3572317366399766
{'Name': 'Wendy D Haffey', 'School': 'University Of Cincinnati', 'Job Description': 'Research Scientist', 'Department': 'COM Can Bio Proteomics Lab', 'Earnings': '2014', 'Year': '2014', 'Name_School_Job_Desc_Department': 'Wendy D Haffey COM Can Bio Proteomics Lab University Of Cincinnati Research Scientist 

In [119]:
# define a variable to hold the search results
search_results = [hit.payload for hit in hits]

In [120]:
# Now time to connect to the local large language model
from openai import OpenAI
client = OpenAI(
    base_url="https://api.openai.com/v1", # "http://<Your api-server IP>:port"
    api_key = ""
)
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "Your name is Milena, working as profesor"},
        {"role": "user", "content": "Suggest me an what kind of profession will best suited for me to be well paid according new data top_salaries in education, you can provide top 5 position in according collection top_salaries"},
        {"role": "assistant", "content": str(search_results)}
    ]
)
print(completion.choices[0].message)

ChatCompletionMessage(content='Based on the data from the "top_salaries" collection in the education sector, here are the top 5 positions that have demonstrated high earnings:\n\n1. Ebed M. Sulbaran Pena - Project Director at Kent State University in the Department of Teaching, Learning, & Curriculum Studies (2019 earnings).\n   \n2. Shailaja Paik - Assistant Professor - Provost at the University Of Cincinnati in the Department of Arts & Sciences (A&S) History (2013 earnings).\n   \n3. Wendy D. Haffey - Research Scientist at the University Of Cincinnati in the Department of COM Can Bio Proteomics Lab (2014 earnings).\n\nBased on this data, pursuing a career as a Project Director, Assistant Professor - Provost, or Research Scientist in the education sector could lead to high earning potential. Consider exploring opportunities in these fields to maximize your income potential.', role='assistant', function_call=None, tool_calls=None)
