In [0]:
%pip install mlflow==2.14.3 langchain==0.1.5 langdetect databricks-vectorsearch==0.22 databricks-sdk==0.18.0 mlflow[databricks]
dbutils.library.restartPython()

Collecting mlflow==2.14.3
  Downloading mlflow-2.14.3-py3-none-any.whl (25.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25.8/25.8 MB 42.1 MB/s eta 0:00:00
Collecting langchain==0.1.5
  Downloading langchain-0.1.5-py3-none-any.whl (806 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 806.7/806.7 kB 54.3 MB/s eta 0:00:00
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 981.5/981.5 kB 41.9 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting databricks-vectorsearch==0.22
  Downloading databricks_vectorsearch-0.22-py3-none-any.whl (8.5 kB)
Collecting databricks-sdk==0.18.0
  Downloading databricks_sdk-0.18.0-py3-none-any.whl (439 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 439.5/439.5 kB 36.2 MB/s eta 0:00:00
Collecting mlflow[databricks]
  Downloading mlflow-2.17.2-py3-none-any.whl (26.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.

In [0]:
import os
from langdetect import detect
from databricks.vector_search.client import VectorSearchClient
from langchain_community.vectorstores import DatabricksVectorSearch
from langchain_community.embeddings import DatabricksEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatDatabricks

##### Retrieve Vector Search

In [0]:
# Embedding endpoint handed to the vector store wrapper built in get_retriever().
embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")


def get_retriever(
    persist_dir: str = None,
    endpoint_name: str = "doc_vector_endpoint",
    index_name: str = "datascience_dev.default.docs_idx",
    text_column: str = "text",
):
    """Build a LangChain retriever on top of a Databricks Vector Search index.

    Connection settings are read from the environment variables
    ``WORKSPACE_URL``, ``SP_CLIENT_ID`` and ``SP_CLIENT_SECRET``. A missing
    variable is passed to the client as ``None``.

    Args:
        persist_dir: Accepted for backward compatibility; this function never
            reads it.
        endpoint_name: Vector Search endpoint that hosts the index.
        index_name: Fully qualified name of the index to query.
        text_column: Column of the index handed to the vector store as the
            text column.

    Returns:
        The object returned by ``DatabricksVectorSearch.as_retriever()``.
    """
    # NOTE(review): how the client authenticates when these variables are
    # unset is decided by VectorSearchClient, not here -- confirm before
    # relying on it outside a Databricks notebook.
    vsc = VectorSearchClient(
        workspace_url=os.environ.get("WORKSPACE_URL"),
        service_principal_client_id=os.environ.get("SP_CLIENT_ID"),
        service_principal_client_secret=os.environ.get("SP_CLIENT_SECRET"),
        disable_notice=True,
    )

    vs_index = vsc.get_index(endpoint_name=endpoint_name, index_name=index_name)

    # Wrap the index in a LangChain vector store and expose it as a retriever.
    vectorstore = DatabricksVectorSearch(
        vs_index, text_column=text_column, embedding=embedding_model
    )
    return vectorstore.as_retriever()

##### Function to detect language

In [0]:
def get_language_name(question):
    """Return the English name of the language ``question`` is written in.

    Args:
        question: Text whose language should be detected.

    Returns:
        One of the names in the language map below ('Spanish', 'English',
        'Vietnamese', 'French', 'German'), or 'Unknown' when the text is
        empty, when detection fails, or when the detected code is not mapped.
    """
    language_map = {
        'es': 'Spanish',
        'en': 'English',
        'vi': 'Vietnamese',
        'fr': 'French',
        'de': 'German',
        # Add more languages as needed
    }

    # Nothing to detect in empty / whitespace-only input; skip the detector.
    if not question or not question.strip():
        return 'Unknown'

    # Imported here so the block stays self-contained; langdetect is already
    # a dependency of this notebook.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        lang_code = detect(question)
    except LangDetectException:
        # Raised by langdetect when the text has no usable features
        # (for example digits or punctuation only).
        return 'Unknown'

    # NOTE(review): langdetect is documented as non-deterministic on short
    # text unless DetectorFactory.seed is set -- consider seeding it once in
    # the notebook's configuration cell.
    return language_map.get(lang_code, 'Unknown')

##### Multi-Lingual ChatBot

In [0]:
def get_multilingual_responses(question):
    """Answer ``question`` from the vector index, replying in the question's language.

    The language is detected with ``get_language_name``; when it cannot be
    determined ('Unknown'), the answer language falls back to English so the
    prompt never instructs the model to "Respond in Unknown".

    Args:
        question: The user's question, in any language.

    Returns:
        The ``"result"`` entry of the RetrievalQA chain output.
    """
    # NOTE(review): rule 5 below ("MUST NOT include ANY English terms") sits
    # oddly with an English target language -- consider making it conditional.
    prompt_template = """You are an assistant for international users. You are answering advicing career guidance based on the  questions asked about the available data.If the question is not related to one of these topics, kindly decline to answer.If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question appears to be for data that you don't have data on, say so.  Keep the answer as concise as possible. 

    Respond in {detected_language}.
    Use the following pieces of context to answer the question at the end: {context}

    Question: {question}

    STRICT RESPONSE RULES:
    1. MUST use ONLY {detected_language} words
    2. MUST maintain proper {detected_language} grammar and punctuation
    3. Write naturally as a native {detected_language} speaker would write
    4. Use proper accent marks and special characters for {detected_language}
    5. MUST NOT include ANY English terms (except for technical terms with no {detected_language} equivalent)

    Your {detected_language} response:
    """

    # Detect the language of the question; fall back to English when the
    # detector cannot name it, because 'Unknown' is not a usable target.
    detected_language = get_language_name(question)
    if detected_language == 'Unknown':
        detected_language = 'English'

    # The language is bound up front; context and question are filled in by the chain.
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
        partial_variables={"detected_language": detected_language}
    )

    # Chat model that writes the final answer.
    chat_model = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens=200)

    # "stuff" chain: retrieved documents are placed into {context} of the prompt.
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=get_retriever(),
        chain_type_kwargs={"prompt": prompt, "verbose": False}
    )

    # Chain.run is deprecated in langchain 0.1.x; invoke takes the question
    # under RetrievalQA's "query" key and returns the answer under "result".
    return qa_chain.invoke({"query": question})["result"]


### ----- Chat with the bot in multiple languages -----

In [0]:
# Ask in English; the answer is printed after a blank line.
english_question = "What is the pay difference between a data scientist and a database administrator role?"
print(f"\n {get_multilingual_responses(english_question)}")




 Based on the data provided, the annual mean wage for Data Scientists is $119,040, while for Database Administrators it is $104,810. Therefore, the pay difference between a Data Scientist and a Database Administrator role is approximately $14,230.


In [0]:
# Same salary question asked in Spanish; the answer is printed after a blank line.
spanish_question = "¿Cuál es la diferencia salarial entre un científico de datos y un administrador de base de datos?"
print(f"\n {get_multilingual_responses(spanish_question)}")




 La diferencia salarial entre un científico de datos y un administrador de base de datos puede variar dependiendo de varios factores, como la ubicación geográfica, el tamaño y la industria de la empresa, y la experiencia y educación del profesional. Sin embargo, según los datos disponibles hasta diciembre de 2023, un científico de datos tiene un salario medio anual de $62,370, mientras que un administrador de base de datos tiene un salario medio anual de $53,790. Es importante tener en cuenta que estos números son solo una estimación y que los salarios reales pueden variar.


In [0]:
# Same salary question asked in Vietnamese; the answer is printed after a blank line.
vietnamese_question = "Sự khác biệt về lương giữa vai trò nhà khoa học dữ liệu và vai trò quản trị viên cơ sở dữ liệu là gì?"
print(f"\n {get_multilingual_responses(vietnamese_question)}")




 Theo dữ liệu của Cục Thống Kê Lao Động Hoa Kỳ, mức lương trung bình hàng năm của nhà khoa học dữ liệu là 98.000 đô la, trong khi đó, mức lương trung bình hàng năm của quản trị viên cơ sở dữ liệu là 77.000 đô la. Do đó, nhà khoa học dữ liệu thường có mức lương cao hơn so với quản trị viên cơ sở dữ liệu.


In [0]:
# A visa question in English; the answer is printed after a blank line.
question = "What visa do I need to enter US for a Data Scientist job?"
print(f"\n {get_multilingual_responses(question)}")




 To work as a Data Scientist in the US, you would typically need a non-immigrant work visa such as the H-1B visa. This visa is for people in a specified professional or academic field or with special expertise who have a college degree or higher or the equivalent in work experience. The job should qualify as a specialty occupation, generally requiring a bachelor’s degree or higher. Your employer must show that there is a lack of qualified U.S. applicants for the role. Please note that the process involves a multi-step process, including finding H-1B sponsorship, filing a labor condition application, and submitting a petition to USCIS.


In [0]:
# A multi-part question (location, job prospects, visa options) sent as one prompt.
question = "My husband wants to pursue a job in Data Science. Which state in US is the best place for him? If he gets the job, what are my future job prospectives in the US in the same field. What are the visa options i need to pursue? "
print(f"\n {get_multilingual_responses(question)}")




 Based on the data provided, the state with the highest employment for data scientists is California, with 119,040 jobs and an annual mean wage of $119,040. This could be a good place for your husband to pursue a job in data science. As for your future job prospects in the same field, the data shows that there are 192,710 data scientists employed in the US with an annual mean wage of $119,040. This indicates that there are opportunities available for you as well.

As for visa options, if your husband is offered a job in the US, his employer can sponsor him for a temporary non-immigrant visa such as the H-1B visa. This visa is for workers in specialty occupations, which includes data science. As his spouse, you would be eligible for an H-4 visa, which allows you to live and work in the US. However, you would need to apply for an Employment Authorization Document (EAD) to be able to work
