In [45]:
import json
import os
import re
from ast import literal_eval
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from langchain import PromptTemplate
from langchain.document_loaders import BSHTMLLoader
from langchain.llms import Clarifai


In [6]:

def get_text_model():
    """Create the Clarifai LLM client for the ``openai/chat-completion`` GPT-4 model.

    Returns:
        A ``Clarifai`` LLM instance configured with the user, app and model
        identifiers below.
    """
    # The personal access token is intentionally left blank here.
    # NOTE(review): presumably the client then resolves the token from the
    # environment -- confirm against the installed langchain version.
    return Clarifai(
        pat="",
        user_id="openai",
        app_id="chat-completion",
        model_id="GPT-4",
    )

def read_html_files(html_paths):
    """Load each HTML file and return its text with blank lines removed.

    Args:
        html_paths: Iterable of paths to HTML files.

    Returns:
        A list with one string per input path, in the same order. Each string
        is the ``page_content`` of the first document produced by
        ``BSHTMLLoader`` for that file, with whitespace-only lines dropped.
    """

    def _compact_text(html_path):
        # Only the first document returned by the loader is used.
        first_doc = BSHTMLLoader(html_path).load()[0]
        kept = (row for row in first_doc.page_content.split("\n") if row.strip())
        return "\n".join(kept)

    return [_compact_text(html_path) for html_path in html_paths]

def get_prompt(text):
    """Build the eligibility-extraction prompt for one scheme description.

    Args:
        text: The scheme text to substitute for the ``{text}`` placeholder.

    Returns:
        The fully formatted prompt string.
    """
    # The template is spelled out line by line so that the trailing spaces on
    # some lines stay visible; doubled braces are literal braces for the
    # template formatter.
    template_lines = [
        'Analyze the provided text to extract eligibility criteria for ',
        'the following fields: Gender, Occupations, and Category, based on information from a ',
        'government scheme. The eligible options for each field are as follows:',
        '1. Gender: ["Male", "Female", "Others", "None"]',
        '2. Occupations: ["Student", "Farmers", "Retired", "None"]',
        '3. Category: ["SC", "ST", "OBC", "General", "None"]',
        '',
        'Text to Analyze: {text}',
        '',
        '',
        'Output Format: Only return a json object wrapped around by "<json> </json>" tags',
        '',
        'Example: <json>{{"gender": ["value_1", "value_2", ...], "occupation": ["value_1", "value_2", ...], "category": ["value_1", "value_2", ...]}}</json>',
        '',
        'Note: ',
        '* Ensure that the extracted values for the eligibility criteria are from the provided options only.',
        '',
        'Response:',
    ]
    extraction_prompt = PromptTemplate(
        template="\n".join(template_lines),
        input_variables=["text"],
    )
    return extraction_prompt.format(text=text)


In [9]:
llm = get_text_model()
# The data directory can be overridden with the SCHEME_DATA_DIR environment
# variable; the original machine-specific location remains the default.
data_path = os.environ.get("SCHEME_DATA_DIR", "/home/vk/data/temp/data")
# Test the extension rather than a substring so that names such as
# "page.html.bak" are not picked up as HTML inputs.
html_paths = [
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if name.endswith(".html")
]
docs = read_html_files(html_paths)
# One model call per file; each printed line has the form
# "File: <name>, Response: <model output>".
for doc, path in zip(docs, html_paths):
    prompt = get_prompt(doc)
    response = llm(prompt)
    print(f"File: {os.path.basename(path)}, Response: {response}")



File: women_tn_2.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: women_tn_1.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: sc_st_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["None"], "category": ["SC", "ST"]}</json>
File: student_gj_3.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: student_gj_1.html, Response: <json>{"gender": ["Male", "Female"], "occupation": ["Student"], "category": ["None"]}</json>
File: students_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: famers_tn_3.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers"], "category": ["None"]}</json>
File: sc_st_tn_1.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["SC", "ST"]}</json>
File: woment_tn_3.html, Response: <json>{"gender": 

In [10]:
# Model responses stored as a literal string, one entry per line in the form
# "File: <name>, Response: <json>{...}</json>" (the same format the extraction
# loop prints). Each line is parsed later with `parse_response`.
# NOTE(review): this looks like a hand-pasted copy of the extraction cell's
# output -- it must be refreshed whenever the input files or model change.
responses = """File: women_tn_2.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: women_tn_1.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: sc_st_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["None"], "category": ["SC", "ST"]}</json>
File: student_gj_3.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: student_gj_1.html, Response: <json>{"gender": ["Male", "Female"], "occupation": ["Student"], "category": ["None"]}</json>
File: students_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: famers_tn_3.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers"], "category": ["None"]}</json>
File: sc_st_tn_1.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["SC", "ST"]}</json>
File: woment_tn_3.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: students_gj_2.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: sc_st_tn_3.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["SC", "ST"]}</json>
File: famers_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers"], "category": ["None"]}</json>
File: famers_tn_1.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers"], "category": ["None"]}</json>
File: students_tn_1.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers", "Student"], "category": ["None"]}</json>
File: students_tn_3.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>"""

In [46]:
def get_page_link(html_content):
    """Extract the scheme page URL embedded in a saved HTML page.

    Pages whose markup contains the substring ``"gujarat"`` are handled by
    reading the ``href`` of the "Skip to Main Content" anchor and pulling the
    ``https://sje.gujarat.gov.in/<4 chars>/schemes/<digits>`` URL out of it.
    All other pages use the ``href`` of the first ``<a class="increase">``
    anchor with its final character removed.

    Args:
        html_content: Raw HTML of the page as a string.

    Returns:
        The extracted URL as a string.

    Raises:
        ValueError: If the expected anchor is missing, has no ``href``, or
            (Gujarat pages) the ``href`` does not contain a scheme URL.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    if "gujarat" in html_content:
        tag = soup.find("a", {"title": "Skip to Main Content"})
        if tag is None or tag.get("href") is None:
            raise ValueError(
                "No 'Skip to Main Content' anchor with an href found in the page"
            )
        # Dots in the host name are escaped so they only match literal dots.
        pattern = r"https://sje\.gujarat\.gov\.in/.{4}/schemes/\d+"
        match = re.search(pattern, tag["href"])
        if match is None:
            raise ValueError(
                f"Anchor href does not contain a scheme URL: {tag['href']!r}"
            )
        return match.group()

    tag = soup.find("a", class_="increase")
    if tag is None or tag.get("href") is None:
        raise ValueError("No <a class='increase'> anchor with an href found in the page")
    # NOTE(review): the last character of the href is dropped unconditionally;
    # presumably a trailing separator -- confirm against the source pages.
    return tag["href"][:-1]


def parse_response(response):
    """Parse one ``File: <name>, Response: <json>{...}</json>`` entry.

    Args:
        response: A single response entry as text. It must contain a
            ``File: `` marker followed by a name ending in ``html`` and a
            payload wrapped in ``<json>`` / ``</json>`` tags. The payload may
            span several lines.

    Returns:
        The decoded payload dict with an extra ``"file_name"`` key holding the
        extracted file name.

    Raises:
        ValueError: If the file marker or the ``<json>`` block is missing, or
            the payload cannot be decoded.
    """
    # Lazy match bounded by the ", Response:" delimiter (or the end of the
    # line) so an "html" occurring later in the payload is not swallowed into
    # the file name.
    file_match = re.search(
        r"(?<=File: ).*?html(?=, Response:|$)", response, re.MULTILINE
    )
    if file_match is None:
        raise ValueError(f"No 'File: <name>.html' marker found in: {response!r}")

    # Lazy + DOTALL: stop at the first closing tag and allow multi-line JSON.
    payload_match = re.search(r"(?<=<json>).*?(?=</json>)", response, re.DOTALL)
    if payload_match is None:
        raise ValueError(f"No <json>...</json> block found in: {response!r}")
    payload = payload_match.group().strip()

    try:
        # The prompt asks for JSON, so decode it as JSON first; this also
        # accepts the JSON literals null/true/false.
        record = json.loads(payload)
    except ValueError:
        # Fall back for Python-literal payloads (e.g. single-quoted strings),
        # which the previous implementation accepted.
        record = literal_eval(payload)

    record["file_name"] = file_match.group()
    return record


# Index the loaded documents by file name so that every response line is paired
# with its own HTML file. Pairing by list position would silently mislabel
# records whenever the directory listing order differs from the order of the
# stored responses.
docs_by_name = {
    os.path.basename(path): (path, doc) for path, doc in zip(html_paths, docs)
}

data = []
for response in responses.splitlines():
    if not response.strip():
        # Tolerate blank lines in the stored responses.
        continue
    record = parse_response(response)
    # A KeyError here means the responses mention a file that was not loaded.
    path, doc = docs_by_name[record["file_name"]]
    file_content = Path(path).read_text()
    record["url"] = get_page_link(file_content)
    record["text"] = doc
    data.append(record)

pd.DataFrame(data).to_csv("database.csv")

In [43]:
def get_text_model(pat):
    """Create the Clarifai client for the English-to-Hindi translation model.

    Note: this redefines ``get_text_model`` from earlier in the notebook (the
    zero-argument GPT-4 variant); once this cell has run, that earlier
    definition is shadowed.

    Args:
        pat: Clarifai personal access token passed to the client.

    Returns:
        A ``Clarifai`` LLM instance for the
        ``facebook/translation/translation-english-to-hindi-text`` model.
    """
    return Clarifai(
        pat=pat,
        user_id="facebook",
        app_id="translation",
        model_id="translation-english-to-hindi-text",
    )

# The Clarifai personal access token is a secret: read it from the environment
# rather than committing it to the notebook. The token that used to be
# hard-coded on this line should be treated as leaked and rotated.
llm = get_text_model(os.environ["CLARIFAI_PAT"])
# Translate the working notes below; the cell's value is the model output.
llm("""* store as csv
* read with pd.eval on req colums
* we have our dataset ready

* ask for fields from user gender, occupation, category
* check if a field is not none
* filter using following condition, either data should have none or should have the required entry in list""")


'* CSV के रूप में संग्रहीत * req स्तंभों पर pd.eval के साथ पढ़ें * हमारे डेटासेट तैयार हैं * उपयोगकर्ता लिंग, व्यवसाय, श्रेणी से फ़ील्ड के लिए पूछें * जांचें यदि फ़ील्ड कोई नहीं है * फ़िल्टर निम्नलिखित स्थिति का उपयोग करके, या तो डेटा में कोई नहीं होना चाहिए या सूची में आवश्यक इनपुट होना चाहिए'