In [2]:
import json
import os
import re
from ast import literal_eval
from pathlib import Path
from typing import List, Dict

import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain import PromptTemplate
from langchain.document_loaders import BSHTMLLoader
from langchain.document_loaders import UnstructuredURLLoader
from langchain.llms import Clarifai
from tqdm import tqdm

In [3]:
# JSON file with the known types per field; it is read by get_existing_types
# and rewritten by update_existing_types.
TYPES_FILE_PATH = "./fields.json"
# Locally saved copy of the Gujarat schemes index page, parsed for scheme links.
GUJARAT_SCHEMES_INDEX_PATH = "./gujarat_scheme_index.html"

In [4]:
def get_text_model():
    """Build the Clarifai-hosted GPT-4 chat-completion client.

    The personal access token is read from the ``CLARIFAI_PAT`` environment
    variable so that the secret is never stored in the notebook itself.

    Returns:
        A ``Clarifai`` LLM configured for user ``openai``, app
        ``chat-completion`` and model ``GPT-4``.

    Raises:
        RuntimeError: If ``CLARIFAI_PAT`` is unset or empty.
    """
    pat = os.environ.get("CLARIFAI_PAT")
    if not pat:
        raise RuntimeError(
            "Set the CLARIFAI_PAT environment variable to your Clarifai "
            "personal access token before creating the model."
        )
    return Clarifai(
        pat=pat, user_id="openai", app_id="chat-completion", model_id="GPT-4"
    )


# Shared module-level model instance; helper functions in later cells call
# this global `llm` directly.
llm = get_text_model()

# Utils

## Utils for scraping links

### Tamil Nadu Schemes

In [None]:
# Collect the first link of every `scheme_list` entry on the Tamil Nadu
# "beneficiary_wise" index pages.
tn_scheme_urls = []
beneficiary_main_urls = [
    f"https://www.tn.gov.in/scheme/beneficiary_wise/{x}" for x in [2, 6, 8, 14, 18]
]
for url in beneficiary_main_urls:
    # Certificate verification stays disabled as before; the timeout keeps a
    # stalled connection from hanging the cell indefinitely.
    page = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    divs = soup.find_all("div", class_="scheme_list")
    for div in divs:
        anchor = div.find("a", href=True)
        # Skip entries that carry no link instead of failing on None["href"].
        if anchor is not None:
            tn_scheme_urls.append(anchor["href"])

### Gujarat Schemes

In [5]:
# The Gujarat index page needs JavaScript to render its links, so the rendered
# page was saved locally and the links are read from that file.
path = Path(GUJARAT_SCHEMES_INDEX_PATH)
html_content = path.read_text()
soup = BeautifulSoup(html_content, "html.parser")
scheme_div = soup.find("div", id="result")
if scheme_div is None:
    # Fail with an explicit message rather than an AttributeError on None.
    raise ValueError(
        f'No <div id="result"> found in {GUJARAT_SCHEMES_INDEX_PATH}; '
        "was the fully rendered page saved?"
    )
# href=True skips anchors without a link target, which would raise KeyError.
gj_scheme_urls = [tag["href"] for tag in scheme_div.find_all("a", href=True)]

## Utils for Processing Scheme Web Pages

In [6]:
def read_web_page(scheme_link: str):
    """Load the page at ``scheme_link`` and return its text without blank lines.

    The URL is loaded through ``UnstructuredURLLoader`` with
    ``ssl_verify=False``; only the first loaded document is used.
    """
    first_document = UnstructuredURLLoader([scheme_link], ssl_verify=False).load()[0]
    # Keep only the lines that contain something other than whitespace.
    non_blank_lines = filter(
        lambda line: line.strip(), first_document.page_content.split("\n")
    )
    return "\n".join(non_blank_lines)
    

def get_existing_types():
    """Return the field -> types mapping stored as JSON at ``TYPES_FILE_PATH``."""
    with open(TYPES_FILE_PATH, "r") as types_file:
        return json.load(types_file)


def get_types_suggestions(web_page_text: str, field_to_types: Dict[str, List[str]]):
    """Ask the model whether the scheme text needs types that do not exist yet.

    Args:
        web_page_text: Text of the scheme page to analyze.
        field_to_types: Current mapping of field name to its known types; it is
            interpolated into the prompt as-is.

    Returns:
        The object parsed from the ``<json>...</json>`` section of the model
        response (the prompt asks for a mapping of field name to a list of
        suggested new types).

    Raises:
        ValueError: If the response contains no ``<json>...</json>`` section.
    """
    prompt_template = """You are a legal expert who is tasked with analyzing text from a
government scheme. Each scheme can be classified into some category for different fields
like gender, occupation, category and beneficiary. Analyze the provided text to 
determine if any new types are required for different fields. The current types are:
    
{current_types}

Text to Analyze: {text}

Instructions:
* For each category, check if the text can be categorized using existing types.
* If the text cannot be categorized, suggest a new type for the text else give an empty 
suggestion.

Output Format: 
* The output should be a json object containing suggestions for each field.
* Ensure all fields are present in the json. If the field does not require new types,
return an empty list.
* json object must wrapped around by "<json> </json>" tags
Example: <json>{{"beneficiary": ["value_1", "value_2", ...], "gender": ["value_1", "value_2", ...], "occupation": ["value_1", "value_2", ...], "category": ["value_1", "value_2", ...]}}</json>

Response:"""
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["text", "current_types"]
    )
    response = llm(prompt.format(text=web_page_text, current_types=field_to_types))
    # DOTALL lets the payload span several lines (pretty-printed JSON).
    match = re.search(r"<json>(.*?)</json>", response, flags=re.DOTALL)
    if match is None:
        raise ValueError(
            f"No <json>...</json> block found in model response: {response!r}"
        )
    dict_string = match.group(1).strip()
    try:
        types_suggestions = json.loads(dict_string)
    except json.JSONDecodeError:
        # Fall back to Python-literal syntax, e.g. single-quoted strings.
        types_suggestions = literal_eval(dict_string)
    return types_suggestions
    

def update_existing_types(field_to_new_types: Dict[str, List[str]]):
    """Merge new types into the JSON file at ``TYPES_FILE_PATH``.

    Args:
        field_to_new_types: Mapping of field name to the types to add.

    Returns:
        The merged field -> types mapping that was written back to the file.
    """
    with open(TYPES_FILE_PATH, "r+") as f:
        field_to_types = json.load(f)
        for field, types in field_to_new_types.items():
            # setdefault: a field that is missing from the file must not raise
            # KeyError; it simply starts with an empty list.
            existing = field_to_types.setdefault(field, [])
            for new_type in types:
                # Skip values already stored so repeated runs do not pile up
                # duplicates.
                if new_type not in existing:
                    existing.append(new_type)
        # Overwrite the file content in place, starting from the beginning.
        f.seek(0)
        f.truncate()
        f.write(json.dumps(field_to_types))
    return field_to_types


def get_scheme_details(web_page_text: str, field_to_types: Dict[str, List[str]]):
    """Ask the model to classify a scheme against the allowed types per field.

    Args:
        web_page_text: Text of the scheme page to analyze.
        field_to_types: Mapping of field name to its allowed types; it is
            interpolated into the prompt as-is.

    Returns:
        The object parsed from the ``<json>...</json>`` section of the model
        response (the prompt asks for a mapping of field name to a list of
        eligible values).

    Raises:
        ValueError: If the response contains no ``<json>...</json>`` section.
    """
    prompt_template = """You are a legal expert who is tasked with analyzing text from a
government scheme. Each scheme can be classified into some category for different fields
like gender, occupation, category and beneficiary. Analyze the provided text to extract 
eligibility criteria for each of the fields. The eligible options for each field are:

{current_types}

Text to Analyze: {text}

Output Format: Only return a json object wrapped around by "<json> </json>" tags
Example: <json>{{"beneficiary": ["value_1", "value_2", ...], "gender": ["value_1", "value_2", ...], "occupation": ["value_1", "value_2", ...], "category": ["value_1", "value_2", ...]}}</json>

Note: 
* Ensure that the extracted values for the eligibility criteria are from the provided options only.

Response:"""
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["text", "current_types"]
    )
    response = llm(prompt.format(text=web_page_text, current_types=field_to_types))
    # DOTALL lets the payload span several lines (pretty-printed JSON).
    match = re.search(r"<json>(.*?)</json>", response, flags=re.DOTALL)
    if match is None:
        raise ValueError(
            f"No <json>...</json> block found in model response: {response!r}"
        )
    dict_string = match.group(1).strip()
    try:
        scheme_details = json.loads(dict_string)
    except json.JSONDecodeError:
        # Fall back to Python-literal syntax, e.g. single-quoted strings.
        scheme_details = literal_eval(dict_string)
    return scheme_details
    

def get_scheme_details_and_update_types(scheme_link):
    """Read a scheme page, optionally extend the known types, then classify it.

    The model is first asked for new type suggestions. If it proposes at least
    one, the suggestions are shown and the user is asked to confirm; on "yes"
    they are merged into the stored types before the scheme is classified.

    Args:
        scheme_link: URL of the scheme page.

    Returns:
        The result of ``get_scheme_details`` for the page text and the
        (possibly updated) field -> types mapping.
    """
    web_page_text = read_web_page(scheme_link)
    # update types if required
    field_to_types = get_existing_types()
    types_suggestions = get_types_suggestions(web_page_text, field_to_types)
    # Inspect the suggestions (not the existing types): a prompt is only
    # needed when the model actually proposed something new.
    is_confirmation_required = any(
        len(types) > 0 for types in types_suggestions.values()
    )
    if is_confirmation_required:
        confirmation = input((
            f"Suggestions: {json.dumps(types_suggestions, indent=4)}\n\n"
            "Do you want to accept(yes/no):"
        ))
        # Tolerate surrounding whitespace and capitalisation in the answer.
        if confirmation.strip().lower() == "yes":
            field_to_types = update_existing_types(types_suggestions)
    return get_scheme_details(web_page_text, field_to_types)

## Utils for Generating Summaries

In [7]:
def get_questions_prompt():
    """Return the prompt used to ask a single question about a scheme's text.

    The template exposes two placeholders: ``text`` and ``question``.
    """
    template = """You are a legal expert and are tasked to analyze text from a government scheme.

text: {text}
    
Read the text very carefully and answer the following question: {question}

Output Format: The output should only contain the answer to the question. For question related to 
scheme name only return the scheme name. For descriptive answers, use simple and coherent language and 
answer the question using paragraph format.

Answer:"""
    placeholders = ["text", "question"]
    return PromptTemplate(input_variables=placeholders, template=template)


def get_format_summary_prompt():
    """Return the prompt that asks the model to format a scheme summary.

    The template exposes a single placeholder, ``text``, for the raw summary.
    """
    template = """You need to format the following summary of a government document.
summary: {text}
    
Output Format: The output should be in a text format. The section in the summary are: 
(Name of Scheme, Eligibility Criteria, Benefits of the Scheme, Steps to Avail, Source)
There should be markdown heading for each sections.If some information is present in multiple 
section, only keep that information in the most relevant sections.

Note: The summary should not have contain any double quotes (").

Formatted Summary:"""
    placeholders = ["text"]
    return PromptTemplate(input_variables=placeholders, template=template)


def generate_summary(text, link):
    """Summarize a scheme by asking the model four questions, then formatting.

    A fresh model is created with ``get_text_model``. Each answer is prefixed
    with its section label, ``link`` is appended as the source, and the
    combined text is sent through the format-summary prompt.

    Returns:
        The model's response to the format-summary prompt.
    """
    llm = get_text_model()
    questions = [
        ("Name", "What is the name of the government scheme?"),
        ("Eligibility Criteria", "Are the any eligibility criteria, validity, conditions to avail the scheme?"),
        ("Benefits", "What benefits does the government scheme provide?"),
        ("Steps to Avail", "What are the steps to avail or apply for the government scheme?")
    ]

    question_prompt = get_questions_prompt()
    format_summary_prompt = get_format_summary_prompt()

    # One "<label>: <answer>" section per question, in the order listed above.
    sections = [
        f"{label}: " + llm(question_prompt.format(text=text, question=query)) + "\n\n\n"
        for label, query in questions
    ]
    sections.append("Source: " + link)

    return llm(format_summary_prompt.format(text="".join(sections)))


def translate_to_hindi(text):
    """Ask the module-level ``llm`` to translate ``text`` into Hindi.

    Returns:
        The model's response to the translation prompt.
    """
    translation_template = """Translate the following text to hindi language, 
while ensuring that there is no loss of information or changes in formatting.

Note: The translated text should not have contain any double quotes (").

text: {text}
    
Translated Text:"""
    translation_prompt = PromptTemplate(
        input_variables=["text"], template=translation_template
    )
    filled_prompt = translation_prompt.format(text=text)
    return llm(filled_prompt)

# Creating DataFrame

In [None]:
# Build one record per Tamil Nadu scheme page and save the table as CSV.
data = []
for url in tqdm(tn_scheme_urls):
    # Initialised up front so the fallback record can still be built when the
    # page itself could not be read.
    text = None
    try:
        # The fetch is inside the try block so one unreachable page does not
        # abort the whole run before anything is written to disk.
        text = read_web_page(url)
        details = get_scheme_details_and_update_types(url)
        summary = generate_summary(text, url)
        hindi_summary = translate_to_hindi(summary)
        details["hindi_summary"] = hindi_summary
        details["summary"] = summary
        details["text"] = text
        details["url"] = url
    except Exception as e:
        # Best effort: report the failure and keep a minimal record.
        print(f"Failed to process {url}: {e}")
        details = {"text": text, "url": url}
    data.append(details)
df = pd.DataFrame(data)
df.to_csv("new_database.csv")

# Scratch

In [10]:
# Recorded model responses, one per line, each in the form
# `File: <name>.html, Response: <json>{...}</json>`.
responses = """File: women_tn_2.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: women_tn_1.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: sc_st_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["None"], "category": ["SC", "ST"]}</json>
File: student_gj_3.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: student_gj_1.html, Response: <json>{"gender": ["Male", "Female"], "occupation": ["Student"], "category": ["None"]}</json>
File: students_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: famers_tn_3.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers"], "category": ["None"]}</json>
File: sc_st_tn_1.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["SC", "ST"]}</json>
File: woment_tn_3.html, Response: <json>{"gender": ["Female"], "occupation": ["None"], "category": ["None"]}</json>
File: students_gj_2.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>
File: sc_st_tn_3.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["SC", "ST"]}</json>
File: famers_tn_2.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers"], "category": ["None"]}</json>
File: famers_tn_1.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers"], "category": ["None"]}</json>
File: students_tn_1.html, Response: <json>{"gender": ["None"], "occupation": ["Farmers", "Student"], "category": ["None"]}</json>
File: students_tn_3.html, Response: <json>{"gender": ["None"], "occupation": ["Student"], "category": ["None"]}</json>"""

In [46]:
def get_page_link(html_content):
    """Return the scheme page URL embedded in the HTML of a saved page.

    Pages whose HTML contains "gujarat" yield the sje.gujarat.gov.in scheme URL
    matched inside the href of the "Skip to Main Content" anchor. All other
    pages yield the href of the anchor with class ``increase`` minus its final
    character.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    if "gujarat" not in html_content:
        increase_anchor = soup.find("a", class_="increase")
        return increase_anchor["href"][:-1]
    skip_anchor = soup.find("a", {"title": "Skip to Main Content"})
    scheme_url_pattern = r"https://sje.gujarat.gov.in/.{4}/schemes/\d+"
    return re.search(scheme_url_pattern, skip_anchor["href"]).group()


def parse_response(response):
    """Parse one `File: <name>.html, Response: <json>{...}</json>` line.

    Returns:
        The dict literal found between the ``<json>`` tags, with the file name
        stored under the ``"file_name"`` key.
    """
    file_name = re.search(r"(?<=File: ).*html", response).group()
    payload = re.search("(?<=<json>).*(?=</json>)", response).group()
    return {**literal_eval(payload), "file_name": file_name}


# Combine each recorded response with the link and text of its source page.
# NOTE(review): `html_paths` and `docs` are not defined in any cell shown in
# this notebook; they must already exist in the kernel for this cell to run.
data = []
for response, path, doc in zip(responses.split("\n"), html_paths, docs):
    page_link = get_page_link(Path(path).read_text())
    data.append({**parse_response(response), "url": page_link, "text": doc})

pd.DataFrame(data).to_csv("database.csv")

In [15]:
def get_hindi_model(pat):
    """Return the Clarifai English-to-Hindi translation model for token ``pat``."""
    return Clarifai(
        pat=pat,
        user_id="facebook",
        app_id="translation",
        model_id="translation-english-to-hindi-text",
    )

def translate_to_hindi(text):
    """Translate ``text`` with the Clarifai English-to-Hindi model.

    The personal access token is read from the ``CLARIFAI_PAT`` environment
    variable so that the secret is never stored in the notebook itself.

    Returns:
        The model's output for ``text``.

    Raises:
        RuntimeError: If ``CLARIFAI_PAT`` is unset or empty.
    """
    pat = os.environ.get("CLARIFAI_PAT")
    if not pat:
        raise RuntimeError(
            "Set the CLARIFAI_PAT environment variable to your Clarifai "
            "personal access token before translating."
        )
    model = get_hindi_model(pat)
    return model(text)

# Translate every row's "Summary" value and store it in "hindi_summary".
hindi_column = df.apply(lambda row: translate_to_hindi(row["Summary"]), axis=1)
df.loc[:, "hindi_summary"] = hindi_column


In [None]:
# Prompt text for a yes/no eligibility check. Placeholders: scheme,
# description, gender, occupation, category.
# NOTE(review): no cell shown in this notebook references this template.
eligibility_prompt_template = """You are a legal expert and are tasked to identify if a user
is eligible for a given government scheme.

Government Scheme: {scheme}

User Description: {description}
User Gender: {gender}
User Occupation: {occupation}
User Category: {category}

Based on the provided user information and the description of the government scheme, 
please assess whether the user is eligible for the scheme and respond with a 'Yes' or 'No' accordingly.

Note: the response should only be a single word yes / no, nothing else.

Response:"""

In [None]:
# Rebind the module-level `llm` to a freshly created text model.
llm = get_text_model()
def translate_to_hindi(text):
    """Ask the module-level ``llm`` to translate ``text`` into Hindi.

    Returns:
        The model's response to the translation prompt.
    """
    translation_template = """Translate the following text to hindi language, while ensuring that there is no
loss of information or changes in formatting.

text: {text}
    
translated text:"""
    translation_prompt = PromptTemplate(
        input_variables=["text"], template=translation_template
    )
    filled_prompt = translation_prompt.format(text=text)
    return llm(filled_prompt)

# Translate every row's "Summary" value and store it in "hindi_summary".
hindi_column = df.apply(lambda row: translate_to_hindi(row["Summary"]), axis=1)
df.loc[:, "hindi_summary"] = hindi_column

In [None]:
# Translate each summary on its own so a failing row does not stop the run.
hindi_summaries = []

for idx, row in df.iterrows():
    try:
        summary = translate_to_hindi(row["Summary"])
        hindi_summaries.append(summary)
    except Exception as e:
        # Keep the "Error" placeholder so the list stays aligned with the
        # rows of df, but report the failure instead of hiding it.
        print(f"Translation failed for row {idx}: {e}")
        hindi_summaries.append("Error")

print(hindi_summaries)