In [None]:
import os
import time
import uuid
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import pdfplumber
import urllib3
from docx import Document
from dotenv import load_dotenv
from elasticsearch import Elasticsearch, helpers
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from win32com import client


# Load variables from a .env file so the os.getenv() lookups below can see them.
load_dotenv()
# Silence urllib3 warnings; the client below is created with verify_certs=False.
urllib3.disable_warnings()


# Connection settings for the Elasticsearch instance used by this script.
ES_CONFIG = {
    "host": "https://localhost:9200",
    "username": "elastic",
    "password": os.getenv("ES_PASSWORD"),  # None when ES_PASSWORD is not set
    "index": "task5"  # index that receives every document created in this file
}

# NOTE(review): verify_certs=False disables TLS certificate verification.
# Presumably acceptable for a local cluster only -- confirm before pointing
# this at anything else.
es = Elasticsearch(
    ES_CONFIG["host"],
    basic_auth=(ES_CONFIG["username"], ES_CONFIG["password"]),
    verify_certs=False
)


# CSV file whose rows are pushed to Elasticsearch by push_data_to_elasticsearch().
csv_file_path = r"C:\Users\Divya_prasath\Desktop\task\extracted_data\resume.csv"

def push_data_to_elasticsearch(csv_file, es_client, index_name):
    """Insert every row of a CSV file into an Elasticsearch index.

    Each row becomes one document. Missing cells are sent as ``None`` (JSON
    null). A failure on one row is reported and does not stop the others.

    Args:
        csv_file: Path of the CSV file to read.
        es_client: Client object exposing ``create(index=, id=, document=)``.
        index_name: Name of the target index.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    if not os.path.exists(csv_file):
        print("CSV file not found!")
        return

    df = pd.read_csv(csv_file)
    # Cast to object first so None survives in numeric columns; a float column
    # cannot hold None and would otherwise keep NaN, which is not valid JSON.
    df = df.astype(object).where(df.notna(), None)
    records = df.to_dict(orient="records")

    for record in records:
        try:
            # A wall-clock timestamp is not a safe ID: two rows inserted within
            # the same clock tick get the same value, and ``create`` refuses an
            # ID that already exists, so the second row would be lost.
            es_client.create(index=index_name, id=str(uuid.uuid4()), document=record)
            print("Inserted record into Elasticsearch")
        except Exception as e:
            # Best effort: keep going with the remaining rows.
            print(f"Error inserting record: {e}")

# Runs as soon as this cell/module executes: push the CSV rows into the configured index.
push_data_to_elasticsearch(csv_file_path, es, ES_CONFIG["index"])



def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, one page per line group.

    Pages for which pdfplumber yields no text are left out. The remaining
    page texts are joined with newlines.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The concatenated page text; an empty string if no page has text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once; extraction is the expensive step and
        # was previously run twice per page (once to filter, once to collect).
        page_texts = (page.extract_text() for page in pdf.pages)
        # Join while the file is still open, because the generator is lazy.
        return "\n".join(text for text in page_texts if text)

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraphs of a .docx file joined by newlines.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        Paragraph texts (unmodified) separated by newlines; paragraphs that
        contain only whitespace are dropped.
    """
    kept = []
    for paragraph in Document(docx_path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)

def extract_text_from_doc(doc_path):
    """Return the text of a legacy .doc file by automating Microsoft Word.

    Args:
        doc_path: Path of the .doc file. It is converted to an absolute path
            because Word resolves relative paths against its own working
            directory, not this process's.

    Returns:
        The stripped document text, or an empty string if anything fails
        (the error is printed).
    """
    word = None
    doc = None
    try:
        word = client.Dispatch("Word.Application")
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(doc_path))
        return doc.Content.Text.strip()
    except Exception as e:
        print(f"Error processing DOC file {doc_path}: {e}")
        return ""
    finally:
        # Always release the document and the Word instance. Previously a
        # failure in Open() or the text read skipped Close()/Quit() and left a
        # hidden Word process running for every bad file.
        if doc is not None:
            try:
                doc.Close(False)
            except Exception as e:
                print(f"Error closing DOC file {doc_path}: {e}")
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                print(f"Error quitting Word after {doc_path}: {e}")

def extract_text_from_xml(xml_path):
    """Return all text content of an XML file as one space-separated string.

    Text is collected in document order, including text that follows a child
    element inside its parent. Each fragment is stripped and whitespace-only
    fragments are discarded.

    Args:
        xml_path: Path of the XML file.

    Returns:
        The joined text, or an empty string if the file cannot be read or
        parsed (the error is printed).
    """
    try:
        root = ET.parse(xml_path).getroot()
        # itertext() also yields tail text (e.g. " again" in
        # "<p>Hello <b>world</b> again</p>"), which iterating over elem.text
        # alone silently dropped.
        fragments = (piece.strip() for piece in root.itertext())
        # Skip fragments that were pure indentation/newlines so they do not
        # turn into runs of extra spaces in the result.
        return " ".join(piece for piece in fragments if piece)
    except Exception as e:
        print(f"Error parsing XML {xml_path}: {e}")
        return ""

def extract_text_from_folder(folder_path):
    """Extract text from every supported document directly inside a folder.

    Supported extensions (case-insensitive): .pdf, .docx, .doc, .xml.
    Sub-directories and files with any other extension are skipped and
    reported as unsupported. Files that yield no text are reported as empty.
    An error in one file is printed and does not stop the others.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Dict mapping file name (not full path) to its extracted text, for
        files that produced non-blank text.
    """
    supported_suffixes = (".pdf", ".docx", ".doc", ".xml")
    extracted_texts = {}

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()

        # Previously directories and unrelated files fell through every branch
        # and were logged as "empty file", which hid what was really skipped.
        if not lowered.endswith(supported_suffixes) or not os.path.isfile(file_path):
            print(f"Skipping unsupported entry: {filename}")
            continue

        try:
            if lowered.endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif lowered.endswith(".docx"):
                text = extract_text_from_docx(file_path)
            elif lowered.endswith(".doc"):
                text = extract_text_from_doc(file_path)
            else:
                text = extract_text_from_xml(file_path)
        except Exception as e:
            print(f"Error extracting text from {filename}: {e}")
            continue

        if text and text.strip():
            extracted_texts[filename] = text
        else:
            print(f"Skipping empty file: {filename}")

    return extracted_texts

# Folder holding the resume files to be parsed and indexed.
folder_path = r"C:\Users\Divya_prasath\Desktop\task\index resumes"
# Maps file name -> extracted text; consumed by the LLM extraction loop further down.
text_files = extract_text_from_folder(folder_path)


# The Gemini credential is read from the GOOGLE_API_KEY environment variable.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # The message names the variable that is actually read; it previously said
    # GEMINI_API_KEY, which sent users off to set the wrong variable.
    raise ValueError("GOOGLE_API_KEY is not set. Please set it as an environment variable.")

# Chat model used for field extraction; temperature=0 is requested for the
# extraction calls made through `chain` below.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=api_key,
)


# Schema of the fields extracted from one resume. It is passed to
# JsonOutputParser below and also used to validate the parsed LLM output.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a model's docstring into its schema, which would change the
# format instructions sent to the LLM.
class Resume(BaseModel):
    # All four fields are required strings (no defaults are declared).
    name: str = Field(description="name from resume")
    phone: str = Field(description="phone number from resume")
    email: str = Field(description="email from resume")
    # NOTE(review): the prompt asks for "skills" (plural) while this field is
    # "skill" -- presumably intentional; confirm.
    skill: str = Field(description="skill from resume")

# Parses the model reply as JSON; Resume supplies the expected structure.
parser = JsonOutputParser(pydantic_object=Resume)

# Prompt layout: fixed instruction, then the parser's format instructions
# (filled in once here), then the resume text supplied per call as {query}.
prompt = PromptTemplate(
    template="Extract name, phone, email, skills from the given text resume.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: render prompt -> call the model -> parse the reply.
chain = prompt | llm | parser

# Extract structured fields from each resume with the LLM and index the result.
# Extraction is attempted up to MAX_ATTEMPTS times per file, with exponential
# back-off (2s, 4s) between attempts.
MAX_ATTEMPTS = 3
failed_files = []

for filename, text in text_files.items():
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            extracted_data = chain.invoke({"query": text})
            # Validate the parsed JSON against the Resume schema before indexing.
            doc = Resume(**extracted_data).dict()
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            # Back off only when another attempt follows; sleeping after the
            # last failure just delayed the next file by 8 seconds.
            if attempt < MAX_ATTEMPTS:
                time.sleep(2 ** attempt)
            continue

        try:
            # A UUID replaces the wall-clock timestamp as document ID:
            # timestamps can repeat, and `create` rejects an existing ID.
            es.create(index=ES_CONFIG["index"], id=str(uuid.uuid4()), document=doc)
            print(f"Inserted {filename} into Elasticsearch")
        except Exception as e:
            # An indexing failure is not retried (as before), only recorded.
            print(f"Error inserting {filename}: {e}")
            failed_files.append(filename)

        # Short pause between successful LLM calls.
        time.sleep(1)
        break
    else:
        # Every attempt failed for this file.
        failed_files.append(filename)

# Report the real outcome instead of always claiming success.
if failed_files:
    print(
        f"Data extraction and indexing completed with {len(failed_files)} "
        f"failed file(s): {', '.join(failed_files)}"
    )
else:
    print("Data extraction and indexing completed successfully!")



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet: `from pydantic.v1 import BaseModel`

  exec(code_obj, self.user_global_ns, self.user_ns)
  _transport = transport_class(


Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into Elasticsearch
Inserted record into