In [6]:
import os
import pdfplumber
import pandas as pd
import time
import xml.etree.ElementTree as ET
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from docx import Document 
from win32com import client 

# Function to extract text from different file formats
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts. A page for which ``extract_text()``
        yields nothing (for example a scanned, image-only page) contributes
        just its trailing newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # ``extract_text()`` may give a falsy value (None) for a page without
        # a text layer; ``None + "\n"`` would raise TypeError and lose the
        # whole document, so substitute an empty string.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

def extract_text_from_docx(docx_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of every paragraph in ``doc.paragraphs``, joined with
        newlines (empty paragraphs are kept as empty lines).
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_text_from_doc(doc_path):
    """Return the text of a legacy .doc file by automating Microsoft Word.

    Args:
        doc_path: Path of the .doc file to read.

    Returns:
        The document's ``Content.Text`` as reported by Word.

    Raises:
        Whatever the COM layer raises, e.g. when Word is not installed or the
        file cannot be opened. The document and the Word instance are released
        even in that case.
    """
    word = client.Dispatch("Word.Application")
    try:
        # NOTE(review): Word presumably resolves relative paths against its
        # own working directory rather than Python's, so pass an absolute
        # path — confirm against the Word automation docs.
        doc = word.Documents.Open(os.path.abspath(doc_path))
        try:
            return doc.Content.Text
        finally:
            # False = do not save changes; the file is only being read.
            doc.Close(False)
    finally:
        # Always quit, otherwise a failed Open leaves Word running.
        word.Quit()

def extract_text_from_xml(xml_path):
    """Return all human-readable text of an XML file as one space-joined string.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        Every non-blank text fragment of the document, in document order,
        stripped and joined with single spaces.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be read.
    """
    root = ET.parse(xml_path).getroot()
    # itertext() yields both element text and tail text (text that follows a
    # child element); reading only ``elem.text`` silently drops the latter.
    fragments = (fragment.strip() for fragment in root.itertext())
    # Skip whitespace-only fragments, which are just indentation between tags.
    return " ".join(fragment for fragment in fragments if fragment)

def extract_text_from_folder(folder_path):
    """Extract text from every supported file directly inside a folder.

    Supported extensions (case-insensitive): .pdf, .docx, .doc and .xml.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        A dict mapping file name to extracted text. Files with an unsupported
        extension, files that yield no text, and files whose extraction
        raises (the error is printed) are left out.
    """
    results = {}
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        lowered = entry.lower()
        content = None

        try:
            if lowered.endswith(".pdf"):
                content = extract_text_from_pdf(full_path)
            elif lowered.endswith(".docx"):
                content = extract_text_from_docx(full_path)
            elif lowered.endswith(".doc"):
                content = extract_text_from_doc(full_path)
            elif lowered.endswith(".xml"):
                content = extract_text_from_xml(full_path)
        except Exception as e:
            print(f"Error extracting text from {entry}: {e}")
        else:
            # Only keep files that actually produced some text.
            if content:
                results[entry] = content

    return results

# Folder containing resumes
folder_path = r"C:\Users\Divya_prasath\Downloads\Profiles 1"
text_files = extract_text_from_folder(folder_path)

# Initialize Gemini LLM model.
# The API key is read from the environment instead of being embedded in the
# source: a key written into a notebook is visible to everyone who can read
# the file, and any key that was committed this way should be revoked.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY is not set. Please set it as an environment variable.")

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=api_key,
)

# Define data structure for extracted resume details
# Every field is declared as a plain string and carries a description via
# Field(); the class is handed to JsonOutputParser(pydantic_object=Resume)
# right after this definition.
# NOTE(review): a class docstring is deliberately not added here — pydantic
# presumably copies it into the generated schema, which would change the
# format instructions sent to the model. Confirm before adding one.
class Resume(BaseModel):
    name: str = Field(description="name from resume")  # candidate name
    phone: str = Field(description="phone number from resume")  # kept as text, not a number
    email: str = Field(description="email from resume")  # contact e-mail address
    skill: str = Field(description="skill from resume")  # skills as a single string

# Output parser built around the Resume schema
parser = JsonOutputParser(pydantic_object=Resume)

# Prompt layout: fixed instruction, then the parser's format instructions,
# then the resume text supplied as {query}
prompt = PromptTemplate(
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=(
        "Extract name, phone, email, skills from the given text resume.\n"
        "{format_instructions}\n"
        "{query}\n"
    ),
)

# Pipeline: prompt -> model -> parser
chain = prompt | llm | parser

# Run every extracted resume text through the chain, retrying on failure.
# Each file gets up to MAX_ATTEMPTS tries with exponential backoff (2 s, 4 s)
# between them; successful results are collected in `details`.
MAX_ATTEMPTS = 3

details = []
failed_files = []  # files that never succeeded, reported once the loop ends
for filename, text in text_files.items():
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            extracted_details = chain.invoke({"query": text})
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            # Back off only when another attempt follows; sleeping after the
            # final failure just delays the next file.
            if attempt < MAX_ATTEMPTS:
                time.sleep(2 ** attempt)
        else:
            details.append(extracted_details)
            # Short pause between successful calls to stay under the rate limit.
            time.sleep(1)
            break
    else:
        # All attempts failed: remember the file instead of dropping it silently.
        failed_files.append(filename)

if failed_files:
    print(f"Giving up on {len(failed_files)} file(s) after {MAX_ATTEMPTS} attempts: {', '.join(failed_files)}")

# Tabulate the extracted records, one row per successfully processed resume
df = pd.DataFrame(details)

# Resolve the destination and make sure its folder exists before writing
output_folder = r"C:\Users\Divya_prasath\Desktop\task\extracted_data"
csv_output_path = os.path.join(output_folder, "resume.csv")
os.makedirs(output_folder, exist_ok=True)

# Write the table without the DataFrame index column
df.to_csv(csv_output_path, index=False)
print(f"Extracted resume details saved to: {csv_output_path}")


Error extracting text from Naukri_JAYAKUMART[2y_5m].doc: (-2147221005, 'Invalid class string', None, None)
Error extracting text from Naukri_MrRamesh[12y_0m].doc: (-2147221005, 'Invalid class string', None, None)


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_DuvarakeshRavi[3y_3m] 1.pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_DuvarakeshRavi[3y_3m] 1.pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_JisinJoseph[1y_9m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_JisinJoseph[1y_9m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_JisinJoseph[1y_9m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_RabinsinghR[3y_5m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_RabinsinghR[3y_5m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_RabinsinghR[3y_5m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_SheikAbdulla[1y_3m] 1.docx: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_VetrivelMani[9y_0m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_VetrivelMani[9y_0m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error processing Naukri_VigneshMohandas[5y_3m].pdf: 429 Resource has been exhausted (e.g. check quota).


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Extracted resume details saved to: C:\Users\Divya_prasath\Desktop\task\extracted_data\resume.csv


In [None]:
from elasticsearch import Elasticsearch, helpers
import pandas as pd
import os
import numpy as np
from dotenv import load_dotenv


load_dotenv()

# Fail fast when the password is missing: os.getenv() would otherwise return
# None, which is passed on as a credential and only surfaces later as an
# opaque authentication error.
es_password = os.getenv("ES_PASSWORD")
if not es_password:
    raise ValueError("ES_PASSWORD is not set. Please set it in the environment or in the .env file.")

ES_CONFIG = {
    "host": "https://localhost:9200",
    "username": "elastic",
    "password": es_password,
    "index": "task3"
}

# Initialize Elasticsearch connection
# NOTE(review): verify_certs=False turns off TLS certificate verification.
# That is tolerable for a local node with a self-signed certificate, but must
# not be used against a remote cluster.
es = Elasticsearch(
    ES_CONFIG["host"],
    basic_auth=(ES_CONFIG["username"], ES_CONFIG["password"]),
    verify_certs=False
)

csv_file_path = r"C:\Users\Divya_prasath\Desktop\task\extracted_data\resume.csv"

def push_data_to_elasticsearch(csv_file, es_client, index_name):
    """Bulk-index every row of a CSV file as one Elasticsearch document.

    Args:
        csv_file: Path of the CSV file to load.
        es_client: Elasticsearch client passed to ``helpers.bulk``.
        index_name: Name of the index the documents are written to.

    Returns:
        None. Progress and errors are reported with ``print``; a missing or
        empty CSV is reported and skipped rather than raised.
    """
    if not os.path.exists(csv_file):
        print("CSV file not found!")
        return

    try:
        # Read every column as text. Letting pandas infer types turns phone
        # numbers into ints/floats ("9876543210.0", lost leading zeros or "+").
        df = pd.read_csv(csv_file, dtype=str)
    except pd.errors.EmptyDataError:
        print("CSV file is empty; nothing to index.")
        return

    # Missing cells are NaN after read_csv; convert them to None so they are
    # serialized as JSON null instead of an invalid NaN token.
    df = df.astype(object).where(df.notna(), None)

    records = df.to_dict(orient="records")
    if not records:
        print("CSV file contains no rows; nothing to index.")
        return

    actions = (
        {
            "_index": index_name,
            "_source": record
        }
        for record in records
    )

    try:
        # bulk() returns (number_of_successes, errors); report the real count.
        indexed, _ = helpers.bulk(es_client, actions)
        print(f"Successfully indexed {indexed} documents into Elasticsearch!")
    except helpers.BulkIndexError as e:
        print(f"Bulk indexing error: {e}")
        for error in e.errors:
            print(error)

# Index the CSV produced by the extraction step into the configured index
push_data_to_elasticsearch(
    csv_file=csv_file_path,
    es_client=es,
    index_name=ES_CONFIG["index"],
)


  _transport = transport_class(


Successfully indexed 105 documents into Elasticsearch!


In [23]:
from elasticsearch import Elasticsearch, helpers
import os
import pandas as pd
import time
import numpy as np
import pdfplumber
import xml.etree.ElementTree as ET
from docx import Document
from win32com import client
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
import urllib3
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Silences urllib3 warnings, including the one triggered by verify_certs=False
urllib3.disable_warnings()

# Fail fast when the password is missing: os.getenv() would otherwise return
# None, which is passed on as a credential and only surfaces later as an
# opaque authentication error.
es_password = os.getenv("ES_PASSWORD")
if not es_password:
    raise ValueError("ES_PASSWORD is not set. Please set it in the environment or in the .env file.")

# Elasticsearch configuration
ES_CONFIG = {
    "host": "https://localhost:9200",
    "username": "elastic",
    "password": es_password,  # Loaded from the environment / .env file
    "index": "task3"
}

# Initialize Elasticsearch connection
# NOTE(review): verify_certs=False turns off TLS certificate verification.
# That is tolerable for a local node with a self-signed certificate, but must
# not be used against a remote cluster.
es = Elasticsearch(
    ES_CONFIG["host"],
    basic_auth=(ES_CONFIG["username"], ES_CONFIG["password"]),
    verify_certs=False
)


def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, one page per line block, skipping empty pages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The texts of all pages that produced any text, joined with newlines.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page once; calling extract_text() in both the filter
        # and the value position parses every page twice.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraphs of a .docx file, one per line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of every paragraph that is not empty or whitespace-only,
        joined with newlines. Paragraph text itself is not stripped.
    """
    kept = []
    for paragraph in Document(docx_path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)

def extract_text_from_doc(doc_path):
    """Return the stripped text of a legacy .doc file by automating Word.

    Args:
        doc_path: Path of the .doc file to read.

    Returns:
        The document's ``Content.Text`` with surrounding whitespace removed,
        or ``""`` when anything goes wrong (the error is printed). The
        document and the Word instance are released in either case.
    """
    word = None
    try:
        word = client.Dispatch("Word.Application")
        word.Visible = False
        # NOTE(review): Word presumably resolves relative paths against its
        # own working directory rather than Python's, so pass an absolute
        # path — confirm against the Word automation docs.
        doc = word.Documents.Open(os.path.abspath(doc_path))
        try:
            return doc.Content.Text.strip()
        finally:
            # False = do not save changes; the file is only being read.
            doc.Close(False)
    except Exception as e:
        print(f"Error processing DOC file {doc_path}: {e}")
        return ""
    finally:
        # Quit Word even when Open or the text read failed; otherwise every
        # failing file leaves a Word process running.
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                print(f"Error closing Word after {doc_path}: {e}")

def extract_text_from_xml(xml_path):
    """Return all human-readable text of an XML file as one space-joined string.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        Every non-blank text fragment of the document, in document order,
        stripped and joined with single spaces; ``""`` when the file cannot
        be read or parsed (the error is printed).
    """
    try:
        root = ET.parse(xml_path).getroot()
        # itertext() yields both element text and tail text (text that follows
        # a child element); reading only ``elem.text`` drops the latter.
        fragments = (fragment.strip() for fragment in root.itertext())
        # Drop fragments that are empty after stripping; joining them would
        # leave stray spaces where the XML only had indentation.
        return " ".join(fragment for fragment in fragments if fragment)
    except Exception as e:
        print(f"Error parsing XML {xml_path}: {e}")
        return ""


def extract_text_from_folder(folder_path):
    """Extract text from every supported file directly inside a folder.

    Supported extensions (case-insensitive): .pdf, .docx, .doc and .xml.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        A dict mapping file name to extracted text. Unsupported files, files
        that yield no text, and files whose extraction raises are left out;
        each case is reported with its own printed message.
    """
    extracted_texts = {}
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()

        try:
            if lowered.endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif lowered.endswith(".docx"):
                text = extract_text_from_docx(file_path)
            elif lowered.endswith(".doc"):
                text = extract_text_from_doc(file_path)
            elif lowered.endswith(".xml"):
                text = extract_text_from_xml(file_path)
            else:
                # An unsupported type is not an "empty file"; say so, so the
                # log does not suggest the file was read and found blank.
                print(f"Skipping unsupported file: {filename}")
                continue
        except Exception as e:
            print(f"Error extracting text from {filename}: {e}")
            continue

        if text and text.strip():
            extracted_texts[filename] = text
        else:
            print(f"Skipping empty file: {filename}")

    return extracted_texts


# Folder containing the resumes to extract and index
folder_path = r"C:\Users\Divya_prasath\Desktop\task\index resumes"
text_files = extract_text_from_folder(folder_path)


# The key is read from GOOGLE_API_KEY, so the error message must name that
# same variable; it previously pointed at GEMINI_API_KEY, which is never read.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY is not set. Please set it as an environment variable.")

# Initialize Gemini LLM model
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=api_key,
)


# Data structure for the details extracted from one resume.
# Every field is declared as a plain string and carries a description via
# Field(); the class is handed to JsonOutputParser(pydantic_object=Resume)
# and is also instantiated from the parsed model output before indexing.
# NOTE(review): a class docstring is deliberately not added here — pydantic
# presumably copies it into the generated schema, which would change the
# format instructions sent to the model. Confirm before adding one.
class Resume(BaseModel):
    name: str = Field(description="name from resume")  # candidate name
    phone: str = Field(description="phone number from resume")  # kept as text, not a number
    email: str = Field(description="email from resume")  # contact e-mail address
    skill: str = Field(description="skill from resume")  # skills as a single string

# Output parser built around the Resume schema
parser = JsonOutputParser(pydantic_object=Resume)

# Prompt layout: fixed instruction, then the parser's format instructions,
# then the resume text supplied as {query}
prompt = PromptTemplate(
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=(
        "Extract name, phone, email, skills from the given text resume.\n"
        "{format_instructions}\n"
        "{query}\n"
    ),
)

# Pipeline: prompt -> model -> parser
chain = prompt | llm | parser

# Extract structured details from each resume and index them into
# Elasticsearch. Each file gets up to MAX_ATTEMPTS tries with exponential
# backoff (2 s, 4 s) between them.
MAX_ATTEMPTS = 3

failed_files = []  # files that never succeeded, reported once the loop ends
for filename, text in text_files.items():
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            extracted_data = chain.invoke({"query": text})
            # Validate the model output against the Resume schema before indexing.
            extracted_details = Resume(**extracted_data)
            doc = extracted_details.dict()

            es.index(index=ES_CONFIG["index"], document=doc)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            # Back off only when another attempt follows; sleeping after the
            # final failure just delays the next file.
            if attempt < MAX_ATTEMPTS:
                time.sleep(2 ** attempt)
        else:
            print(f"Inserted {filename} into Elasticsearch")
            # Short pause between successful calls to stay under the rate limit.
            time.sleep(1)
            break
    else:
        # All attempts failed: remember the file instead of dropping it silently.
        failed_files.append(filename)

# Report success only when every file made it into the index.
if failed_files:
    print(f"Data extraction and indexing completed with {len(failed_files)} failed file(s): {', '.join(failed_files)}")
else:
    print("Data extraction and indexing completed successfully!")


  _transport = transport_class(


Inserted DhineshKumar_SDE_Resume.pdf into Elasticsearch
Inserted hi.pdf into Elasticsearch
Inserted Jaya Sanjay - Resume.pdf into Elasticsearch
Inserted NILESH SRINIVASAN.pdf into Elasticsearch
Data extraction and indexing completed successfully!
