# RESUME PDF , WORD TO EXTRACT TEXT

In [1]:
import pandas as pd
import pdfplumber
import re
import os
from docx import Document  # Import for handling Word documents
import nltk  # Ensure you have NLTK installed for sentence tokenization

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    full_text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                # Remove unwanted punctuation including specific symbols and unwanted characters
                cleaned_text = re.sub(r'[●/\\\xa0]', '', text)  # Remove specific punctuation
                cleaned_text = re.sub(r'[\n\t(){}:,"\'.,]', '', cleaned_text)  # Remove other punctuation
                cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  # Remove emojis and any other non-word characters
                full_text.append(cleaned_text)
    return ' '.join(full_text)  # Return the full text as a single string

# Function to extract text from a Word document
def extract_text_from_word(doc_path):
    full_text = []
    doc = Document(doc_path)
    
    for para in doc.paragraphs:
        line = re.sub(r'[●/\\\xa0]', '', para.text)  # Remove specific punctuation
        line = re.sub(r'[\n\t(){}:,"\'.,]', '', line)  # Remove other punctuation
        line = re.sub(r'[^\w\s]', '', line)  # Remove emojis and any other non-word characters
        full_text.append(line)
    
    return ' '.join(full_text)  # Return the full text as a single string

# Function to create chunks based on sentences
def create_chunks(text, max_chunk_length=250):
    sentences = nltk.sent_tokenize(text)  # Tokenize text into sentences
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())  # Append the current chunk
            current_chunk = sentence  # Start a new chunk with the current sentence

    if current_chunk:  # Append the last chunk if any
        chunks.append(current_chunk.strip())

    return chunks

# Specify the folder path containing the resumes
folder_path = r"C:\mvi\New folder (4)\resume 73 pdf ,word"

# Initialize a list to hold all extracted data
all_data = []

# Iterate over each file in the directory
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    resume_data = {
        "filename": filename,
        "content": ""
    }
    
    if filename.endswith('.pdf'):
        print(f"Processing PDF: {filename}")
        pdf_text = extract_text_from_pdf(file_path)
        resume_data["content"] = pdf_text
    elif filename.endswith('.docx'):
        print(f"Processing Word document: {filename}")
        word_text = extract_text_from_word(file_path)
        resume_data["content"] = word_text
    
    all_data.append(resume_data)

# Process each resume and create chunks
for resume in all_data:
    print(f"Filename: {resume['filename']}")
    chunks = create_chunks(resume['content'])
    for chunk in chunks:
        print(f"Chunk: {chunk}\n")
        
print("Data extraction and structuring complete.")


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Processing Word document: Aarthi Data Engineer.docx
Processing Word document: Adi Vishnu M365.docx
Processing PDF: AG SHAH.pdf
Processing Word document: Akhil Business Analyst.docx
Processing Word document: Akilesh K.docx
Processing Word document: Aman Kansal.docx
Processing Word document: Aniket SSIS Data Engineer.docx
Processing Word document: Asad Khan M365 Architect.docx
Processing PDF: Avinash Kormatha.pdf
Processing Word document: BALAVANI TARIGONDA.docx
Processing Word document: Basanta Sigdel.docx
Processing Word document: BA_HenokD2022_updated_copy (1).docx
Processing Word document: BA_HenokD2022_updated_copy.docx
Processing Word document: Bharath koppera (1).docx
Processing Word document: Bharath koppera.docx
Processing Word document: Brilland - Java Developer.docx
Processing Word document: Chandu MuleSoft Developer.docx
Processing PDF: chemist_job_ref.pdf
Processing Word document: Chiranjeevi DevOps Engineer.docx
Processing Word document: Christopher Telecom Specialist - GA.

# connect to python pymongo atlas

In [6]:
!pip install pymongo





In [7]:
from pymongo import MongoClient

In [8]:
# Connect to MongoDB
client = MongoClient('mongodb+srv://privasapien:privasapien@cluster0.01ejx.mongodb.net/')


In [9]:
db = client['db1']
collection = db['privasapien_project']

In [13]:
# Remove duplicates from the collection
pipeline = [
    {"$group": {
        "_id": {"filename": "$filename", "chunk": "$chunk"},
        "count": {"$sum": 1}
    }},
    {"$match": {"count": {"$gt": 1}}}
]

duplicates = collection.aggregate(pipeline)

for doc in duplicates:
    # Remove all duplicates except one
    collection.delete_many({"filename": doc["_id"]["filename"], "chunk": doc["_id"]["chunk"]})

# Now, create the unique index
collection.create_index([("filename", 1), ("chunk", 1)], unique=True)


'filename_1_chunk_1'

In [14]:
# Process each resume and create chunks, then upload to MongoDB
for resume in all_data:
    print(f"Filename: {resume['filename']}")
    chunks = create_chunks(resume['content'])
    for chunk in chunks:
        if not chunk.strip():  # Skip empty chunks
            print(f"Skipping empty chunk for {resume['filename']}\n")
            continue
        
        document = {
            "filename": resume['filename'],
            "chunk": chunk
        }
        try:
            # Insert the document into the collection
            collection.insert_one(document)
            print(f"Inserted chunk into MongoDB: {chunk}\n")
        except Exception as e:
            print(f"Error inserting chunk: {e}\n")


Filename: Aarthi Data Engineer.docx
Skipping empty chunk for Aarthi Data Engineer.docx

Inserted chunk into MongoDB: AARTHI AWASTHI aarthi211097gmailcom 316 530 1525 SUMMARY Over 7 years of IT experience in Design Development Maintenance and Support of Big Data Applications Experience in Data Engineering Data Pipeline Design Development and Implementation as a Data EngineerData Developer and Data Modeler Optimized data queries and data processing tasks within Azure Synapse Analytics and Azure Data Factory improving performance and efficiency Experience on Migrating SQL database to Azure Data Lake Azure Data Lake Analytics Azure SQL Database Azure Data Bricks and Azure SQL Data Warehouse and controlling and granting database access and Migrating Onpremises databases to Azure Data Lake store using Azure Data Factory Strong experience in Software Development Life Cycle SDLC including Requirements Analysis Design Specification and Testing as per Cycle in both Waterfall and Agile methodolog

# download the output csv file

In [19]:
from IPython.display import FileLink
all_data.append(resume_data)

# Convert all_data to a DataFrame
df = pd.DataFrame(all_data)

# Save to CSV
csv_file_path = "all_data.csv"
df.to_csv(csv_file_path, index=False)

# Create a download link
display(FileLink(csv_file_path))

# download the output text file

In [16]:
 all_data.append(resume_data)

# Save all_data to a text file
text_file_path = "all_data.txt"
with open(text_file_path, 'w', encoding='utf-8') as f:
    for resume in all_data:
        f.write(f"Filename: {resume['filename']}\n")
        f.write(f"Content: {resume['content']}\n\n")

# Create a download link
display(FileLink(text_file_path))