# UNSTRUCTURED DATA TO EXTRACT TEXT

In [1]:
import pandas as pd
import pdfplumber
import re
import os
from docx import Document
import nltk
import textwrap  # Import for wrapping text neatly

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    full_text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                cleaned_text = re.sub(r'[●/\\\xa0]', '', text)  # Remove specific punctuation
                cleaned_text = re.sub(r'[\n\t(){}:,"\'.,]', '', cleaned_text)  # Remove other punctuation
                cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  # Remove emojis and non-word characters
                full_text.append(cleaned_text)
    return ' '.join(full_text)  # Join text from all pages

# Function to extract text from a Word document
def extract_text_from_word(doc_path):
    full_text = []
    doc = Document(doc_path)
    
    for para in doc.paragraphs:
        line = re.sub(r'[●/\\\xa0]', '', para.text)  # Remove specific punctuation
        line = re.sub(r'[\n\t(){}:,"\'.,]', '', line)  # Remove other punctuation
        line = re.sub(r'[^\w\s]', '', line)  # Remove emojis and other non-word characters
        full_text.append(line)
    
    return ' '.join(full_text)

# Function to create chunks based on sentences
def create_chunks(text, max_chunk_length=100):
    sentences = nltk.sent_tokenize(text)  # Tokenize text into sentences
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())  # Add current chunk
            current_chunk = sentence  # Start a new chunk with the current sentence

    if current_chunk:  # Add the last chunk
        chunks.append(current_chunk.strip())

    return chunks

# Function to wrap text for better alignment
def format_chunks(chunk):
    wrapped_text = textwrap.fill(chunk, width=80)  # Wrap the text at 80 characters width
    return wrapped_text

# Folder path containing the resumes
folder_path = r"C:/mvi/New folder (4)/resume 2"

# Initialize a list to hold all extracted data
all_data = []

# Iterate over each file in the directory
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    resume_data = {
        "filename": filename,
        "content": ""
    }
    
    if filename.endswith('.pdf'):
        print(f"Processing PDF: {filename}")
        pdf_text = extract_text_from_pdf(file_path)
        resume_data["content"] = pdf_text
    elif filename.endswith('.docx'):
        print(f"Processing Word document: {filename}")
        word_text = extract_text_from_word(file_path)
        resume_data["content"] = word_text
    
    all_data.append(resume_data)

# Process each resume and create chunks
for resume in all_data:
    print(f"Filename: {resume['filename']}")
    chunks = create_chunks(resume['content'])
    for chunk in chunks:
        formatted_chunk = format_chunks(chunk)  # Format chunk for better readability
        print(f"Chunk:\n{formatted_chunk}\n")
        
print("Data extraction and structuring complete.")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Processing Word document: Aarthi Data Engineer.docx
Processing PDF: Avinash Kormatha.pdf
Filename: Aarthi Data Engineer.docx
Chunk:


Chunk:
AARTHI AWASTHI aarthi211097gmailcom 316 530 1525 SUMMARY Over 7 years of IT
experience in Design Development Maintenance and Support of Big Data
Applications Experience in Data Engineering Data Pipeline Design Development and
Implementation as a Data EngineerData Developer and Data Modeler Optimized data
queries and data processing tasks within Azure Synapse Analytics and Azure Data
Factory improving performance and efficiency Experience on Migrating SQL
database to Azure Data Lake Azure Data Lake Analytics Azure SQL Database Azure
Data Bricks and Azure SQL Data Warehouse and controlling and granting database
access and Migrating Onpremises databases to Azure Data Lake store using Azure
Data Factory Strong experience in Software Development Life Cycle SDLC including
Requirements Analysis Design Specification and Testing as per Cycle in both
Waterf

# connect with mongodb atlas

In [2]:
from pymongo import MongoClient

In [3]:
# Connect to MongoDB
client = MongoClient('mongodb+srv://unstructured:unstructured@cluster0.01ejx.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0')

In [4]:
db = client['db2']
collection = db['unstructure_extract_texts']

In [5]:
# Remove duplicates from the collection
pipeline = [
    {"$group": {
        "_id": {"filename": "$filename", "chunk": "$chunk"},
        "count": {"$sum": 1}
    }},
    {"$match": {"count": {"$gt": 1}}}
]

duplicates = collection.aggregate(pipeline)

for doc in duplicates:
    # Remove all duplicates except one
    collection.delete_many({"filename": doc["_id"]["filename"], "chunk": doc["_id"]["chunk"]})

# Now, create the unique index
collection.create_index([("filename", 1), ("chunk", 1)], unique=True)


'filename_1_chunk_1'

In [7]:
# Process each resume and create chunks, then upload to MongoDB
for resume in all_data:
    print(f"Filename: {resume['filename']}")
    chunks = create_chunks(resume['content'])
    for chunk in chunks:
        if not chunk.strip():  # Skip empty chunks
            print(f"Skipping empty chunk for {resume['filename']}\n")
            continue
        
        document = {
            "filename": resume['filename'],
            "chunk": chunk
        }
        try:
            # Insert the document into the collection
            collection.insert_one(document)
            print(f"Inserted chunk into MongoDB: {chunk}\n")
        except Exception as e:
             print(f"Chunk:\n{formatted_chunk}\n")
                    


Filename: Aarthi Data Engineer.docx
Skipping empty chunk for Aarthi Data Engineer.docx

Chunk:
Avinash KormathaPhone 512 8511476Email avinashkormathagmailcomPROFESSIONAL
SUMMARY Over 6 years of experience in design development and analysis of Python
Django and clientservertechnologiesbased applications Good experience in various
phases of SDLC Requirement Analysis Design Development and Testing onvarious
Development and Enhancement Projects Hands on experience in Agile Methodologies
Scrum stories and sprints experience in a Python basedenvironment Experience in
Object Oriented Design and Programming concepts using Python 3x and Java
Experience working on several Standard Python Packages like NumPy Pandas
Matplotlib PySide SciPywxPython PyTables etc Knowledge about setting up Python
REST API Framework using Django Experience working with Python ORM Libraries
including Django ORM Experience in implementing Model View Control MVC
architecture using serverside applications likeDjango Flask