In [None]:
def process_cv(docx_path: str) -> Dict:
    """
    Process a DOCX CV and extract structured information.
    Args:
        docx_path (str): Path to the DOCX file
    Returns:
        Dict: Dictionary containing structured CV information
    """
    doc = Document(docx_path)
    print(doc.paragraphs.)
    cv_data = {}
    current_section = None
    section_content = []
    
    # Headers we want to identify (in order they appear in the document)
    section_headers = {
        'EDUCATION': 'education',
        'SKILLS': 'skills',
        'PROFESSIONAL EXPERIENCE AND INTERNSHIPS': 'experience',
        'PROJECTS': 'projects',
        'RESEARCH PAPERS': 'research_papers',
        'ACHIEVEMENTS': 'achievements'  # Added achievements section
    }

    # Extract contact information from the first paragraph
    first_para = doc.paragraphs[0].text.strip()
    
    # Parse contact info
    contact_parts = first_para.split('|')
    contact_info = {
        'email': contact_parts[0].strip(),
        'phone': contact_parts[1].strip(),
        'linkedin': contact_parts[2].strip(),
        'github': contact_parts[3].strip(),
        'portfolio': contact_parts[4].strip(),
        'kaggle': contact_parts[5].strip(),
        'tableau': contact_parts[6].strip() if len(contact_parts) > 6 else None
    }
    cv_data['contact_info'] = contact_info

    # Process each paragraph to extract sections
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:  # Skip empty paragraphs
            continue

        # Check if this is a section header
        is_header = False
        for header, section_name in section_headers.items():
            if header in text.upper():
                # If we were building a previous section, save it
                if current_section and section_content:
                    cv_data[current_section] = '\n'.join(section_content)
                # Start new section
                current_section = section_name
                section_content = []
                is_header = True
                break

        # If not a header and we're in a section, add to content
        if not is_header and current_section:
            # Skip the section header itself
            if text.upper() not in section_headers:
                section_content.append(text)

    # Add the last section
    if current_section and section_content:
        cv_data[current_section] = '\n'.join(section_content)

    # Post-process skills section if it wasn't properly captured
    if 'skills' not in cv_data or not cv_data['skills'].strip():
        # Look for skills section specifically
        skills_text = []
        found_skills = False
        for para in doc.paragraphs:
            text = para.text.strip()
            if 'SKILLS' in text.upper():
                found_skills = True
                continue
            elif found_skills and any(header in text.upper() for header in section_headers.keys()):
                break
            elif found_skills and text:
                skills_text.append(text)
        if skills_text:
            cv_data['skills'] = '\n'.join(skills_text)

    # Ensure achievements are captured
    if 'ACHIEVEMENTS' not in cv_data:
        achievements_text = []
        found_achievements = False
        for para in doc.paragraphs:
            text = para.text.strip()
            if 'ACHIEVEMENTS' in text.upper():
                found_achievements = True
                continue
            elif found_achievements and text and not any(header in text.upper() for header in section_headers.keys()):
                achievements_text.append(text)
        if achievements_text:
            cv_data['achievements'] = '\n'.join(achievements_text)

    return cv_data



In [132]:
def print_all_paragraphs(docx_path: str):
    """
    Print all paragraphs from the DOCX file with their index and text
    """
    doc = Document(docx_path)
    
    print("\nAll Paragraphs in Document:")
    print("="*50)
    for idx, paragraph in enumerate(doc.paragraphs):
        if paragraph.text.strip():  # Only print non-empty paragraphs
            print(f"[{idx}] {paragraph.text}")
            print("-"*50)

In [133]:
docx_path1="./Faseeh Resume GA11.docx"

In [139]:
def print_all_paragraphs(docx_path: str):
    """
    Print all paragraphs from the DOCX file with their index and text
    """
    doc = Document(docx_path)
    
    print("\nAll Paragraphs in Document:")
    print("="*50)
    for idx, paragraph in enumerate(doc.paragraphs):
        if paragraph.text.strip():  # Only print non-empty paragraphs
            print(f"[{idx}] {paragraph.text}  ")
            if paragraph.hyperlinks:
                    for hyperlink in paragraph.hyperlinks:
                        print(f"Text: {hyperlink.text}")
                        print(f"URL: {hyperlink.url}")
            print("-"*50)

In [141]:
ddf=print_all_paragraphs(docx_path)
print(ddf)


All Paragraphs in Document:
[0]                             Mohammad Faseeh Ahmed  
--------------------------------------------------
[1] mm9314@g.rit.edu | +1 585 202 5217 | LinkedIn | Github | Portfolio | Kaggle | Tableau   
Text: mm9314@g.rit.edu
URL: mailto:mm9314@g.rit.edu
Text: LinkedIn 
URL: https://www.linkedin.com/in/mohammad-faseeh-ahmed/
Text: Github
URL: https://github.com/faseehahmed26
Text: Portfolio
URL: https://faseehahmed26.github.io/portfolio/
Text: Kaggle
URL: https://www.kaggle.com/faseeh001
Text: Tableau
URL: https://public.tableau.com/app/profile/faseeh5112
--------------------------------------------------
[3] EDUCATION  
--------------------------------------------------
[5] Rochester Institute of Technology, Rochester, NY,  M.S in Data Science	      Expected May 2025  
--------------------------------------------------
[6] Coursework: Neural Networks, Software Engineering for Data Science, Applied Statistics.                         GPA: 3.84/4.00  
---------

In [137]:
def extract_static_content(doc: Document) -> Dict:
    """Extract static content (header info) with hyperlinks"""
    static_content = {
        'contact_info': {
            'email': '',
            'phone': '',
            'links': {}
        }
    }
    
    # Process first paragraph for contact info
    if doc.paragraphs[0].hyperlinks:
        print(doc.paragraphs[0].text)
        for hyperlink in doc.paragraphs[0].hyperlinks:
            if '@' in hyperlink.text:
                static_content['contact_info']['email'] = hyperlink.text
            elif any(x in hyperlink.text.lower() for x in ['linkedin', 'github', 'portfolio', 'kaggle', 'tableau']):
                static_content['contact_info']['links'][hyperlink.text.lower()] = hyperlink.address
            elif any(char.isdigit() for char in hyperlink.text):
                static_content['contact_info']['phone'] = hyperlink.text
    try:
        if static_content['contact_info']['phone'] =='':
            print(doc.paragraphs[0].text.split("|"))
            static_content['contact_info']['phone']=[s.strip() for s in doc.paragraphs[0].text.split("|")  if re.search(r'\b(?:\+?\d[\d\s-]{9,})\b', s)][0]
    except:
        print("No Number")
    return static_content

In [128]:
 extract_static_content(doc)

mm9314@g.rit.edu | +1 585 202 5217 | LinkedIn | Github | Portfolio | Kaggle | Tableau 
['mm9314@g.rit.edu ', ' +1 585 202 5217 ', ' LinkedIn ', ' Github ', ' Portfolio ', ' Kaggle ', ' Tableau ']


{'contact_info': {'email': 'mm9314@g.rit.edu',
  'phone': '+1 585 202 5217',
  'links': {'linkedin ': 'https://www.linkedin.com/in/mohammad-faseeh-ahmed/',
   'github': 'https://github.com/faseehahmed26',
   'portfolio': 'https://faseehahmed26.github.io/portfolio/',
   'kaggle': 'https://www.kaggle.com/faseeh001',
   'tableau': 'https://public.tableau.com/app/profile/faseeh5112'}}}

In [None]:
prompts_dict = {
    'skills': {
        'system': """You are a resume skill optimizer that matches candidate skills with job requirements.""",
        'user': """Given these skills categories and the job description, return only the most relevant skills 
                  organized in the same categories. Limit to 2-3 lines per category.
                  
                  Format the output as a dictionary with categories as keys and lists of skills as values.
                  
                  Skills: {skills}
                  Job Description: {job_description}"""
    },
    'experience': {
        'system': """You are an expert resume writer specializing in the STAR method.""",
        'user': """Convert these experience points into 2-4 powerful bullet points relevant to the job description.
                  Each bullet should:
                  - Start with a strong action verb
                  - Include specific technologies used
                  - Show measurable impact (%, metrics)
                  - Be relevant to the job requirements
                  
                  Experience: {experience}
                  Job Description: {job_description}
                  
                  Format output as a list of bullets."""
    },
    'projects': {
        'system': """You are a technical project curator for resumes.""",
        'user': """Select and optimize 2 most relevant projects for this job description.
                  For each project, provide 2-3 technical bullet points that:
                  - Highlight relevant technologies
                  - Show technical complexity
                  - Include measurable outcomes
                  
                  Projects: {projects}
                  Job Description: {job_description}
                  
                  Format as a dictionary with project names as keys and bullet lists as values."""
    }
}

In [122]:
strings=['mm9314@g.rit.edu ', ' +1 585 202 5217 ', ' LinkedIn ', ' Github ', ' Portfolio ', ' Kaggle ', ' Tableau ']
phone_numbers = [s.strip() for s in strings if re.search(r'\b(?:\+?\d[\d\s-]{9,})\b', s)]
print(phone_numbers)


['+1 585 202 5217']


In [None]:

docx_path = "./Faseeh Curriculum Vitae.docx"
cv_data = process_cv(docx_path)

# Print extracted sections
for section, content in cv_data.items():
    print(f"\n{'='*50}\n{section.upper()}:\n{'='*50}")
    if isinstance(content, dict):  # For contact info
        for key, value in content.items():
            print(f"{key}: {value}")
    else:
        print(content)

In [None]:
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT

In [None]:
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from typing import Dict

def create_resume_docx(cv_data: Dict, output_path: str = 'generated_resume.docx'):
    """
    Create a DOCX resume from structured CV data
    Args:
        cv_data (Dict): Dictionary containing CV sections
        output_path (str): Path to save the generated DOCX
    """
    doc = Document()
    
    # Set up document margins
    sections = doc.sections
    for section in sections:
        section.top_margin = Inches(0.5)
        section.bottom_margin = Inches(0.5)
        section.left_margin = Inches(0.5)
        section.right_margin = Inches(0.5)

    # Add contact information
    contact_info = cv_data.get('contact_info', {})
    if 'name' in contact_info:
        name_paragraph = doc.add_paragraph()
        name_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        name_run = name_paragraph.add_run(contact_info['name'])
        name_run.bold = True
        name_run.font.size = Pt(12)

    # Function to add a section
    def add_section(title: str, content: str):
        # Add section header
        header = doc.add_paragraph()
        header_run = header.add_run(title)
        header_run.bold = True
        header_run.font.size = Pt(12)
        
        # Add section content
        if content:
            content_paragraph = doc.add_paragraph()
            content_run = content_paragraph.add_run(content)
            content_run.font.size = Pt(10)

    # Add each section
    sections_order = [
        ('EDUCATION', cv_data.get('education', '')),
        ('SKILLS', cv_data.get('skills', '')),
        ('PROFESSIONAL EXPERIENCE AND INTERNSHIPS', cv_data.get('experience', '')),
        ('PROJECTS', cv_data.get('projects', '')),
        ('RESEARCH PAPERS', cv_data.get('research_papers', ''))
    ]

    for title, content in sections_order:
        add_section(title, content)

    # Save the document
    doc.save(output_path)
    return output_path


    

In [None]:
from docx import Document

def extract_paragraphs_with_hyperlinks(docx_path):
    doc = Document(docx_path)
    paragraphs = []

    for para in doc.paragraphs:
        # Extract the full paragraph text
        full_text = para.text
        hyperlinks = []

        # Iterate through the runs to find hyperlinks
        for run in para.runs:
            if run.hyperlink:
                hyperlink = run.hyperlink
                display_text = run.text
                url = hyperlink.target_ref
                hyperlinks.append((display_text, url))

        paragraphs.append((full_text, hyperlinks))

    return paragraphs

def format_paragraphs(paragraphs):
    formatted_output = []
    for idx, (text, hyperlinks) in enumerate(paragraphs):
        formatted_output.append(f"[{idx}] {text}")
        for display_text, url in hyperlinks:
            formatted_output.append(f"Text: {display_text}")
            formatted_output.append(f"URL: {url}")
        formatted_output.append("--------------------------------------------------")
    return "\n".join(formatted_output)

# Usage
paragraphs = extract_paragraphs_with_hyperlinks(docx_path)
formatted_text = format_paragraphs(paragraphs)
print(formatted_text)


In [None]:
!pip install docxpy 
import docxpy



In [None]:
doc = docxpy.DOCReader(docx_path)
doc.process()  # Process the document to extract data


In [None]:
doc.data[ 'document']

In [None]:
# Extract hyperlinks
hyperlinks = doc.data['links']

# Extract paragraphs
paragraphs = doc.data[ 'document']

# Function to format and print the document content
def print_document_content(paragraphs, hyperlinks):
    print("All Paragraphs in Document:")
    print("=" * 50)

    for i, para in enumerate(paragraphs):
        # Print paragraph index and content
        print(f"[{i}] {para}")
        print("-" * 50)

        # Check if the paragraph contains any hyperlinks
        if i in hyperlinks:
            for link_text, url in hyperlinks[i]:
                print(f"Text: {link_text}")
                print(f"URL: {url}")
            print("-" * 50)

# Call the function to print the content


In [61]:
doc = Document(docx_path)
extracted_text = []
# print(doc.sections)
# Extract text from the header (if present)
for section in doc.sections:
    header = section.header
#         print(header)
    for paragraph in header.paragraphs:

        if paragraph.text.strip():
            extracted_text.append(paragraph.text)

# Extract text from the body
for paragraph in doc.paragraphs:
    if paragraph.text.strip():
        extracted_text.append(paragraph.text)

# Combine and return the text
full_text = "\n".join(extracted_text)

In [68]:
doc.

AttributeError: 'Document' object has no attribute 'title'

In [63]:
# Extract text from the body
for paragraph in doc.paragraphs:
    if paragraph.text.strip():
        print(paragraph.text)

mm9314@g.rit.edu | +1 585 202 5217 | LinkedIn | Github | Portfolio | Kaggle | Tableau 
EDUCATION
Rochester Institute of Technology, Rochester, NY,  M.S in Data Science	      Expected May 2025
Coursework: Neural Networks, Software Engineering for Data Science, Applied Statistics.                         GPA: 3.84/4.00
Jawaharlal Nehru Technological University Hyderabad, B.Tech in Computer Science	 July 2018 - July 2022
Coursework: Data Structures and Algorithms, Computer Vision, Artificial Intelligence, NLP 	         GPA: 3.2/4.00
SKILLS
Programming Languages: Java, Python, C++, R, JavaScript, Object Oriented Programming(Python, Java)
Frameworks: PyTorch, Keras, Scikit, Tensorflow, Groovy, PySpark, Flask, React, NodeJS,
Databases: SQL, MongoDB, SQLite, MySQL, NoSQL, PostgreSQL, DynamoDB
Technologies/Tools: JSON, Docker, Git, AWS, Kafka, GitLab, Numpy, Pandas, MLflow, Postman, Tableau, Power BI, MLOps
ML Algorithms and Techniques: Regression, Classification, Clustering, Recommender Syste

In [55]:
def iter_headings(paragraphs):
    for paragraph in paragraphs:
        if paragraph.style.name.startswith('Heading'):
            yield paragraph

for heading in iter_headings(doc.paragraphs):
    print(heading.text)

SKILLS
PROFESSIONAL EXPERIENCE AND INTERNSHIPS

PROJECTS
Research Papers
ACHIEVEMENTS


In [49]:

def extract_name_from_docx(docx_path: str):
    """
    Extract text, including headers, to find the name or specific information.
    """
    doc = Document(docx_path)
    extracted_text = []
    print(doc.sections
    # Extract text from the header (if present)
    for section in doc.sections:
        header = section.header
#         print(header)
        for paragraph in header.paragraphs:
            
            if paragraph.text.strip():
                extracted_text.append(paragraph.text)

    # Extract text from the body
    for paragraph in doc.paragraphs:
        if paragraph.text.strip():
            extracted_text.append(paragraph.text)

    # Combine and return the text
    full_text = "\n".join(extracted_text)
    return full_text

# Example usage
extracted_content = extract_name_from_docx(docx_path)
print(extracted_content)

# Extract the first non-empty line as the name (assumption-based)
lines = extracted_content.split("\n")
name = next((line.strip() for line in lines if line.strip()), "Name not found")
print(f"Extracted Name: {name}")


mm9314@g.rit.edu | +1 585 202 5217 | LinkedIn | Github | Portfolio | Kaggle | Tableau 
EDUCATION
Rochester Institute of Technology, Rochester, NY,  M.S in Data Science	      Expected May 2025
Coursework: Neural Networks, Software Engineering for Data Science, Applied Statistics.                         GPA: 3.84/4.00
Jawaharlal Nehru Technological University Hyderabad, B.Tech in Computer Science	 July 2018 - July 2022
Coursework: Data Structures and Algorithms, Computer Vision, Artificial Intelligence, NLP 	         GPA: 3.2/4.00
SKILLS
Programming Languages: Java, Python, C++, R, JavaScript, Object Oriented Programming(Python, Java)
Frameworks: PyTorch, Keras, Scikit, Tensorflow, Groovy, PySpark, Flask, React, NodeJS,
Databases: SQL, MongoDB, SQLite, MySQL, NoSQL, PostgreSQL, DynamoDB
Technologies/Tools: JSON, Docker, Git, AWS, Kafka, GitLab, Numpy, Pandas, MLflow, Postman, Tableau, Power BI, MLOps
ML Algorithms and Techniques: Regression, Classification, Clustering, Recommender Syst

In [None]:
def extract_text_from_docx(docx_path: str):
    """
    Extract text from the main body, headers, footers, and text boxes of a DOCX file.
    """
    doc = Document(docx_path)
    full_text = []

    # Extract text from headers
    for section in doc.sections:
        header = section.header
        for paragraph in header.paragraphs:
            if paragraph.text.strip():
                full_text.append(paragraph.text)

    # Extract text from the main body
    for paragraph in doc.paragraphs:
        if paragraph.text.strip():
            full_text.append(paragraph.text)

    # Extract text from footers
    for section in doc.sections:
        footer = section.footer
        for paragraph in footer.paragraphs:
            if paragraph.text.strip():
                full_text.append(paragraph.text)

    # Extract text from text boxes (inline shapes)
    for shape in doc.inline_shapes:
        if shape.type == 3:  # 3 corresponds to a text box
            if shape.text_frame:
                text = shape.text_frame.text
                if text.strip():
                    full_text.append(text)

    return '\n'.join(full_text)


In [None]:
extract_text_from_docx(docx_path)

In [None]:
doc = Document(docx_path)

title_size = max_size = 0
max_size_text = title = None
for p in doc.paragraphs:
    style = p.style
    if style is not None:
        if style.name == 'Title':
            title_size = style.font.size.pt
            title = p.text
            break
        size = style.font.size
        if size is not None:
            if size.pt > max_size:
                max_size = size.pt
                max_size_text = p.text

if title is not None:
    print(f"Title: size={title_size} text='{title}'")
else:
    print(f"max size title: size={title_size} text='{max_size_text}'")

In [None]:
import docx2txt

def extract_text_from_docx(docx_path: str) -> str:
    """
    Extracts and returns all text from the specified DOCX file.
    """
    try:
        # Extract text from the DOCX file
        text = docx2txt.process(docx_path)
        return text
    except Exception as e:
        print(f"An error occurred: {e}")
        return ""

In [None]:
extract_text_from_docx(docx_path)

In [None]:
pip install docx2txt

In [None]:

# Replace with your DOCX path
docx_path = "./Faseeh Curriculum Vitae.docx"

# Process the CV
cv_data = process_cv(docx_path)

# Print extracted sections
for section, content in cv_data.items():
    print(f"\n{'='*50}\n{section.upper()}:\n{'='*50}")
    print(content)

In [None]:
output_path = create_resume_docx(cv_data)
print(f"Resume generated at: {output_path}")

In [None]:
from docx import Document
from typing import Dict
import re

def process_cv(docx_path: str) -> Dict:
    """
    Process a DOCX CV and extract structured information.
    Args:
        docx_path (str): Path to the DOCX file
    Returns:
        Dict: Dictionary containing structured CV information
    """
    doc = Document(docx_path)
    cv_data = {}
    current_section = None
    section_content = []
    
    # Define section headers and their normalized names
    section_headers = {
        'EDUCATION': 'education',
        'SKILLS': 'skills',
        'PROFESSIONAL EXPERIENCE AND INTERNSHIPS': 'experience',
        'PROJECTS': 'projects',
        'RESEARCH PAPERS': 'research_papers',
        'ACHIEVEMENTS': 'achievements'
    }

    # Process contact information from first paragraph
    first_line = doc.paragraphs[0].text.strip()
    contact_parts = first_line.split(' | ')
    contact_info = {
        'email': contact_parts[0],
        'phone': contact_parts[1],
        'links': {
            'linkedin': 'LinkedIn',
            'github': 'Github',
            'portfolio': 'Portfolio',
            'kaggle': 'Kaggle',
            'tableau': 'Tableau'
        }
    }
    cv_data['contact_info'] = contact_info

    # Initialize all sections
    for section in section_headers.values():
        cv_data[section] = []

    # Track if we're in skills section to handle special formatting
    in_skills_section = False
    current_skill_category = None

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue

        # Check if this is a section header
        is_header = False
        for header, section_name in section_headers.items():
            if header in text.upper():
                current_section = section_name
                is_header = True
                in_skills_section = (section_name == 'skills')
                break

        if not is_header and text:
            if current_section == 'skills':
                # Handle skills section specially
                if ':' in text:
                    # This is a skill category
                    category, skills = text.split(':', 1)
                    cv_data['skills'].append({
                        'category': category.strip(),
                        'skills': [skill.strip() for skill in skills.split(',')]
                    })
            else:
                # Add content to current section
                if current_section:
                    cv_data[current_section].append(text)

    # Clean up research papers section
    if cv_data['research_papers']:
        papers = []
        current_paper = None
        for text in cv_data['research_papers']:
            if text.endswith(':'):
                if current_paper:
                    papers.append(current_paper)
                current_paper = {'title': text[:-1], 'description': ''}
            elif current_paper:
                current_paper['description'] = text
        if current_paper:
            papers.append(current_paper)
        cv_data['research_papers'] = papers

    # Clean up achievements section
    if cv_data['achievements']:
        # Remove the "ACHIEVEMENTS" header if it's in the list
        achievements = [ach for ach in cv_data['achievements'] if 'ACHIEVEMENTS' not in ach.upper()]
        cv_data['achievements'] = achievements

    return cv_data



In [None]:
# Example usage:
if __name__ == "__main__":
    docx_path = "./Faseeh Curriculum Vitae.docx"
    cv_data = process_cv(docx_path)
    
    # Print extracted data in a structured way
    for section, content in cv_data.items():
        print(f"\n{'='*50}\n{section.upper()}:\n{'='*50}")
        if isinstance(content, list):
            for item in content:
                if isinstance(item, dict):
                    print(f"\n{item}")
                else:
                    print(f"\n{item}")
        else:
            print(content)