# Extracting text from email messages

In [1]:
import os
import extract_msg

def get_unique_filename(directory, filename):
    """
    Returns a unique filename within the specified directory.
    If filename exists, appends a number to the base name.
    """
    base, ext = os.path.splitext(filename)
    candidate = filename
    counter = 1
    while os.path.exists(os.path.join(directory, candidate)):
        candidate = f"{base}_{counter}{ext}"
        counter += 1
    return candidate

def convert_msg_to_txt_with_attachments(input_path, output_path):
    """
    Reads a .msg file from input_path, writes the email content to output_path as a text file,
    and extracts any attachments to an "attachments" folder in the same directory as output_path.
    
    If an attachment file already exists, a number is appended to its filename.
    
    Parameters:
      input_path (str): The file path to the .msg file.
      output_path (str): The file path where the output .txt file will be saved.
    """
    try:
        # Open the .msg file
        msg = extract_msg.Message(input_path)
        
        # Extract key details with defaults if missing
        subject = msg.subject or "No Subject"
        sender = msg.sender or "Unknown Sender"
        recipients = msg.to or "Unknown Recipients"
        body = msg.body or "No Content"
        
        # Format the output text
        output_text = (
            f"Subject: {subject}\n"
            f"From: {sender}\n"
            f"To: {recipients}\n\n"
            "Body:\n"
            f"{body}\n"
        )
        
        # Write the formatted text to the output file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(output_text)
        
        print(f"Converted: {input_path} -> {output_path}")
        
        # Extract attachments if available
        attachments = msg.attachments
        if attachments:
            # Define the attachments directory (a folder named "attachments" in the same directory as output_path)
            base_dir = os.path.dirname(output_path)
            attachments_dir = os.path.join(base_dir, "attachments")
            os.makedirs(attachments_dir, exist_ok=True)
            
            for attachment in attachments:
                # Use longFilename if available, otherwise shortFilename
                att_filename = attachment.longFilename or attachment.shortFilename
                # Get a unique filename in the attachments directory
                unique_filename = get_unique_filename(attachments_dir, att_filename)
                att_path = os.path.join(attachments_dir, unique_filename)
                with open(att_path, 'wb') as att_file:
                    att_file.write(attachment.data)
                print(f"Attachment saved: {att_path}")
        else:
            print("No attachments found.")
    
    except Exception as e:
        print(f"Error converting {input_path}: {e}")

def convert_all_msg_in_directory(directory):
    """
    Converts all .msg files in the specified directory to .txt files,
    while extracting attachments to a shared "attachments" folder in that directory.
    
    Parameters:
      directory (str): The directory containing .msg files.
    """
    for filename in os.listdir(directory):
        if filename.lower().endswith('.msg'):
            input_path = os.path.join(directory, filename)
            output_filename = os.path.splitext(filename)[0] + ".txt"
            output_path = os.path.join(directory, output_filename)
            convert_msg_to_txt_with_attachments(input_path, output_path)

In [2]:
msg_directory = "..\data"  
convert_all_msg_in_directory(msg_directory)

Converted: ..\data\Asuntos a discutir con Jamill Muriente.msg -> ..\data\Asuntos a discutir con Jamill Muriente.txt
Attachment saved: ..\data\attachments\image001_54.png
Attachment saved: ..\data\attachments\FW RENTA PROYECTO T-0467-1-70-00.msg_2.zip
Attachment saved: ..\data\attachments\RE RENTA PROYECTO T-0467-1-70-00.msg_2.zip
Attachment saved: ..\data\attachments\Carta contrato.msg_1.zip
Attachment saved: ..\data\attachments\Febrero 2021.pdf_1.zip
Converted: ..\data\Contrato arrendamiento.msg -> ..\data\Contrato arrendamiento.txt
Attachment saved: ..\data\attachments\image002_55.png
Attachment saved: ..\data\attachments\image006_44.png
Attachment saved: ..\data\attachments\image007_38.png
Attachment saved: ..\data\attachments\image008_26.png
Attachment saved: ..\data\attachments\image009_42.png
Converted: ..\data\Contrato con PRIDCO_ edificio Lote 5.msg -> ..\data\Contrato con PRIDCO_ edificio Lote 5.txt
Error converting ..\data\Contrato con PRIDCO_ edificio Lote 5.msg: expected st

In [3]:
# now take all the .txt files in the directory and combine them into one big file.

def combine_txt_files(directory, output_file):
    """
    Combines all .txt files in the specified directory into a single output file.
    
    Parameters:
      directory (str): The directory containing .txt files.
      output_file (str): The file path where the combined content will be saved.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for filename in os.listdir(directory):
            if filename.lower().endswith('.txt'):
                input_path = os.path.join(directory, filename)
                with open(input_path, 'r', encoding='utf-8') as infile:
                    outfile.write(infile.read())
                    outfile.write("\n")  # Add a newline between files
                print(f"Combined: {input_path} -> {output_file}")

msg_directory = "..\data"  
combine_txt_files(msg_directory, os.path.join(msg_directory, "combined_texts.txt"))

Combined: ..\data\Asuntos a discutir con Jamill Muriente.txt -> ..\data\combined_texts.txt
Combined: ..\data\combined_texts.txt -> ..\data\combined_texts.txt
Combined: ..\data\Contrato arrendamiento.txt -> ..\data\combined_texts.txt
Combined: ..\data\Contrato con PRIDCO_ edificio Lote 5.txt -> ..\data\combined_texts.txt
Combined: ..\data\Cubierta de seguros.txt -> ..\data\combined_texts.txt
Combined: ..\data\CUENTA F7382 Gasco - Incentivo__Deuda.txt -> ..\data\combined_texts.txt
Combined: ..\data\dan lee y aprueba_ _)__le voy a copiar a todos y a Emil.txt -> ..\data\combined_texts.txt
Combined: ..\data\EMAILS SRA_ CELESTE.txt -> ..\data\combined_texts.txt
Combined: ..\data\Fwd_ Gasco Industrial- Necesidad de espacio de almacenaje y otros asuntos sobre reparaciones.txt -> ..\data\combined_texts.txt
Combined: ..\data\Fwd_ RENTA PROYECTO T-0467-1-70-00.txt -> ..\data\combined_texts.txt
Combined: ..\data\FW_ CUENTA F7382 Gasco - Incentivo__Deuda (52).txt -> ..\data\combined_texts.txt
Combi