# Email Manager and Slicer

This code will access your preferred email provider through their IMAP interface, fetch, manipulate, and slice up a target email into desired parts, defaulting to individual paragraphs.

**Note:** 
- This code is object-based, meaning there will be one parent object containing all the functionality.
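- Credentials must be provided by the user. For accounts protected by two-factor authentication (2FA), an app-specific password (or an equivalent provider-issued credential) is required instead of the regular account password.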


In [128]:
import imaplib
import email
import pandas as pd

# gmail imap server: imap.gmail.com:993
# -> imap = internet message access protocol
# gmail pop3 server: pop.gmail.com:995
# gmail smtp server: smtp.gmail.com:465 (for SSL) or smtp.gmail.com:587 (for TLS)
# imap password: use a Google app password, supplied through the .env file (never hard-code it in the source)

class eSlice:
    """Fetch emails over IMAP (SSL) and slice their text into paragraphs.

    The object keeps one pandas DataFrame (``indexesDF``) with one row per
    indexed email and the columns ``BID`` (the byte-string ID returned by the
    IMAP ``SEARCH`` command), ``SUBJECT`` and ``EMAIL`` (filled in once the
    message has actually been fetched).

    Typical use::

        with eSlice() as box:            # credentials come from ``.env``
            box._MAIL_INDEX()
            parts = box.searchExtractText(b"42", 100)
            if parts:
                box.output(box.mailSlice(parts[0]))

    Args:
        emailAddr: login address; when falsy, ``EMAIL`` from ``.env`` is used.
        passwd: login password; when falsy, ``PASS`` from ``.env`` is used.
        server: IMAP host name.
        port: IMAP SSL port.
    """

    # column layout of the index DataFrame
    _INDEX_COLUMNS = ["BID", "SUBJECT", "EMAIL"]

    def __init__(self, emailAddr=None, passwd=None, server="imap.gmail.com", port=993):
        # setup phase
        self._EMAIL_CREDENTIALS = {}
        self._POPULATE_CREDENTIALS(emailAddr, passwd)

        # Emails Data Frame: where the index (and fetched messages) are stored
        self.indexesDF = pd.DataFrame(columns=self._INDEX_COLUMNS)
        self.Mail_Indexes = self.indexesDF

        # -> connection phase
        self.mailServer = None
        self.mailServer = self._CONNECT(server, port)

        # -> mail phase
        try:
            self._LOGIN()
        except Exception:
            # a failed login must not leave the socket open
            self.mailServer.shutdown()
            self.mailServer = None
            raise

    def __enter__(self):
        """Allow ``with eSlice(...) as box:`` so the connection is always closed."""
        return self

    def __exit__(self, excType, excValue, traceback):
        """Close the connection when leaving the ``with`` block."""
        self.end()

    def _POPULATE_CREDENTIALS(self, emailAddr, passwd):
        """Fill ``_EMAIL_CREDENTIALS`` from the arguments, falling back to ``.env``.

        The ``.env`` file is only read when at least one credential is missing,
        so explicitly supplied credentials work without any file on disk.

        Raises:
            FileNotFoundError: a fallback is needed but ``.env`` does not exist.
            KeyError: a fallback is needed but ``.env`` lacks ``EMAIL``/``PASS``.
        """
        self._EMAIL_CREDENTIALS_DOT_ENV = {}
        if not (emailAddr and passwd):
            # get possible login credentials from the .env file
            self._EMAIL_CREDENTIALS_DOT_ENV = self._GET_DOT_ENV()

        if emailAddr:
            self._EMAIL_CREDENTIALS["emailAddr"] = emailAddr
        else:
            print("loading environment default email")
            self._EMAIL_CREDENTIALS["emailAddr"] = self._ENV_VALUE("EMAIL")

        if passwd:
            self._EMAIL_CREDENTIALS["passwd"] = passwd
        else:
            print("loading environment default password")
            self._EMAIL_CREDENTIALS["passwd"] = self._ENV_VALUE("PASS")

    def _ENV_VALUE(self, key):
        """Return ``key`` from the loaded ``.env`` values or raise a clear KeyError."""
        try:
            return self._EMAIL_CREDENTIALS_DOT_ENV[key]
        except KeyError:
            raise KeyError(f"{key} is missing from the .env file") from None

    def _GET_DOT_ENV(self, path=".env"):
        """Parse ``KEY=VALUE`` lines of a dotenv file into a dict.

        Blank lines, ``#`` comments and lines without ``=`` are skipped.
        Whitespace around keys/values and one pair of matching surrounding
        quotes around a value are removed.

        Args:
            path: file to read; defaults to ``.env`` in the working directory.

        Returns:
            dict mapping each key to its string value.
        """
        credentials = {}
        with open(path, encoding="utf-8") as f:
            for rawLine in f:
                line = rawLine.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, _, value = line.partition("=")
                value = value.strip()
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]
                credentials[key.strip()] = value
        return credentials

    def _POPULATE_MAIL(self):
        """Fetch every indexed email and store it in the ``EMAIL`` column."""
        self._FETCH_ROWS(self.indexesDF.index)

    def _APPEND_DF(self, DF: pd.DataFrame, value: list) -> None:
        """Append ``value`` as a new last row of ``DF`` (in place).

        The new row label is derived from ``DF`` itself, so the method also
        works for frames other than ``self.indexesDF``.
        """
        DF.loc[len(DF.index)] = value

    def _CONNECT(self, server, port):
        """Open an SSL-wrapped IMAP connection and return it.

        Raises:
            TypeError: ``server`` is not a string or ``port`` is not integer-like.
            ValueError: ``server`` is empty.
        """
        if not isinstance(server, str):
            raise TypeError(f"server must be a host name string, got {server!r}")
        if not server.strip():
            raise ValueError("server must not be empty")
        try:
            port = int(port)
        except (TypeError, ValueError) as err:
            raise TypeError(f"port must be an integer, got {port!r}") from err
        return imaplib.IMAP4_SSL(server, port)

    def _LOGIN(self):
        """Authenticate on the open connection with the stored credentials.

        Does nothing when there is no connection or no credentials.
        ``imaplib.IMAP4.error`` from a rejected login propagates to the caller.
        """
        if self.mailServer is None or not self._EMAIL_CREDENTIALS:
            return
        self.mailServer.login(
            self._EMAIL_CREDENTIALS["emailAddr"],
            self._EMAIL_CREDENTIALS["passwd"],
        )
        # only report success once the server has accepted the login
        print("connection successful")

    def _MAIL_INDEX(self, inbox: str = "Inbox", lastN: int = 200):
        """Build the index of all emails in a server mailbox.

        Example mailboxes include: Inbox, Drafts, Spam, etc.
        The index is rebuilt from scratch on every call, so calling this
        method twice does not duplicate rows.

        Args:
            inbox: the name of the mailbox to be indexed. Defaults to "Inbox".
            lastN: currently unused; kept for interface compatibility.

        Returns:
            The index DataFrame (also available as ``self.indexesDF``).

        Raises:
            imaplib.IMAP4.error: the mailbox cannot be selected or searched.
        """
        if self.mailServer is None:
            return self.indexesDF

        status, _ = self.mailServer.select(inbox)
        if status != "OK":
            raise imaplib.IMAP4.error(f"could not select mailbox {inbox!r}")

        status, mailIndex = self.mailServer.search(None, "ALL")
        if status != "OK":
            raise imaplib.IMAP4.error(f"could not search mailbox {inbox!r}")

        # -> mailIndex holds one space-separated byte string of IDs, this
        # -> means we have to split it into individual byte index values
        ids = mailIndex[0].split() if mailIndex and mailIndex[0] else []

        # build the frame in one go; object dtype lets cells hold messages
        self.indexesDF = pd.DataFrame(
            [[index, None, None] for index in ids],
            columns=self._INDEX_COLUMNS,
        ).astype(object)
        self.Mail_Indexes = self.indexesDF

        return self.indexesDF

    def _FETCH_ROWS(self, labels):
        """Fetch the emails of the given index labels and cache them.

        Each fetched message is written back into ``indexesDF`` (columns
        ``SUBJECT`` and ``EMAIL``) through ``.at``, because rows produced by
        ``iterrows()`` are copies and assigning to them is not reliable.

        Returns:
            list of ``[BID, message]`` pairs, in the order of ``labels``;
            emails the server could not deliver are skipped.
        """
        emails = []
        for label in labels:
            mailId = self.indexesDF.at[label, "BID"]
            status, data = self.mailServer.fetch(mailId, "(RFC822)")
            # a missing message comes back as [None] instead of a tuple
            if status != "OK" or not data or not isinstance(data[0], tuple):
                print(f"could not fetch email {mailId!r}")
                continue
            message = email.message_from_bytes(data[0][1])
            self.indexesDF.at[label, "SUBJECT"] = message.get("Subject")
            self.indexesDF.at[label, "EMAIL"] = message
            emails.append([mailId, message])
        return emails

    @staticmethod
    def _NORMALIZE_ID(ID):
        """Convert an int/str email ID to the byte string form stored in ``BID``."""
        if isinstance(ID, int) and not isinstance(ID, bool):
            return str(ID).encode("ascii")
        if isinstance(ID, str):
            return ID.strip().encode("utf-8")
        return ID

    @staticmethod
    def _PLAIN_TEXT_PARTS(message):
        """Return all ``text/plain`` parts of a message, in walk order."""
        return [
            part
            for part in message.walk()
            if part.get_content_type() == "text/plain"
        ]

    def getEmailsFromLatest(self, lastN: int = 10):
        """Fetch the ``lastN`` most recent indexed emails, newest first.

        Returns:
            list of ``[BID, message]`` pairs; empty when ``lastN`` <= 0.
        """
        count = max(int(lastN), 0)
        return self._FETCH_ROWS(self.indexesDF.index[::-1][:count])

    def getEmailRange(self, fromVal: int, toVal: int):
        """Fetch the indexed emails at 1-based positions ``fromVal``..``toVal``.

        Returns:
            list of ``[BID, message]`` pairs in index order.
        """
        # clamp so that fromVal <= 0 cannot wrap around to the end of the index
        start = max(int(fromVal) - 1, 0)
        return self._FETCH_ROWS(self.indexesDF.index[start:toVal])

    def emailById(self, ID, latestN: int = 0):
        """Fetch a single email by its ID.

        The ID is looked up locally in the index first, so only the matching
        message is downloaded.

        Args:
            ID: the email's ``BID`` (bytes); an int or str is converted.
            latestN: restrict the lookup to the latest N indexed emails;
                0 (the default) or a negative value searches the whole index.

        Returns:
            ``[ID, message]`` or ``None`` (after printing "Invalid ID") when
            the ID is not in the searched window or cannot be fetched.
        """
        target = self._NORMALIZE_ID(ID)
        labels = self.indexesDF.index[::-1]
        if latestN > 0:
            labels = labels[:latestN]

        for label in labels:
            if self.indexesDF.at[label, "BID"] == target:
                fetched = self._FETCH_ROWS([label])
                if fetched:
                    return [ID, fetched[0][1]]
                break

        print("Invalid ID")
        return None

    def searchExtractText(self, ID=b'1', searchRange=100):
        """Find an email by ID and return its ``text/plain`` parts.

        Returns:
            list of message parts, or ``None`` when the ID is unknown or the
            email has no plain-text part.
        """
        found = self.emailById(ID, searchRange)
        if found is None:
            return None
        return self.extractText(found)

    def extractText(self, _email):
        """Return the ``text/plain`` parts of an email.

        Args:
            _email: an ``[ID, message]`` pair as returned by the fetch
                methods, or a bare message object.

        Returns:
            list of message parts, or ``None`` (after printing a note) when
            there is no plain-text part.
        """
        message = _email[1] if isinstance(_email, (list, tuple)) else _email
        parts = self._PLAIN_TEXT_PARTS(message)
        if parts:
            return parts
        print("no plain text parts")
        return None

    def extractMultiText(self, lastN: int = 10):
        """Collect the ``text/plain`` parts of the latest ``lastN`` emails.

        Returns:
            dict mapping each email ID to the list of its plain-text parts;
            emails without a plain-text part are left out.
        """
        messages = {}
        for mailId, message in self.getEmailsFromLatest(lastN):
            parts = self._PLAIN_TEXT_PARTS(message)
            if parts:
                messages.setdefault(mailId, []).extend(parts)
        return messages

    def mailSlice(self, eMail, separator: str = "\n\n", decode: bool = False):
        """Split an email part into chunks (paragraphs by default).

        Args:
            eMail: a message part, or an already extracted string.
            separator: the text the content is split on.
            decode: when False (default) the part is serialised with
                ``as_string()``, i.e. including its headers and still in its
                transfer encoding. When True the payload is decoded
                (base64 / quoted-printable and charset), line endings are
                normalised to "\\n" and headers are left out.

        Returns:
            list of strings.
        """
        if isinstance(eMail, str):
            text = eMail
        elif decode:
            payload = eMail.get_payload(decode=True) or b""
            charset = eMail.get_content_charset() or "utf-8"
            try:
                text = payload.decode(charset, errors="replace")
            except LookupError:
                # unknown charset label in the message
                text = payload.decode("utf-8", errors="replace")
            text = text.replace("\r\n", "\n").replace("\r", "\n")
        else:
            text = eMail.as_string()
        return text.split(separator)

    def output(self, pars, path: str = 'output.txt'):
        """Write the sliced email to a text file, one delimited chunk each.

        Args:
            pars: list of strings, e.g. the result of ``mailSlice``.
            path: target file; it is overwritten. Defaults to ``output.txt``.
        """
        with open(path, 'w', encoding="utf-8") as f:
            f.write("\n___________________\n".join(pars))

    # -> multi output is possible, but not used here

    def end(self):
        """Close the selected mailbox (if any) and log out of the server."""
        if self.mailServer is None:
            return
        try:
            self.mailServer.close()
        except imaplib.IMAP4.error:
            # close() is only valid while a mailbox is selected
            pass
        finally:
            try:
                self.mailServer.logout()
            finally:
                self.mailServer = None
        

The code above separates private helpers from the public interface. There are two types of methods:
1. internal methods
2. external methods

## Internal Methods:
These methods are used by the object itself to abstract away internally repeated code.

In the Above code, these are:
* _POPULATE_CREDENTIALS
    - this is used to check whether the email address and password have been provided by the user; if not, <br> it falls back to the values stored in the local **.env** file
* _GET_DOT_ENV
    - this is just a short script to handle the retrieval and conversion of data from the **.env** file (In this case, the email and passwords)
* _CONNECT
    - this is one of the hallmark functions. Its purpose is to connect to a provided IMAP (Internet Message Access Protocol) service.
    - These include but are not limited to:
        + server interfacing
        + server querying and searching
        + indexing and data mapping
    - In this instance we use it to form a connection object for retrieving emails from the server
* _APPEND_DF
    - this function replaces a deprecated functionality of pandas library where we can append data to the end of the DataFrame.
    - it uses a non-iterative, non-duplicating method, so it is relatively fast compared to the deprecated version
    - we use it here to manipulate and build on email Data Frames (Data Frames with email objects)
* _LOGIN
    - this function handles the authentication step of the login process. The connection itself is encrypted, because `_CONNECT` opens it with `imaplib.IMAP4_SSL`; due to time limitations, <br> no further hardening of the credential handling (for example OAuth2) is implemented
    - is in some ways an extension of the _POPULATE_CREDENTIALS Function
* _MAIL_INDEX
    - this function indexes the emails of the selected inbox and stores one row per email in the Data Frame. At indexing time only the Byte ID (**BID**) is recorded; the **SUBJECT** and **EMAIL** columns are left empty.
    - This function is the primary interface between the connection created by the _CONNECT function and our data structure, the pandas Data Frame

## External Methods:
These are used to give functionality to the user associated with the object.
These Functions are:
* getEmailsFromLatest():
    - This method access the interface connection and retrieves the last **N** Number of emails from the server
    - in order to reduce run times, this is also when the contents of said emails is loaded into the local Data Frame **(obj.indexesDF)**.
    - it returns an array object, with arrays of ID and value pairs: **[ID, EMAIL]**
* getEmailRange():
    - this Does a similar task as the above **getEmailsFromLatest()** function, but instead of retrieving the Last N emails, it retrieves the emails from a range **N** to **M**
    - The output and functionality after retrieval remain the same
* emailById():
    - this is similar to the above two, but instead of retrieving a list of values, it retrieves only a single email. For this to work, the ID of said email has to be determined beforehand through one of the above two methods, or by searching the Data Frame object. Once the ID is known, just pass it as a parameter and run the function with a range of values to search for the email.
    - this function exists so that, in case we can't find said email via ID in the DataFrame, we retrieve and search a broader number of emails and localize a single email for processing
    - the output, instead of being a list of lists, it is a single list with the ID and EMAIL: **[ID, EMAIL]**
    **NOTE:** This method uses the **getEmailsFromLatest()** function to broaden and narrow the search
* extractText():
    - This function is used to extract Text from an email parameter, stripping away all the Header Information and all the other non essential parts for this project, like images HTML and other noise code.
    - Returns sections of text in an array from the parts of the email that are text only
* searchExtractText():
    - This function works like the above, but takes in an email ID instead of an Email object. this allows for a combined functionality of searching, converting and extracting from email into a list of text sections
* extractMultiText():
    - this function is like the above two methods, but differs that it operates on multiple emails at once. extracting their text sections (parts) all at once and storing it into a new list of lists
* mailSlice():
    - this is the final step in converting emails into paragraph units (the separator can be changed). It takes a single text part produced by one of the first two methods above **(extractText() and searchExtractText())** and splits it into separate paragraphs
* output():
    - this is an added step to make output more user friendly by writing the output to a line separated text file.
* end():
    - this function is for closing the connection to the mail server completing the use of the object as well

# Notes:
- Several things to note are that this code is far from comprehensive nor is it algorithmically the best solution. But Keeping this code relatively short as well as somewhat simple in terms of understanding, requires use of relatively brute force code, relying more on processing and memory and less on logic. 
- some of the more obvious possible changes have been indicated with comments starting with "->"

In [None]:
# emObj = eSlice("please_add_email_here", "please add password here")  # optionally add a host and port value here as well
# Or add the email and password to the .env file and use the argument-free constructor below instead
emObj = eSlice()
try:
    indexedObj = emObj._MAIL_INDEX()
    emObj.getEmailsFromLatest(100)  # will give you the last 100 emails and their IDs
    # Replace the placeholder with the byte-string ID of one of the emails listed by the step above.
    data = emObj.searchExtractText(b'please add the binary ID (int value) of the email from the above step here', 100)

    # searchExtractText can return None (e.g. no plain-text part), so guard
    # before indexing into the result.
    if data:
        emObj.output(emObj.mailSlice(data[0]))
    else:
        print("nothing to slice: the email was not found or has no plain text part")
finally:
    # always release the server connection, even when a step above fails
    emObj.end()

The above code contains some use examples for the primary code

The Below sections explore some additional use cases and separations possible in email files

<h3 style="color:red">FOLLOWING CODE IS NON FUNCTIONAL RIGHT NOW AND IS HERE FOR DEMO ONLY</h3>

In [None]:
# ! THIS CODE IS NON FUNCTIONAL RIGHT NOW AND IS HERE FOR DEMO ONLY
# NOTE(review): `messages` is never defined in the visible code, so this cell
# raises NameError as written. Presumably it is meant to be a list of parsed
# email message objects (e.g. the second element of each pair returned by
# getEmailsFromLatest) -- confirm before enabling this cell.
from pprint import pprint as pp  # NOTE(review): `pp` is imported but not used below
# Print selected header fields of the third message in `messages`.
print(messages[2].get("From"))
print(messages[2].get("To"))
print(messages[2].get("Bcc"))
print(messages[2].get("Date"))
print(messages[2].get("Subject"))

# Disabled sketch: walk the parts of the first message and print the
# plain-text ones.
# for part in messages[0].walk():
#     print
#     if part.get_content_type() == "text/plain":
#         print(part.as_string())