## EMAIL PARSING

### For this project, I will be getting and parsing data from Gmail. Specifically, the Email Header.
### In order for this script to work perfectly, we will be using the IMAP functionality of Gmail together with Python's imaplib module. The following steps are necessary:
##### 1. Enable IMAP from Gmail settings
##### 2. Save changes
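##### 3. Allow less secure apps from Sign-in and Security (if your account no longer offers this setting, turn on 2-Step Verification and log in with a Google App Password instead)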

In [None]:
"""
    imaplib: is an Internet Message Access Protocol (IMAP) library
    email: is a python library that parses, handles and generates email messages.
    getpass: is a python library used to get password without echoing
    pandas: is a python library for data manipulation and analysis
    re: is a library that handles regular expression
    wordcloud: is used for visualization
    matplotlib: handles 2D plots
    pprint module helps pretty-print i.e fine-tune the output."""

# Import necessary Libraries
import imaplib 
import email
import getpass
import pandas as pd
import re
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
# Get username(Gmail account name) and password
# The username is read with input(), which echoes what is typed; the password
# is read with getpass.getpass(), which prompts without echoing it back.

username = input("Username: ")
password = getpass.getpass("Password: ")

In [None]:
# Open an SSL-wrapped IMAP connection to Gmail and authenticate with the
# credentials collected in the previous cell.

IMAP_HOST = 'imap.gmail.com'  # EMAIL SERVER

mail = imaplib.IMAP4_SSL(IMAP_HOST)
mail.login(user=username, password=password)

In [None]:
# Mailbox discovery and selection.
#
# mail.list() returns a tuple: the response type followed by its data
# (one entry per mailbox available on the account).
listing = mail.list()
response_type = listing[0]
mailboxes = listing[1]
pprint(mailboxes)

# Choose the mailbox to work on. INBOX is what select() uses by default;
# to search every message instead, use:
#     mail.select('"[Gmail]/All Mail"')
mail.select("INBOX")

In [None]:
# Collect the unique ids (UIDs) of the messages in the selected mailbox.
#
# Search criterion: "ALL" matches every message ("UNSEEN" would match only
# unread ones). The charset argument is None because no charset is specified.
result, numbers = mail.uid('search', None, "ALL")

# numbers[0] holds the UIDs as a single space-separated bytes string
uids = numbers[0].split()

# UIDs come back oldest first, so keep the last 100 and flip them to get the
# 100 newest messages, newest first
latest_mails_uids = uids[-100:][::-1]

# The UIDs are bytes; turn each one into text for the fetch command
decode_uids = [str(raw_uid, "utf-8") for raw_uid in latest_mails_uids]



In [None]:
# Fetch the headers of the selected messages by UID.
#
# A fetch can return the entire email body, or any combination of results
# such as email flags (seen/unseen) or Gmail-specific IDs.
#
# - To fetch only some header keys, e.g. date and subject, request
#   '(BODY[HEADER.FIELDS (DATE SUBJECT)])'.
# - BODY.PEEK[...] is used instead of BODY[...] so that messages that have
#   not been read yet are not opened (marked as read) by the fetch.
# - To download whole messages (RFC822) instead, use:
#       result, messages = mail.uid('fetch', ','.join(decode_uids), '(BODY[])')

uid_set = ','.join(decode_uids)
header_query = '(BODY.PEEK[HEADER.FIELDS (DATE FROM SUBJECT)])'

result, messages = mail.uid('fetch', uid_set, header_query)

In [None]:
# Turn the fetched header blocks into a table with one row per message
# (date, sender name, sender e-mail, subject) and save it as inbox_email.csv.

# Standard-library header helpers. Imported here so this cell only relies on
# names it brings into scope itself plus the notebook's import cell.
from email.errors import HeaderParseError
from email.header import decode_header, make_header
from email.utils import parseaddr, parsedate_to_datetime

# Placeholder stored in "Sender Email" when no address can be extracted.
NO_ADDRESS_LABEL = "Via Linkedin"

# The previous clean-up stripped the characters of "via LinkedIn" from sender
# names (presumably to drop that suffix); this removes the suffix itself
# without eating letters that belong to the actual name.
VIA_LINKEDIN_SUFFIX = re.compile(r"\s*via\s+LinkedIn\s*$", re.IGNORECASE)


def _decode_header_text(value):
    """Return a header value as readable text, or None when it is missing.

    RFC 2047 encoded words (``=?charset?B/Q?...?=``) are decoded. A value
    that cannot be decoded is returned as-is instead of being dropped.
    """
    if value is None:
        return None
    try:
        return str(make_header(decode_header(value)))
    except (HeaderParseError, LookupError, UnicodeError):
        return str(value)


def _split_sender(raw_from):
    """Split a From header into a ``(display name, e-mail address)`` pair.

    The name is an empty string when the header has none; the address falls
    back to NO_ADDRESS_LABEL when the header is missing or holds no address.
    """
    if raw_from is None:
        return "", NO_ADDRESS_LABEL
    # Split before decoding so characters inside an encoded display name
    # cannot be mistaken for address syntax.
    name, address = parseaddr(str(raw_from))
    name = _decode_header_text(name) or ""
    name = VIA_LINKEDIN_SUFFIX.sub("", name).strip()
    return name, address or NO_ADDRESS_LABEL


def _parse_date(raw_date):
    """Return the calendar date of a Date header, or None if unusable.

    The date is taken in the UTC offset written in the header itself.
    """
    if raw_date is None:
        return None
    try:
        return parsedate_to_datetime(str(raw_date)).date()
    except (TypeError, ValueError):
        # Raised for malformed dates (TypeError on older Python versions)
        return None


date_list = []
from_list = []
subject_list = []

for response in messages:
    # Only tuple items are parsed; their second element holds the header bytes
    if not isinstance(response, tuple):
        continue

    # Parse the bytes into a message object and pull out the header details
    msg = email.message_from_bytes(response[1])
    subject_list.append(_decode_header_text(msg.get('subject')))
    date_list.append(msg.get('date'))
    from_list.append(msg.get('from'))

print(len(date_list))
print(len(from_list))
print(len(subject_list))

# Capture the sender name and the sender email from each From header
senders = [_split_sender(raw_from) for raw_from in from_list]
sender_name = [name for name, _ in senders]
sender_mail = [address for _, address in senders]

# Keep only the calendar date of each message
# (to capture the time alone, use .time() on the parsed datetime instead)
parsed_dates = [_parse_date(raw_date) for raw_date in date_list]

df = pd.DataFrame(data={"Date": parsed_dates, "Sender Name": sender_name, "Sender Email": sender_mail, "Subject": subject_list})

df.to_csv('inbox_email.csv', index=False)
df.head(10)

In [None]:
#To access the statistical data of the dataframe use describe()
# Left as the cell's last expression so the notebook displays the summary.
df.describe()

In [None]:
# Plot the word cloud of the email subjects.

# Glue all subjects into one text, each preceded by a space; entries that are
# not strings (e.g. a missing subject) are left out.
subject_chunks = [" " + subject for subject in df["Subject"] if isinstance(subject, str)]
text = "".join(subject_chunks)

cloud_options = {
    "background_color": "white",
    "mask": None,
    "colormap": "nipy_spectral_r",
    "stopwords": STOPWORDS,
}
wordcloud = WordCloud(**cloud_options).generate(text)

# Show the generated image without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
#save the plotted wordcloud
# Writes the image generated in the previous cell to emailCloud.png.
wordcloud.to_file("emailCloud.png")

#### ...and voilà, we've been able to generate our data and plot the keywords in a wordcloud. However, there's no limit to what you can do with this data. It is all yours!! Your world to explore!!!

##### Thanks for your time. Stay Blessed