In [1]:
# import necessary libraries
import imaplib
import email
from datetime import datetime, timedelta
import pandas as pd
import yaml
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def get_imap_connection(cred_file):
    # load credentials from YAML file
    with open(cred_file, 'r') as f:
        credentials = yaml.safe_load(f)

    # connect to Gmail's IMAP server
    imap_host = 'imap.gmail.com'
    imap_user = credentials['username']
    imap_pass = credentials['password']
    imap = imaplib.IMAP4_SSL(imap_host)

    # login to your account
    imap.login(imap_user, imap_pass)

    return imap


In [3]:
def get_date_range(start_date, end_date):
    start_date = datetime.strptime(start_date, '%d-%b-%Y').date()
    end_date = datetime.strptime(end_date, '%d-%b-%Y').date()
    end_date += timedelta(days=1)  # to include emails from the end date
    date_cutoff = start_date.strftime('%d-%b-%Y')
    return date_cutoff, end_date

In [4]:
def search_emails(imap, date_cutoff):
    # select the mailbox you want to scrape
    mailbox = 'INBOX'
    imap.select(mailbox)

    # search for all emails in the selected mailbox within the specified date range
    search_criteria = f'(SINCE "{date_cutoff}")'
    status, response = imap.search(None, search_criteria)

    return response[0].split()

In [5]:
def extract_email_details(imap, email_id):
    # fetch the email by its ID
    status, response = imap.fetch(email_id, '(RFC822)')

    # extract the email content from the response
    email_content = response[0][1]
    email_message = email.message_from_bytes(email_content)

    # extract relevant email details
    email_from = email_message['From']
    email_to = email_message['To']
    email_subject = email_message['Subject']
    email_importance = email_message['Importance']
    email_body = ''

    # if the email has a plain text or HTML body, extract it
    for part in email_message.walk():
        if part.get_content_type() == 'text/plain':
            email_body = part.get_payload(decode=True).decode('iso-8859-1')
        elif part.get_content_type() == 'text/html':
            email_body = part.get_payload(decode=True).decode('iso-8859-1')

    return email_from, email_to, email_subject, email_importance, email_body

In [6]:
def extract_keywords(text):
    # tokenize the text into words
    tokens = word_tokenize(text.lower())

    # remove stop words and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]

    # count the frequency of each word
    freq = nltk.FreqDist(tokens)

    # extract the most common words as keywords
    keywords = list(freq.keys())[:5]

    return keywords

In [7]:
def scrape_emails(start_date, end_date, cred_file):
    imap = get_imap_connection(cred_file)
    date_cutoff, end_date = get_date_range(start_date, end_date)
    email_ids = search_emails(imap, date_cutoff)

    # create an empty DataFrame to store the extracted email details
    emails_df = pd.DataFrame(columns=['From', 'To', 'Subject', 'Importance', 'Body', 'Thread_ID'])

    # iterate over the list of email IDs returned by the search query
    for email_id in email_ids:
        # extract email details
        email_from, email_to, email_subject, email_importance, email_body = extract_email_details(imap, email_id)

        # extract thread ID from subject (if present)
        thread_id = None
        if isinstance(email_subject, str):
            match = re.search('\[([\w-]+)\]', email_subject)
            if match:
                thread_id = match.group(1)

        # extract keywords from email body
        keywords = extract_keywords(email_body)

        # append the extracted email details to the DataFrame
        emails_df = emails_df.append({'From': email_from,
                                      'To': email_to,
                                      'Subject': email_subject,
                                      'Importance': email_importance,
                                      'Body': email_body,
                                      'Thread_ID': thread_id,
                                      'Keywords': keywords}, ignore_index=True)

    # close the mailbox and logout of your account
    imap.close()
    imap.logout()

    return emails_df

In [9]:
# set the start and end dates of the range you want to scrape
start_date = '01-Jan-2018'
end_date = '31-Dec-2022'

# specify the path to your YAML credentials file
cred_file = '/content/credentials.yaml'

# scrape emails within the specified date range and store the results in a DataFrame
emails_df = scrape_emails(start_date, end_date, cred_file)

# save the DataFrame as a CSV file
emails_df.to_csv('emails.csv', index=False)

# print the first few rows of the DataFrame to verify that the scraping was successful
print(emails_df.head())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,
  emails_df = emails_df.append({'From': email_from,

                                                From  \
0                        donotreply.sbiatm@sbi.co.in   
1                         Paytm <no-reply@paytm.com>   
2                        NSE Academy <nse@nse.co.in>   
3   Standard chartered<newsletters@backendmails.com>   
4  "Wishfin" <newsletter@update.esteemnewsemail.com>   

                             To  \
0    abbasingapurwala@gmail.com   
1    abbasingapurwala@gmail.com   
2    abbasingapurwala@gmail.com   
3  <abbasingapurwala@gmail.com>   
4    abbasingapurwala@gmail.com   

                                             Subject Importance  \
0  Transaction alert for your State Bank of India...       None   
1  Your Recharge of Idea Mobile 8750501371 for Rs...       None   
2  Invitation for seminar on  "Live Trading strat...       None   
3  Hello abbasingapurwala@gmail.com , Save_Up to ...       None   
4                          A loan for all your needs       None   

                                                B