In [131]:
#Install libraries
!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client




In [212]:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from email.utils import parsedate_tz, mktime_tz
from datetime import datetime, timedelta
import base64
import os
import re
import csv


# OAuth scope requested for the Gmail API (the read-only Gmail scope).
# Passed to the credential-loading and consent-flow calls in authenticate_gmail().
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']


In [213]:
# Two ways to authenticate are provided in this cell.
# Option 1: OAuth flow backed by a cached token file.
def authenticate_gmail():
    """Build a Gmail API client, reusing cached credentials when possible.

    Credentials are loaded from 'token.json' if that file exists. When they
    are missing or not valid they are either refreshed (expired credentials
    that carry a refresh token) or obtained through the installed-app flow
    driven by 'credentials.json'. Newly obtained or refreshed credentials are
    written back to 'token.json'.

    Returns:
        The service object produced by build('gmail', 'v1', ...).
    """
    cached = None
    if os.path.exists('token.json'):
        # Reuse the credentials stored by a previous session
        cached = Credentials.from_authorized_user_file('token.json', SCOPES)

    # Fast path: cached credentials are still usable, nothing to persist
    if cached and cached.valid:
        return build('gmail', 'v1', credentials=cached)

    if cached and cached.expired and cached.refresh_token:
        # Expired but refreshable: renew in place
        cached.refresh(Request())
    else:
        # No usable credentials: run the interactive OAuth flow
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        cached = oauth_flow.run_local_server(port=0)

    # Persist the credentials for future sessions
    with open('token.json', 'w') as token_file:
        token_file.write(cached.to_json())

    return build('gmail', 'v1', credentials=cached)

# Option 2: authenticate with an already-issued access token
def authenticate_gmail_with_access_token(access_token):
    """Build a Gmail API client from a bare OAuth access token.

    Args:
        access_token: Token string passed as `token` to Credentials.

    Returns:
        The service object produced by build('gmail', 'v1', ...).
    """
    return build('gmail', 'v1', credentials=Credentials(token=access_token))


In [None]:

# Search Gmail for emails containing specific keywords
def search_emails(service, keywords_or1, keywords_or2, yearsago):
    """Return all message stubs matching two keyword groups within a time window.

    The query has the form ``(a OR b) AND (c OR d) AND after:YYYY/MM/DD``.
    A keyword group that is empty (or None) is left out of the query entirely
    instead of producing a dangling ``AND``.

    Args:
        service: Gmail API service object exposing users().messages().list().
        keywords_or1: First list of keywords, OR-ed together.
        keywords_or2: Second list of keywords, OR-ed together.
        yearsago: Size of the look-back window in years (365 days per year).

    Returns:
        A list with the 'messages' entries of every result page, in order.
    """
    # Cutoff date for the 'after:' filter; a year is approximated as 365 days
    cutoff = datetime.now() - timedelta(days=yearsago * 365)
    cutoff_str = cutoff.strftime('%Y/%m/%d')

    # One parenthesised OR-clause per non-empty keyword group
    clauses = []
    for group in (keywords_or1, keywords_or2):
        if group:
            clauses.append('(' + ' OR '.join(group) + ')')
    clauses.append(f'after:{cutoff_str}')

    # Joining only the clauses that exist keeps the query well-formed
    query = ' AND '.join(clauses)

    messages = []
    page_token = None

    while True:
        # Request a page of messages
        results = service.users().messages().list(
            userId='me',
            q=query,
            pageToken=page_token,
            maxResults=500  # Gmail API maxes at 500, adjust if needed
        ).execute()

        # Add the messages from this page to the results
        messages.extend(results.get('messages', []))

        # Check if there's another page
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    # Debug
    print(f"Query used for search: {query}")
    print(f"Page Token: {page_token}")
    print(f"Messages retrieved so far: {len(messages)}")

    return messages  # Return the accumulated messages


In [225]:
# Helpers for decoding Gmail message bodies
def _decode_body_data(data):
    """Decode a base64url-encoded body payload to text.

    Missing '=' padding is restored before decoding, and bytes that are not
    valid UTF-8 are replaced rather than raising, so one oddly encoded email
    cannot abort a whole run.
    """
    padded = data + '=' * (-len(data) % 4)
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')


def _collect_text_parts(parts):
    """Depth-first list of decoded text/plain and text/html parts."""
    chunks = []
    for part in parts or []:
        if part.get('mimeType') in ('text/plain', 'text/html'):
            # Parts without inline 'data' (e.g. only an attachmentId) are skipped
            data = part.get('body', {}).get('data')
            if data:
                chunks.append(_decode_body_data(data))
        # Multipart containers keep their children in a nested 'parts' list
        chunks.extend(_collect_text_parts(part.get('parts')))
    return chunks


# Get the subject, body, sender address, and date of an email
def get_email_details(service, message_id):
    """Fetch one message and return (subject, body, sender, date).

    Args:
        service: Gmail API service object exposing users().messages().get().
        message_id: ID of the message to fetch.

    Returns:
        A tuple ``(subject, body, sender, date)`` of strings. ``subject``,
        ``body`` and ``sender`` are stripped of surrounding whitespace.
        ``body`` is the concatenation of the payload's own body data followed
        by every text/plain and text/html part, including nested ones.
        ``date`` is the Date header formatted as MM/DD/YYYY in the local
        timezone, or "" when the header is missing or unparseable.
    """
    message = service.users().messages().get(userId='me', id=message_id, format='full').execute()
    payload = message.get('payload', {})
    headers = payload.get('headers', [])

    def header_value(name):
        # First header whose name matches case-insensitively, else ""
        return next((h['value'] for h in headers if h['name'].lower() == name), "")

    subject = header_value('subject')
    sender = header_value('from')

    # Extract the date
    date_str = header_value('date')
    date = ""
    if date_str:
        date_tuple = parsedate_tz(date_str)
        if date_tuple:
            timestamp = mktime_tz(date_tuple)
            date = datetime.fromtimestamp(timestamp).strftime('%m/%d/%Y')  # Convert to MM/DD/YYYY format

    # Extract the body: payload-level data first, then all text parts
    body_chunks = []
    root_data = payload.get('body', {}).get('data')
    if root_data:
        body_chunks.append(_decode_body_data(root_data))
    body_chunks.extend(_collect_text_parts(payload.get('parts')))
    body = "".join(body_chunks)

    return subject.strip(), body.strip(), sender.strip(), date



In [227]:
# Pull date-like substrings out of the email body
def extract_dates_from_body(body):
    """Return every date-like substring of `body`, in order of appearance.

    Three shapes are recognised: slash-separated numeric dates (1-2 digits,
    1-2 digits, 2-4 digits), dash-separated 4-2-2 digit dates, and
    "<1-2 digits> <word> <4 digits>".
    """
    date_pattern = r'\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{4}-\d{2}-\d{2}|\d{1,2} \w+ \d{4})\b'
    return [hit.group(0) for hit in re.finditer(date_pattern, body)]

def extract_countries_from_body(body):
    """Return the distinct country names mentioned in `body`.

    Matching is case-insensitive, and so is de-duplication: "France" and
    "france" count as one country. Each country is reported with the spelling
    of its first occurrence, and the result follows order of first appearance,
    so the output is deterministic between runs.

    Args:
        body: Text to scan.

    Returns:
        A list of matched country names without case-insensitive duplicates.
    """
    country_pattern = r'\b(?:Afghanistan|Albania|Algeria|Andorra|Angola|Argentina|Armenia|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bhutan|Bolivia|Botswana|Brazil|Brunei|Bulgaria|Burundi|Cambodia|Cameroon|Canada|Chad|Chile|China|Colombia|Comoros|Congo|Croatia|Cuba|Cyprus|Czech Republic|Denmark|Djibouti|Dominica|Ecuador|Egypt|El Salvador|Estonia|Eswatini|Ethiopia|Fiji|Finland|France|Gabon|Gambia|Georgia|Germany|Ghana|Greece|Grenada|Guatemala|Guinea|Guyana|Haiti|Honduras|Hungary|Iceland|India|Indonesia|Iran|Iraq|Ireland|Israel|Italy|Jamaica|Japan|Jordan|Kazakhstan|Kenya|Kiribati|Kuwait|Kyrgyzstan|Laos|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Mauritania|Mauritius|Mexico|Micronesia|Moldova|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Zealand|Nicaragua|Niger|Nigeria|Norway|Oman|Pakistan|Palau|Panama|Paraguay|Peru|Philippines|Poland|Portugal|Qatar|Romania|Russia|Rwanda|Samoa|San Marino|Saudi Arabia|Senegal|Serbia|Seychelles|Singapore|Slovakia|Slovenia|Somalia|South Africa|Spain|Sri Lanka|Sudan|Suriname|Sweden|Switzerland|Syria|Taiwan|Tajikistan|Tanzania|Thailand|Togo|Tonga|Tunisia|Turkey|Turkmenistan|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom|United States|Uruguay|Uzbekistan|Vanuatu|Vatican|Venezuela|Vietnam|Yemen|Zambia|Zimbabwe)\b'
    first_spelling = {}
    for match in re.findall(country_pattern, body, re.IGNORECASE):
        # Key on the lower-cased name so differently-cased repeats collapse;
        # setdefault keeps the spelling that appeared first.
        first_spelling.setdefault(match.lower(), match)
    return list(first_spelling.values())

def extract_airport_codes_from_body(body, valid_codes=None):
    """Return distinct 3-letter uppercase tokens that look like IATA codes.

    Any standalone run of exactly three uppercase ASCII letters matches, so
    ordinary capitalised words (e.g. "THE") are reported too unless
    `valid_codes` is supplied.

    Args:
        body: Text to scan.
        valid_codes: Optional iterable of accepted codes. When given, only
            candidates contained in it are returned. Defaults to None (no
            filtering).

    Returns:
        A list of unique codes in order of first appearance, so the output is
        deterministic between runs.
    """
    # Regex for 3-letter IATA airport codes (assumes they are all uppercase letters)
    airport_pattern = r'\b[A-Z]{3}\b'
    candidates = re.findall(airport_pattern, body)

    # dict.fromkeys de-duplicates while keeping first-seen order
    unique_codes = list(dict.fromkeys(candidates))

    # Optional filtering step to avoid false positives
    if valid_codes is not None:
        allowed = set(valid_codes)
        unique_codes = [code for code in unique_codes if code in allowed]
    return unique_codes


In [231]:
def main():
    """Search Gmail for travel-related emails and export matches to a CSV file.

    Authentication: if the environment variable GMAIL_ACCESS_TOKEN is set, its
    value is used as an OAuth access token; otherwise the OAuth flow in
    authenticate_gmail() is used. Messages whose subject contains one of
    `keywords_or1` and whose body contains at least one date are written to
    "email_output.csv" and echoed to stdout; other messages are reported as
    skipped.
    """
    # Never paste an access token into the notebook: it is saved with the file
    # and leaks wherever the notebook is shared. Supply it through the
    # environment instead. Any token that was ever stored in this notebook
    # should be treated as compromised and revoked.
    access_token = os.environ.get("GMAIL_ACCESS_TOKEN")  # EDIT HERE: set this env var if using the access-token method
    keywords_or1 = ["flight", "trip", "booking"]  # EDIT HERE: Replace with your keywords
    keywords_or2 = ["itinerary", "reservation", "confirmation", "confirmed","departure"] # EDIT HERE: Replace with your keywords
    yearsago = 5 # EDIT HERE time range as an integer (X years ago)

    # EDIT HERE to change authentication method
    if access_token:
        service = authenticate_gmail_with_access_token(access_token)
    else:
        service = authenticate_gmail()
    messages = search_emails(service, keywords_or1, keywords_or2, yearsago)

    if not messages:
        # Nothing to export: return before claiming that a file was written
        print("No messages found.")
        return

    print(f"Found {len(messages)} messages:\n")

    # Open a CSV file for writing
    with open("email_output.csv", "w", newline="", encoding="utf-8") as csvfile:
        # Define CSV writer and header
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["Email ID", "Subject", "Sender", "Date", "Dates in Body", "Countries in Body", "Airport Codes"])

        for msg in messages:
            msg_id = msg['id']
            subject, email_body, sender, date = get_email_details(service, msg_id)

            # Filter emails strictly based on `keywords_or1` in the subject
            if any(keyword.lower() in subject.lower() for keyword in keywords_or1):
                dates = extract_dates_from_body(email_body)
                countries = extract_countries_from_body(email_body)
                airport_codes = extract_airport_codes_from_body(email_body)

                if dates:
                    # Write data to CSV
                    csvwriter.writerow([
                        msg_id,
                        subject.strip(),
                        sender.strip(),
                        date.strip(),
                        ", ".join(dates),  # Dates as a single string
                        ", ".join(countries) if countries else "None",  # Countries as a single string
                        ", ".join(airport_codes) if airport_codes else "None"  # Airport codes as a single string
                    ])
                    print(f"Email ID: {msg_id}\nSubject: {subject}\nSender: {sender}\nDate: {date}\nDates in Body: {', '.join(dates)}\nCountries in Body: {', '.join(countries) if countries else 'None'}\n{'-'*50}\n")
                else:
                    print(f"Email ID: {msg_id} skipped as it doesn't contain dates in the body.\n")
            else:
                print(f"Email ID: {msg_id} skipped as it doesn't match 'keywords_or1' in subject.\n")

    print("Output saved to email_output.csv.")

if __name__ == '__main__':
    main()

Found 1114 messages:

Email ID: 193d0f9359df7341 skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 193d03cd32d95d0c skipped as it doesn't contain dates in the body.

Email ID: 193cb581fd550915 skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 193ca565dfff5267 skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 193bd9bc6407f642 skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 193b1d55521a2053 skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 193ac41169b924fb skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 1939bde4582e1c4b skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 1939a4d901533ed1 skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 193953ed44bf13b4 skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 193953b4864385bb skipped as it doesn't match 'keywords_or1' in subject.

Email ID: 1939311b3a7e43b9 skipped as it doesn't match 'keywords_or1' i