## Gmail Job Scanner Web App for Mechanical Fitter Roles

This Jupyter notebook implements a full pipeline that:
- 1. Authenticates to Gmail.
- 2. Scans recent emails.
- 3. Uses a GPT model (via the OpenAI API) to determine if an email contains a job offer for a mechanical fitter.
- 4. Extracts structured data (worksite, pay, dates, etc.).
- 5. Saves valid job offers to a list.
- 6. Adds each job offer to Google Calendar.

## Step 1: Install Required Packages

In [None]:
!pip install --quiet --upgrade google-api-python-client google-auth google-auth-oauthlib google-auth-httplib2 openai requests

## Step 2: Authenticate to Gmail and Google Calendar

In [None]:
import os
import base64
import email
from datetime import datetime, timedelta
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Define the scopes
SCOPES = [
    'https://www.googleapis.com/auth/gmail.readonly',
    'https://www.googleapis.com/auth/calendar'
]

# Authenticate and create service clients
def authenticate_google_services(token_file='token.json',cred_file='credentials.json',scopes=SCOPES):
    """Return authorised ``(gmail, calendar)`` API clients.

    Cached credentials are loaded from ``token_file`` when that file exists.
    If they are missing or not valid they are either refreshed (expired, with
    a refresh token) or obtained through the local-server OAuth flow driven by
    ``cred_file``; in both cases the resulting credentials are written back to
    ``token_file``.

    Args:
        token_file: Path of the cached user-token JSON file.
        cred_file: Path of the OAuth client-secrets JSON file.
        scopes: OAuth scopes requested for the credentials.

    Returns:
        A ``(gmail, calendar)`` tuple of API service clients.
    """
    credentials = (
        Credentials.from_authorized_user_file(token_file, scopes)
        if os.path.exists(token_file)
        else None
    )

    needs_login = not credentials or not credentials.valid
    if needs_login:
        can_refresh = credentials and credentials.expired and credentials.refresh_token
        if can_refresh:
            credentials.refresh(Request())
        else:
            oauth_flow = InstalledAppFlow.from_client_secrets_file(cred_file, scopes)
            credentials = oauth_flow.run_local_server(port=0)
        # Persist whatever was just obtained so the next run can reuse it.
        with open(token_file, 'w') as token_handle:
            token_handle.write(credentials.to_json())

    gmail_client = build('gmail', 'v1', credentials=credentials)
    calendar_client = build('calendar', 'v3', credentials=credentials)
    return gmail_client, calendar_client

# Build the Gmail and Calendar clients; later cells refer to these two names.
gmail_service, calendar_service = authenticate_google_services()

## Step 3: Fetch Recent Emails
Docs: 

https://developers.google.com/workspace/gmail/api/guides/list-messages

https://developers.google.com/workspace/gmail/api/reference/rest/v1/users.messages/list

In [None]:
import base64
from bs4 import BeautifulSoup

def _decode_part_data(data):
    """Decode a base64url-encoded body payload to text without raising on bad bytes."""
    # Restore any '=' padding that was stripped; the decoder rejects unpadded input.
    padded = data + '=' * (-len(data) % 4)
    # Not every mail is UTF-8; substitute undecodable bytes instead of failing.
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')


def extract_body(part):
    """Return the first readable text found in a Gmail message part.

    Multipart containers are searched depth-first in order and the first
    sub-part that yields non-empty text wins. A leaf part is used only when it
    is ``text/plain`` (returned as-is) or ``text/html`` (tags stripped with
    BeautifulSoup) and carries inline body data.

    Args:
        part: A message payload or sub-part dict from the Gmail API.

    Returns:
        The decoded text, or ``''`` when the part holds no usable text.
    """
    # If the part has its own parts, search them
    if 'parts' in part:
        for subpart in part['parts']:
            text = extract_body(subpart)
            if text:
                return text
        return ''

    data = (part.get('body') or {}).get('data')
    if not data:
        return ''

    mime_type = part.get('mimeType')
    if mime_type == 'text/plain':
        return _decode_part_data(data)
    if mime_type == 'text/html':
        # Fall back to HTML and strip the tags
        return BeautifulSoup(_decode_part_data(data), 'html.parser').get_text()
    return ''

In [None]:
from datetime import datetime, timedelta
import time
import pytz

def fetch_recent_emails(gmail_service, time_delta_hours=1000, max_results=1000):
    """Fetch recent, keyword-matching emails and return them as plain dicts.

    Args:
        gmail_service: Gmail API client, e.g. from ``build('gmail', 'v1', ...)``.
        time_delta_hours: How far back to search, in hours.
        max_results: Upper bound on the number of messages returned.

    Returns:
        A list of dicts with the keys ``subject``, ``sender``, ``body``,
        ``thread_id`` and ``received_datetime`` (a string in Perth time).
    """
    # Documented per-page maximum of users.messages.list; anything larger has
    # to be collected page by page.
    page_size_limit = 500

    # time.time() is already seconds since the epoch. Do not derive the cutoff
    # from a naive UTC datetime via time.mktime(): mktime() assumes local time
    # and would shift the cutoff by the machine's UTC offset.
    after_timestamp = int(time.time() - time_delta_hours * 3600)

    print("time_delta_hours:",time_delta_hours)
    print("After timestamp:", after_timestamp)
    print("UTC time:", datetime.fromtimestamp(after_timestamp, tz=pytz.utc).replace(tzinfo=None))

    # Filter the emails being searched. Doing this general filter is much more efficient than a GPT
    query = (
        f"after:{after_timestamp} "
        + '(job OR shutdown OR shutdowns OR fitter OR fitters OR fifo OR shut OR shuts)'
    )

    # Collect message references, following nextPageToken until max_results
    # is reached or Gmail reports no further pages.
    messages = []
    page_token = None
    while len(messages) < max_results:
        list_kwargs = {
            'userId': 'me',
            'maxResults': min(max_results - len(messages), page_size_limit),
            'q': query,
        }
        if page_token:
            list_kwargs['pageToken'] = page_token
        results = gmail_service.users().messages().list(**list_kwargs).execute()
        messages.extend(results.get('messages', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    messages = messages[:max_results]

    perth_tz = pytz.timezone('Australia/Perth')
    emails = []

    for msg in messages:
        msg_data = gmail_service.users().messages().get(userId='me', id=msg['id'], format='full').execute()
        payload = msg_data['payload']

        # Header names are case-insensitive in email; keep the first occurrence.
        header_values = {}
        for header in payload.get('headers', []):
            header_values.setdefault(header['name'].lower(), header['value'])
        subject = header_values.get('subject', '(No Subject)')
        sender = header_values.get('from', '(Unknown)')

        # Convert internalDate (milliseconds since the epoch) to Perth timezone
        internal_ts = int(msg_data.get('internalDate', 0)) / 1000  # in seconds
        perth_dt = datetime.fromtimestamp(internal_ts, tz=pytz.utc).astimezone(perth_tz)
        received_datetime = perth_dt.strftime('%Y-%m-%d %H:%M:%S %Z')

        thread_id = msg['threadId']

        # Normalise all line endings to '\n'
        body = extract_body(payload).replace('\r\n', '\n').replace('\r', '\n')

        emails.append({
            'subject': subject,
            'sender': sender,
            'body': body,
            'thread_id': thread_id,
            'received_datetime' : received_datetime
        })

    return emails

## Step 4: Send to GPT for Extraction

# Prompt Engineering

## Instructions to GPT
You are an expert assistant for detecting job opportunities for **mechanical fitters** or **riggers** in **mining shutdowns** in Australia.

Analyze the full email thread content and return a structured JSON object **only if** there is a genuine and current work opportunity. Otherwise, return `"is_work_opportunity": false` and leave all other fields as empty lists.

## Relevance Criteria
Only return a result if the email includes one of:
- A job ad, invitation to apply, or request for availability
- Shutdown schedule confirmation or a start/end date
- (Ignore rosters)

## Extraction Fields (all as **lists**):
- `workplace`: Names of mines/sites.
- `start_date`, `end_date`: Format as `YYYY-MM-DD`. Today is {current_date}.
- `day_shift_rate`, `night_shift_rate`: Float values (e.g., 655.00).
- `position`: "Fitter" or "Rigger".
- `clean_shaven`: True or False.
- `client_name`: Extract from sender's domain; take only the first part (e.g., from `downergroup.com.au` → `downergroup`).
- `contact_number`: Digits only, no spaces.
- `email_address`: Valid contact emails.

> Ensure all lists are the same length. Duplicate or align entries as needed. Use dummy values if specific details are missing.

## Output Format
Return the following JSON object with **all keys present**, even if values are empty:
```json
{
  "is_work_opportunity": true,
  "workplace": [...],
  "start_date": [...],
  "end_date": [...],
  "day_shift_rate": [...],
  "night_shift_rate": [...],
  "position": [...],
  "clean_shaven": [...],
  "client_name": [...],
  "contact_number": [...],
  "email_address": [...]
}
```

In [None]:
import time

# Get current date in YYYY-MM-DD (UTC). The clock is read once so the year,
# month and day always describe the same instant, even if the cell runs
# across midnight.
_now_utc = time.gmtime()
current_year = _now_utc.tm_year
current_month = _now_utc.tm_mon
current_day = _now_utc.tm_mday
current_date = f'{current_year}-{current_month:02d}-{current_day:02d}'

# Instruction prompt for the model. This is an f-string: {current_date} is
# substituted when the cell runs, and the doubled braces ({{ and }}) render as
# literal single braces around the JSON template.
PROMPT_INSTRUCTIONS = f"""
You are an expert assistant for detecting job opportunities for **mechanical fitters** or **riggers** in **mining shutdowns** in Australia.

Analyze the full email thread content and return a structured JSON object **only if** there is a genuine and current work opportunity. Otherwise, return `"is_work_opportunity": false` and leave all other fields as empty lists.

## Relevance Criteria
Only return a result if the email includes one of:
- A job ad, invitation to apply, or request for availability
- Shutdown schedule confirmation or a start/end date
- (Ignore rosters)

## Extraction Fields (all as **lists**):
- `workplace`: Names of mines/sites.
- `start_date`, `end_date`: Format as `YYYY-MM-DD`. Today is {current_date}.
- `day_shift_rate`, `night_shift_rate`: Float values (e.g., 655.00).
- `position`: "Fitter" or "Rigger".
- `clean_shaven`: True or False.
- `client_name`: Extract from sender's domain; take only the first part (e.g., from `downergroup.com.au` → `downergroup`).
- `contact_number`: Digits only, no spaces.
- `email_address`: Valid contact emails.

> Ensure all lists are the same length. Duplicate or align entries as needed. Use dummy values if specific details are missing.

## Output Format
Return the following JSON object with **all keys present**, even if values are empty:
{{
  "is_work_opportunity": true,
  "workplace": [...],
  "start_date": [...],
  "end_date": [...],
  "day_shift_rate": [...],
  "night_shift_rate": [...],
  "position": [...],
  "clean_shaven": [...],
  "client_name": [...],
  "contact_number": [...],
  "email_address": [...]
}}
"""

## Step 5: Parse and Save Job Offers

In [None]:
# Load the OpenAI API key from a local file so it is never hard-coded here.
with open('OPENAI_API_KEY.txt', 'r') as f:
    OPENAI_API_KEY = f.readline().strip() # Reads the first line and removes whitespace
key_value = OPENAI_API_KEY  # previous variable name, kept for compatibility

if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY.txt is empty: expected the API key on its first line")

# Confirm the load without echoing the secret into the saved notebook output.
print("OPENAI_API_KEY loaded")

In [None]:
import json

def remove_code_fences(text):
    """Strip a surrounding Markdown code fence from ``text``.

    The text is trimmed first. A leading line that starts with three backticks
    (with or without a language tag) is dropped, then a trailing line that
    consists only of three backticks is dropped. The remaining lines are
    re-joined with ``\\n``.
    """
    body = text.strip().splitlines()

    has_opening_fence = bool(body) and body[0].strip().startswith("```")
    if has_opening_fence:
        del body[0]

    has_closing_fence = bool(body) and body[-1].strip() == "```"
    if has_closing_fence:
        body.pop()

    return "\n".join(body)

from openai import OpenAI
# Shared OpenAI client; query_gpt_model() sends its requests through it.
client = OpenAI(api_key=OPENAI_API_KEY)

def query_gpt_model(email_body, MODEL="gpt-4o", INSTRUCTIONS=PROMPT_INSTRUCTIONS, INPUT=""):
    """Send one email to the model and return the raw API response.

    Args:
        email_body: Text of the email to analyse.
        MODEL: Name of the model to query.
        INSTRUCTIONS: Instruction prompt passed as ``instructions``.
        INPUT: Optional text placed directly in front of ``email_body``.

    Returns:
        Whatever ``client.responses.create`` returns for the request.
    """
    # The request input is the optional prefix followed by the email text.
    prompt_text = INPUT + email_body
    request = {
        "model": MODEL,
        "instructions": INSTRUCTIONS,
        "input": prompt_text,
    }
    return client.responses.create(**request)

import json
import traceback

# Per-offer fields the model must return, each as a list.
_JOB_OFFER_FIELDS = (
    'workplace', 'start_date', 'end_date', 'day_shift_rate', 'night_shift_rate',
    'position', 'clean_shaven', 'client_name', 'contact_number', 'email_address',
)


def _format_email_for_model(email_obj):
    """Build the text sent to the model: sender and subject lines, then the body."""
    # The prompt asks for the client name from the sender's domain, so the
    # sender has to be part of the input and not only the body.
    return (
        f"From: {email_obj.get('sender', '')}\n"
        f"Subject: {email_obj.get('subject', '')}\n\n"
        f"{email_obj['body']}"
    )


def _aligned_offer_columns(parsed, email_preview):
    """Return ``({field: list}, length)`` with every list padded to one length."""
    columns = {}
    for key in _JOB_OFFER_FIELDS:
        value = parsed[key]
        if isinstance(value, list):
            columns[key] = list(value)
        elif value is None:
            columns[key] = []
        else:
            # A bare scalar stands for a single entry; indexing a string
            # directly would yield single characters.
            columns[key] = [value]

    lengths = {key: len(values) for key, values in columns.items()}
    max_length = max(lengths.values())

    if len(set(lengths.values())) != 1:
        print(f"[DEBUG] Inconsistent list lengths in work data for email: '{email_preview}'")
        for key, length in lengths.items():
            print(f"  - {key}: {length} -> {parsed[key]}")

        # Normalize by padding with last item (or empty string if list is empty)
        for values in columns.values():
            filler = values[-1] if values else ""
            values.extend([filler] * (max_length - len(values)))

    return columns, max_length


def process_emails_for_jobs(emails):
    """Run each email through the model and collect the job offers it reports.

    Args:
        emails: Email dicts with at least ``body`` and ``thread_id``;
            ``subject`` and ``sender`` are used when present.

    Returns:
        A list of job-offer dicts, one per aligned entry in the model output,
        each carrying the extracted fields plus ``email_thread_link``.
        Emails whose reply cannot be obtained or parsed, or that are not
        flagged as a work opportunity, contribute nothing.
    """
    job_offers = []

    for email_obj in emails:
        email_preview = email_obj.get('subject', '')[:50] or email_obj.get('body', '')[:50]

        try:
            model_output = query_gpt_model(_format_email_for_model(email_obj)).output_text
            parsed = json.loads(remove_code_fences(model_output))
        except json.JSONDecodeError:
            # An unparseable reply is treated as "not a work opportunity".
            continue
        except Exception as e:
            # Request failures (auth, quota, network, ...) must stay visible,
            # otherwise every email is skipped and the result is silently empty.
            print(f"[DEBUG] Model request failed for email: '{email_preview}': {e}")
            continue

        # Only handle if it's flagged as a work opportunity
        if not isinstance(parsed, dict) or parsed.get('is_work_opportunity') is not True:
            continue

        try:
            # Check key presence
            if not all(key in parsed for key in _JOB_OFFER_FIELDS):
                print(f"[DEBUG] Missing keys in GPT output for email preview: '{email_preview}'")
                continue

            columns, offer_count = _aligned_offer_columns(parsed, email_preview)
            thread_link = f"https://mail.google.com/mail/u/0/#inbox/{email_obj['thread_id']}"

            # Extract and store job offers
            for i in range(offer_count):
                offer = {key: columns[key][i] for key in _JOB_OFFER_FIELDS}
                offer['email_thread_link'] = thread_link
                job_offers.append(offer)

        except Exception as e:
            print(f"[DEBUG] Failed to process work opportunity from email: '{email_preview}'")
            print(f"[DEBUG] Error: {e}")
            traceback.print_exc()

    return job_offers

## Step 6: Add entries to Google calendar

In [None]:
def list_google_calendars(calendar_service):
    """Print ``<summary>\\t: <id>`` for each calendar returned by one calendarList call."""
    response = calendar_service.calendarList().list().execute()

    for entry in response.get('items', []):
        name, cal_id = entry.get('summary'), entry.get('id')
        print(f"{name}\t: {cal_id}")
    
# list_google_calendars(calendar_service)

In [None]:
def clear_calendar(calendar_service, calendar_id):
    """Delete the events listed for ``calendar_id``, one result page at a time.

    Each page of up to 2500 non-deleted events is listed and every event on it
    is deleted individually; a failed delete is reported and skipped. The loop
    ends on the first empty page or when no ``nextPageToken`` is returned.
    """
    next_token = None
    has_more = True
    while has_more:
        page = calendar_service.events().list(
            calendarId=calendar_id,
            pageToken=next_token,
            showDeleted=False,
            maxResults=2500,  # API max limit per page
        ).execute()

        page_events = page.get('items', [])
        if not page_events:
            print("No more events to delete.")
            break

        for entry in page_events:
            try:
                delete_request = calendar_service.events().delete(
                    calendarId=calendar_id,
                    eventId=entry['id'],
                )
                delete_request.execute()
                title = entry.get('summary', 'No Title')
                print(f"Deleted: {title}")
            except Exception as e:
                print(f"Failed to delete event: {e}")

        next_token = page.get('nextPageToken')
        has_more = bool(next_token)

In [None]:
def add_jobs_to_calendar(job_offers, calendar_service, calendar_id):
    """Create one all-day calendar event per job offer, skipping duplicates.

    Args:
        job_offers: Offer dicts with the extracted job fields, including
            ``start_date`` / ``end_date`` as ``YYYY-MM-DD`` strings (the end
            date is the last day of work) and ``email_thread_link``.
        calendar_service: Google Calendar API client.
        calendar_id: ID of the calendar that receives the events.

    Offers whose dates cannot be parsed, or whose end precedes the start, are
    reported and skipped. API failures are reported and do not stop the loop.
    """
    for job in job_offers:
        summary = f"{job['workplace']} | ${job['day_shift_rate']}/day & ${job['night_shift_rate']}/night | {job['client_name']}"
        start_date = job['start_date']
        end_date = job['end_date']

        # Validate the model-supplied dates before talking to the API.
        try:
            first_day = datetime.strptime(str(start_date), '%Y-%m-%d')
            last_day = datetime.strptime(str(end_date), '%Y-%m-%d')
        except ValueError:
            print(f"Skipped event with invalid dates ({start_date!r} to {end_date!r}): {summary}")
            continue
        if last_day < first_day:
            print(f"Skipped event with invalid dates ({start_date!r} to {end_date!r}): {summary}")
            continue

        start_iso = first_day.strftime('%Y-%m-%d')
        last_iso = last_day.strftime('%Y-%m-%d')
        # The Calendar API treats an all-day event's end date as exclusive, so
        # the event must end the day after the last day of work.
        exclusive_end_iso = (last_day + timedelta(days=1)).strftime('%Y-%m-%d')

        # Define the event to insert. No event-type key is set: 'event_type'
        # is not a field of the Calendar Events resource.
        event = {
            'summary': summary,
            'description': f"""
Link to Email Thread: {job['email_thread_link']}

Site: {job['workplace']}
Day Shift Rate: {job['day_shift_rate']}
Night Shift Rate: {job['night_shift_rate']}

Position: {job['position']}
Clean Shaven: {job['clean_shaven']}

Contact Email: mailto:{job['email_address']}
Phone: tel:{job['contact_number']}
""",
            'start': {
                'date': start_iso,
                'timeZone': 'Australia/Perth',
            },
            'end': {
                'date': exclusive_end_iso,
                'timeZone': 'Australia/Perth',
            },
            'location': f"{job['workplace']}"
        }

        try:
            # Search for existing events with the same summary in the job's date range
            existing_events = calendar_service.events().list(
                calendarId=calendar_id,
                q=summary,
                timeMin=f"{start_iso}T00:00:00+08:00",
                timeMax=f"{last_iso}T23:59:59+08:00",
                singleEvents=True
            ).execute()

            if existing_events.get('items'):
                print(f"Skipped duplicate event: {summary} on {start_date}")
                continue  # Skip adding this event

            # Insert new event
            event_result = calendar_service.events().insert(calendarId=calendar_id, body=event).execute()
            print("Calendar entry added:", summary, event_result.get('htmlLink'))

        except Exception as e:
            print("Failed to add calendar entry:", e)


# TESTING

In [None]:
# Get access to gmail and calendar
# (runs the OAuth helper and rebinds both service clients)
gmail_service, calendar_service = authenticate_google_services()

In [None]:
# Fetch the candidate emails that will later be scanned for job offers.
# The look-back window is chosen in days and converted to hours, the unit
# fetch_recent_emails() takes for time_delta_hours.
num_days = 7
num_hours = num_days * 24
emails = fetch_recent_emails(gmail_service, time_delta_hours=num_hours,max_results=100000)

In [None]:
# count = 0
# for email in emails:
#     print(f"Email no. {count} ({email['received_datetime']}) : {email['subject']}")
#     count = count+1

In [None]:
# Run every fetched email through the model and collect the extracted offers.
job_offers = process_emails_for_jobs(emails)

In [None]:
# WARNING: deletes the events that clear_calendar() lists for this calendar.
# NOTE(review): calendar_id is not assigned anywhere in the visible notebook —
# define it before running this cell (list_google_calendars() prints the IDs).
clear_calendar(calendar_service,calendar_id)

In [None]:
# Create one calendar event per extracted offer.
# NOTE(review): calendar_id must already be defined — it is not assigned in the
# visible notebook.
add_jobs_to_calendar(job_offers,calendar_service,calendar_id)