In [1]:
import base64
import email
import logging
import os.path
import pickle
import re
import urllib.parse
import urllib.request
from datetime import datetime
from email.utils import parsedate_to_datetime

import lxml
import pandas as pd
from apiclient import errors
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from tqdm import trange

In [100]:
class Email:
    """Gmail client that collects broker research PDFs for known stock numbers.

    On construction it authenticates against the Gmail API and loads two lookup
    tables from ``./src/24932_個股代號及券商名稱.xlsx`` (sheet 0: stock number ->
    name, sheet 1: the investment-company table).  ``get()`` then walks the
    inbox, saves matching PDF reports under ``./file/<stock number>/`` and
    returns a DataFrame describing what was saved.
    """

    # Placeholder triple returned by the download helpers when nothing was saved.
    _NO_RESULT = ("null", "null", "null")

    # Four-digit stock numbers in a subject line (not followed by a digit or "/").
    _STOCK_NUM_RE = re.compile(r'^\d{4}(?=[^\d\/])|(?<=[^\d])\d{4}(?=[^\d\/])')
    # Yuanta research download links embedded in the mail body.
    _YUANTA_URL_RE = re.compile(r"http://research2014\.yuanta\.com/DL\.aspx\?r=\d{6}")
    # wls.com.tw e-paper links embedded in the mail body.
    _WLS_URL_RE = re.compile(
        r"http://www\.wls\.com\.tw/CancelLegal/.+Email=.+EpaperID=.+EpaperClassID=[a-zA-Z0-9\-]+")

    # Label change applied to mails whose subject carries no stock number.
    _UNMATCHED_LABEL_BODY = {"addLabelIds": ["Label_3"], "removeLabelIds": ["INBOX"]}

    def __init__(self):
        # Define the SCOPES. If modifying it, delete the token.pickle file.
        self.SCOPES = ['https://www.googleapis.com/auth/gmail.readonly', 'https://www.googleapis.com/auth/gmail.modify']
        self.creds = self.getCreds()
        # Connect to the Gmail API
        self.service = build('gmail', 'v1', credentials = self.creds)
        self.df_stock_num2name = pd.read_excel("./src/24932_個股代號及券商名稱.xlsx", index_col = 0, names = ['name'] , sheet_name = 0)
        self.df_investment_company = pd.read_excel("./src/24932_個股代號及券商名稱.xlsx", index_col = 0, names = ['name'], sheet_name = 1)
        self.dict_stock_num2name = self.df_stock_num2name.to_dict(orient = 'dict')['name']
        self.dict_investment_company = self.df_investment_company.to_dict(orient = 'dict')['name']
        self.mail2label2Body = { "addLabelIds": ["Label_2"], "removeLabelIds" : ["INBOX"] }

    def getCreds(self):
        """Load cached OAuth credentials, refreshing or re-authorising as needed.

        Returns:
            The credentials object, which is also (re)written to ``token.pickle``
            whenever it had to be refreshed or newly obtained.
        """
        creds = None

        # The file token.pickle contains the user access token.
        # NOTE: pickle.load executes arbitrary code from the file; only use a
        # token.pickle that this notebook wrote itself.
        if os.path.exists('token.pickle'):
            with open('token.pickle', 'rb') as token:
                creds = pickle.load(token)

        # If credentials are not available or are invalid, ask the user to log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file('credentials.json', self.SCOPES)
                creds = flow.run_local_server(port=0)

            # Save the access token in token.pickle file for the next run
            with open('token.pickle', 'wb') as token:
                pickle.dump(creds, token)
        return creds

    def getMessages(self, encodedData):
        """Decode a base64url message body and parse it as HTML.

        Args:
            encodedData: base64url text as found in a message body's ``data`` field.

        Returns:
            A BeautifulSoup document, or the string ``"Null"`` when the input is
            missing or cannot be decoded as UTF-8 base64url text.
        """
        try:
            # Restore any stripped "=" padding so the decoder does not reject the data.
            padded = encodedData + "=" * (-len(encodedData) % 4)
            decoded_data = base64.urlsafe_b64decode(padded).decode("utf-8")
        except (AttributeError, TypeError, ValueError):
            # binascii.Error and UnicodeDecodeError are both ValueError subclasses.
            return "Null"
        return BeautifulSoup(decoded_data, 'html.parser')

    def check_pdf_dir(self):
        """Ensure ``./file`` and one sub-directory per known stock number exist.

        The full scan runs once per instance; sub-directories missing from an
        already existing ``./file`` folder are created as well.
        """
        if getattr(self, "_pdf_dirs_ready", False):
            return

        os.makedirs("file", exist_ok = True)
        for key in self.dict_stock_num2name:
            os.makedirs(os.path.join("file", str(key)), exist_ok = True)
        self._pdf_dirs_ready = True

    @staticmethod
    def _safe_filename(raw_name):
        """Strip directory components so a remote-supplied name cannot escape ./file."""
        return os.path.basename(raw_name.replace("\\", "/"))

    @staticmethod
    def _target_path(num, filename):
        """Return ``./file/<num>/<filename>``, creating the folder if necessary."""
        os.makedirs(os.path.join("file", num), exist_ok = True)
        return "./file/" + num + "/" + filename

    @staticmethod
    def _resolve_redirect(url, timeout = 30):
        """Follow HTTP redirects for ``url`` and return the final URL."""
        with urllib.request.urlopen(url, timeout = timeout) as response:
            return response.geturl()

    def getAttachments(self, encodedFile, msgID, stock_num_name):
        """Save a PDF attachment whose file name contains one of the stock numbers.

        Args:
            encodedFile: one entry of the message payload's ``parts`` list.
            msgID: id of the message the part belongs to.
            stock_num_name: iterable of ``[number, name]`` pairs (number as str).

        Returns:
            ``(number, name, saved_path)`` on success, otherwise
            ``("null", "null", "null")``.
        """
        fileName = self._safe_filename(encodedFile.get("filename") or "")

        if ".pdf" not in fileName:
            return self._NO_RESULT

        self.check_pdf_dir()
        body = encodedFile.get('body', {})

        for num, name in stock_num_name:
            if num not in fileName:
                continue

            if 'attachmentId' in body:
                try:
                    att = self.service.users().messages().attachments().get(
                        userId = 'me', messageId = msgID, id = body['attachmentId']).execute()
                except errors.HttpError as error:
                    print('An error occurred: %s' % error)
                    continue
                file = att['data']
            else:
                # Without an attachmentId the bytes are carried inline in the part body.
                file = body.get('data')
                if file is None:
                    return self._NO_RESULT

            file_data = base64.urlsafe_b64decode(file.encode('UTF-8'))
            path = self._target_path(num, fileName)

            with open(path, 'wb') as f:
                f.write(file_data)

            return num, name, path

        return self._NO_RESULT

    def _download_matching(self, pdfurl, stock_num_name, prefix):
        """Download ``pdfurl`` if its file name contains a known stock number.

        Returns ``(number, name, saved_path)`` or ``None`` when no number matches.
        """
        filename = self._safe_filename(urllib.parse.unquote(pdfurl.split("/")[-1]))

        for num, name in stock_num_name:
            if str(num) in filename:
                file_rename = self._target_path(str(num), prefix + name + "_" + filename)
                urllib.request.urlretrieve(pdfurl, file_rename)
                return num, name, file_rename
        return None

    def _download_yuanta(self, content, stock_num_name):
        """Try Yuanta links found in <span> tags; ``None`` means nothing matched."""
        span_tags = content.find_all('span')

        # The first <span> is skipped, as the original implementation did.
        for span in span_tags[1:]:
            urls = self._YUANTA_URL_RE.findall(str(span))
            if not urls:
                continue
            try:
                pdfurl = self._resolve_redirect(urls[0])
                saved = self._download_matching(pdfurl, stock_num_name, "元大_")
            except Exception as exc:
                print("link error")
                logging.warning("Download failed for %s: %s", urls[0], exc)
                return self._NO_RESULT
            if saved is not None:
                return saved
        return None

    def _download_wls(self, content, stock_num_name):
        """Try wls.com.tw links found in <a target="_blank">; ``None`` means nothing matched."""
        a_tags = content.find_all('a', target = "_blank")

        for a in a_tags:
            urls = self._WLS_URL_RE.findall(str(a).replace("&amp;", "&"))
            if not urls:
                continue
            try:
                checkpage = re.search(r'checkpage\d?', urls[0]).group()
                EpaperID = re.search(r'EpaperID=[a-zA-Z0-9\-]+', urls[0]).group()
                redirectURL = "http://www.wls.com.tw/CancelLegal/{}.aspx?EpaperID={}".format(
                    checkpage.replace("checkpage", "check"), EpaperID.replace("EpaperID=", ''))
                pdfurl = self._resolve_redirect(redirectURL)
                saved = self._download_matching(pdfurl, stock_num_name, "")
            except Exception as exc:
                print("link error")
                logging.warning("Download failed for %s: %s", urls[0], exc)
                return self._NO_RESULT
            if saved is not None:
                return saved
        return None

    def getAttachmentsURL(self, content, stock_num_name):
        """Download a report linked from the mail body instead of attached to it.

        Yuanta links are tried first; if none yields a file, wls.com.tw links
        are tried.  A failing link stops the search, as before.

        Args:
            content: parsed mail body (BeautifulSoup); anything without
                ``find_all`` (e.g. the ``"Null"`` marker from ``getMessages``)
                yields the placeholder result.
            stock_num_name: iterable of ``[number, name]`` pairs (number as str).

        Returns:
            ``(number, name, saved_path)`` on success, otherwise
            ``("null", "null", "null")``.
        """
        if not hasattr(content, "find_all"):
            return self._NO_RESULT

        self.check_pdf_dir()

        result = self._download_yuanta(content, stock_num_name)
        if result is None:
            result = self._download_wls(content, stock_num_name)
        return result if result is not None else self._NO_RESULT

    def getSubject(self, header, display = False):
        """Return the Subject header value, or ``None`` if the header is absent.

        Header names are compared case-insensitively.
        """
        for d in header:
            if d['name'].lower() == 'subject':
                if display:
                    print("Subject: ", d['value'])
                    print("-----" * 20)
                return d['value']
        return None

    def getDate(self, header, display = False):
        """Return the Date header formatted as ``<year>_<month>_<day>``.

        The day is zero-padded to two digits and the month is not (for example
        ``2021_1_05``), matching the format this method has always produced.
        The calendar date is taken as written in the header, without time-zone
        conversion.

        Returns:
            The formatted date, or ``None`` if the header is absent or unparseable.
        """
        for d in header:
            if d['name'].lower() != 'date':
                continue

            try:
                parsed = parsedate_to_datetime(d['value'])
            except (TypeError, ValueError):
                # TypeError on older Pythons, ValueError on newer ones.
                logging.warning("Unparseable Date header: %r", d['value'])
                return None

            date = "{}_{}_{:02d}".format(parsed.year, parsed.month, parsed.day)

            if display:
                print("Date: ", date)
                print("-----" * 20)
            return date
        return None

    def modifyLabels(self, msgID, requestBody):
        """Apply a label change (``addLabelIds`` / ``removeLabelIds``) to one message."""
        result = self.service.users().messages().modify(userId = 'me', id = msgID, body = requestBody).execute()

        return result

    def _collect_files(self, payload, msgID, stock_res):
        """Return the result triples for one message (attachments or body links)."""
        if 'parts' in payload:
            # Part 0 is skipped, as the original implementation did.
            return [self.getAttachments(part, msgID, stock_res) for part in payload['parts'][1:]]

        # Single-part message: look for download links in the body instead.
        content = self.getMessages(payload.get("body", {}).get("data"))
        return [self.getAttachmentsURL(content, stock_res)]

    def get(self):
        """Scan up to 500 inbox messages and save the reports they carry.

        Messages whose subject contains no four-digit stock number are moved
        from INBOX to ``Label_3``.  Progress is logged to ``./log/<date>.log``.

        Returns:
            A DataFrame with columns ``Number``, ``Name``, ``Date`` and
            ``File path``, or the string ``"No email"`` when the inbox is empty.
        """
        # Create log file
        os.makedirs("log", exist_ok = True)

        FORMAT = '%(asctime)s %(levelname)s: %(message)s'
        logging.basicConfig(level = logging.INFO, filename = "./log/" + datetime.now().strftime("%Y_%m_%d") + '.log', filemode = 'w', format = FORMAT)
        logging.info('Updating email start')

        Num, Name, Date, Path = [[] for i in range(4)]

        # request a list of all the messages
        result = self.service.users().messages().list(userId = 'me', maxResults = 500, labelIds = ["INBOX"]).execute()
        # The response may carry no 'messages' key at all.
        messages = result.get('messages') or []

        if not messages:
            return "No email"

        # iterate through all the messages
        for i in trange(len(messages)):
            msgID = messages[i]['id']

            # Get the message from its id
            txt = self.service.users().messages().get(userId = 'me', id = msgID).execute()
            payload = txt['payload']
            headers = payload['headers']
            date = self.getDate(headers)
            subject = self.getSubject(headers)

            # Filter email: a missing subject is treated like one without a stock number.
            stock_candidates = self._STOCK_NUM_RE.findall(subject) if subject else []

            if not stock_candidates:
                self.modifyLabels(msgID, self._UNMATCHED_LABEL_BODY)
                continue

            stock_res = [[str(key), value] for key, value in self.dict_stock_num2name.items() if str(key) in stock_candidates]

            try:
                found = self._collect_files(payload, msgID, stock_res)
            except Exception:
                # One malformed message must not abort the whole run; record it and move on.
                logging.exception("Failed to process message %s (%s)", msgID, subject)
                continue

            for num, name, path in found:
                if path != "null":
                    Num.append(num)
                    Name.append(name)
                    Date.append(date)
                    Path.append(path)

        df = pd.DataFrame({ "Number" : Num, "Name" : Name, "Date" : Date, "File path" : Path })
        logging.info('Updating email end')

        return df

In [None]:
# Build the Gmail client (authenticates and loads the Excel lookup tables), then
# scan the inbox; `get()` returns a DataFrame of saved reports or the string "No email".
# NOTE(review): the variable name `email` rebinds the `email` module imported in the
# first cell — confirm nothing later relies on the module before keeping this name.
email = Email()
emailDF = email.get()

  1%|▎                                          | 4/500 [00:01<03:10,  2.60it/s]

In [52]:
# Display the result of the inbox scan; uncomment the next line to also export it to CSV.
# emailDF.to_csv("1.csv", index = False)
emailDF

Unnamed: 0,Number,Name,Date,File path
