In [1]:
# INGEST LAMBDA - SCRAPES THE SITE
import os
import time
import boto3
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Bucket that receives the raw scraped pages, and the S3 resource used to reach it
raw_bucket = 'darrin-testing'
s3 = boto3.resource('s3')

def send_txt_to_s3(filename,data,bucket):
    """Upload ``str(data)`` to S3 as a text object.

    The payload is uploaded straight from memory instead of going through a
    temporary local file, so no file handle is left open and nothing has to
    be written to (or cleaned up from) the working directory.

    Args:
        filename: Key to store the object under in the bucket.
        data: Any object; its ``str()`` representation is what gets stored.
        bucket: Name of the destination S3 bucket.

    Returns:
        True once the upload call has returned.
    """
    # Encode explicitly as UTF-8 so the stored bytes do not depend on the
    # locale of the machine doing the upload.
    body = str(data).encode('utf-8')
    s3.Bucket(bucket).put_object(Key=filename,Body=body)
    return True

# Pages to scrape, one per manga title
manga_list = ['http://www.tenmanga.com/book/KINGDOM.html',
              'http://www.tenmanga.com/book/Hardcore+Leveling+Warrior']

# Without a timeout, requests.get() can block indefinitely on a stalled server
REQUEST_TIMEOUT_SECONDS = 30

for manga_url in manga_list:
    r = requests.get(manga_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail on 4xx/5xx responses instead of trying to parse an error page
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract the manga name from the <h1> inside the first 'book-info' div
    book_info = soup.find('div',{'class': 'book-info'})
    title_tag = book_info.find('h1') if book_info is not None else None
    if title_tag is None:
        raise ValueError('No manga title found at {}'.format(manga_url))
    manga_names = [title_tag.text]

    # Extract the raw list of chapters (only the first 'chapter-box' list is kept)
    chapter_list_raw = soup.find_all('ul',{'class': 'chapter-box'})
    if not chapter_list_raw:
        raise ValueError('No chapter list found at {}'.format(manga_url))

    # Send the raw chapter list and manga names to S3.
    # The stored payload is a two-element list: [manga name, chapter list HTML].
    manga = manga_names[0].replace(' ','_')
    import_time = time.strftime("%Y-%m-%d_%H_%M_%S")
    filename = '_'.join([manga,'info',import_time]) + '.txt'
    data = manga_names + [str(chapter_list_raw[0])]
    send_txt_to_s3(filename,data,raw_bucket)


In [None]:
# INITIAL PROCESSING LAMBDA - FOR NOW, DOES NOTHING BUT COPY
# TODO: add code to copy the file from the raw bucket to the processed bucket.
# This example can be used as a starting point: https://medium.com/@stephinmon.antony/aws-lambda-with-python-examples-2eb227f5fafe

In [2]:
# PROCESSING LAMBDA - PROCESSES INGESTED FILES
import os
import ast 
import time
import json
import boto3
from bs4 import BeautifulSoup

def send_dict_to_s3(filename,data,bucket):
    """Serialize ``data`` as JSON and upload it to S3.

    The JSON text is uploaded straight from memory instead of going through
    a temporary local file, so no file handle is left open and nothing has
    to be written to (or cleaned up from) the working directory.

    Args:
        filename: Key to store the object under in the bucket.
        data: JSON-serializable object (the caller in this notebook passes a dict).
        bucket: Name of the destination S3 bucket.

    Returns:
        True once the upload call has returned.

    Raises:
        TypeError: If ``data`` contains values ``json.dumps`` cannot serialize.
    """
    # json.dumps() escapes non-ASCII characters by default, so the encoded
    # bytes are plain ASCII.
    body = json.dumps(data).encode('utf-8')
    s3.Bucket(bucket).put_object(Key=filename,Body=body)
    return True

def data_lengths_test(data):
    """Check that the 'chapter', 'date_uploaded' and 'url' columns line up.

    Returns True when all three sequences have the same length and raises
    ValueError otherwise.
    """
    chapter_count = len(data['chapter'])
    date_count = len(data['date_uploaded'])
    if chapter_count != date_count:
        raise ValueError('Data Column Lengths are not equal')
    if date_count != len(data['url']):
        raise ValueError('Data Column Lengths are not equal')
    return True

# Connect to s3 buckets
s3 = boto3.resource('s3',region_name='us-east-2')
processed_bucket = 'darrin-testing'
published_bucket = 'darrin-testing'

# Download file that was just added
filename_from_event = 'Hardcore_Leveling_Warrior_Manga_info_2019-03-24_09_53_47.txt'
s3.Bucket(processed_bucket).download_file(filename_from_event,filename_from_event)

# Read in data: the file holds the repr of a two-element list,
# [manga name, chapter list HTML]. The local copy is removed even if
# parsing fails.
# NOTE(review): assumes the object was written as UTF-8 (or plain ASCII) - confirm
try:
    with open(filename_from_event,'r',encoding='utf-8') as raw_file:
        raw_record = ast.literal_eval(raw_file.read())
finally:
    os.remove(filename_from_event)
manga_name = raw_record[0]
chapter_list_raw = BeautifulSoup(raw_record[1], 'html.parser')

# Parse for chapter information (only <li> elements without a class attribute)
chapter_links = []; chapter_names = []; date_uploads = []
for chapter in chapter_list_raw.find_all('li',{'class':None}):
    name_div = chapter.find('div',{'class': 'chapter-name short'})
    chapter_links.append(name_div.a.get('href'))
    date_uploads.append(chapter.find('div',{'class': 'add-time page-hidden'}).text)
    # The chapter identifier is the last space-separated token of the name.
    # Strip it in every case so stray whitespace never ends up in the output,
    # then drop a trailing 'new' marker if one is attached.
    chapter_name = name_div.text.split(' ')[-1].strip()
    if chapter_name.endswith('new'):
        chapter_name = chapter_name[:-3]
    chapter_names.append(chapter_name)

# Create dict and send data
data = {'title':manga_name,
        'chapter':chapter_names,
        'date_uploaded':date_uploads,
        'url':chapter_links}

# Validate that every chapter has a matching upload date and url
data_lengths_test(data)

# It doesn't matter if the file extension is json or txt
# - that just helps the machine know what to open it with
filename = filename_from_event.replace('info','parsed')
send_dict_to_s3(filename,data,published_bucket)

True

In [2]:
# COMPARISON LAMBDA - ADD NEW LINES TO DYNAMODB TABLE
import os
import time
import json
import boto3
from decimal import Decimal, InvalidOperation
from boto3.dynamodb.conditions import Key

def _chapter_as_decimal(value):
    """Return ``value`` as a finite Decimal, or None if it is not a usable number."""
    try:
        number = Decimal(str(value))
    except InvalidOperation:
        return None
    return number if number.is_finite() else None


def ix_to_remove(data,response):
    """Find positions in ``data['chapter']`` that are already present in the table.

    Chapters are compared by numeric value rather than by string, because the
    table rows are written with ``Decimal`` chapter numbers: a scraped
    '12.50' or '014' must match a stored ``Decimal('12.5')`` or
    ``Decimal('14')``.

    Args:
        data: Dict with a 'chapter' list of chapter identifiers (strings).
        response: Query result whose 'Items' entries each carry a 'chapter' value.

    Returns:
        List of indices into ``data['chapter']`` whose chapter already exists
        in ``response['Items']``. Each index appears at most once. Chapters
        that are not valid numbers are never reported.
    """
    # Map each numeric chapter value to every position where it occurs
    positions_by_number = {}
    for position, chapter in enumerate(data['chapter']):
        number = _chapter_as_decimal(chapter)
        if number is not None:
            positions_by_number.setdefault(number, []).append(position)

    remove_ix = []
    for item in response['Items']:
        number = _chapter_as_decimal(item['chapter'])
        if number is not None:
            # pop() so a repeated stored chapter cannot report an index twice
            remove_ix.extend(positions_by_number.pop(number, []))
    return remove_ix

# Connect to processed data bucket
s3 = boto3.resource('s3',region_name='us-east-2')
published_bucket = 'darrin-testing'

# Connect to DynamoDB table
dynamodb = boto3.resource('dynamodb',region_name='us-east-1')
table = dynamodb.Table('manga_chapters')

# Download file that was just added and load data.
# The local copy is removed even if loading fails.
filename_from_event = 'Hardcore_Leveling_Warrior_Manga_parsed_2019-03-24_09_53_47.txt'
s3.Bucket(published_bucket).download_file(filename_from_event,filename_from_event)
try:
    with open(filename_from_event) as json_file:
        data = json.load(json_file)
finally:
    os.remove(filename_from_event)

# Query the DynamoDB table for that title.
# The Table resource already knows its own name, so TableName is not passed.
title = data['title']
query_kwargs = {'KeyConditionExpression': Key('title').eq(title)}
response = table.query(**query_kwargs)

# A single query call returns at most one page of results; keep fetching
# while DynamoDB reports a LastEvaluatedKey and fold every page into `response`.
last_page = response
while 'LastEvaluatedKey' in last_page:
    last_page = table.query(ExclusiveStartKey=last_page['LastEvaluatedKey'],
                            **query_kwargs)
    response['Items'].extend(last_page['Items'])
    response['Count'] += last_page['Count']

if response['Count'] == 0:
    # No records at present: every scraped chapter is new
    print('This is a whole new manga')
else:
    # Records exist: drop the chapters that are already stored
    print('There are chapters for this manga')
    remove_ix = ix_to_remove(data,response)

    # Delete from the highest index down so earlier positions stay valid;
    # set() guards against an index being reported more than once.
    for index in sorted(set(remove_ix),reverse=True):
        for my_list in [data['chapter'],data['date_uploaded'],data['url']]:
            del my_list[index]

# Insert whatever is left; chapters that are not valid numbers are reported
# and skipped.
date_ingested = time.strftime("%Y-%m-%d")
for i in range(len(data['chapter'])):
    try:
        table.put_item(
            Item = {
                'title': title,
                'chapter': Decimal(data['chapter'][i]),
                'date_ingested': date_ingested,
                'date_uploaded': data['date_uploaded'][i],
                'url': data['url'][i]
            }
        )
    except InvalidOperation:
        print('{} not added.'.format(data['chapter'][i]))

There are chapters for this manga
