In [27]:
import boto3
import json

sns = boto3.client('sns')
aws_lambda = boto3.client('lambda')
iam_client = boto3.client('iam')
role = iam_client.get_role(RoleName='LabRole')
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

In [4]:
# Create S3
bucket_name = 'patent-bucket-raw'
try:
    response = s3.create_bucket(Bucket=bucket_name)
except s3_client.exceptions.BucketAlreadyExists:
    response = s3_client.list_buckets()


In [28]:
# Create Lambda Function
with open('lambda_function.zip', 'rb') as f:
    lambda_zip = f.read()

try:
    # If function hasn't yet been created, create it
    response = aws_lambda.create_function(
        FunctionName='scarping_patent',
        Runtime='python3.9',
        Role=role['Role']['Arn'],
        Handler='lambda_function.lambda_handler',
        Code=dict(ZipFile=lambda_zip),
        Timeout=10
    )
except aws_lambda.exceptions.ResourceConflictException:
    # If function already exists, update it based on zip
    # file contents
    response = aws_lambda.update_function_code(
    FunctionName='scarping_patent',
    ZipFile=lambda_zip
    )

# Get Lambda Function ARN and Role ARN
lambda_arn = [f['FunctionArn']
                for f in aws_lambda.list_functions()['Functions']
                if f['FunctionName'] == 'scarping_patent'][0]

# Restrict the number of concurrent executions to 10
response = aws_lambda.put_function_concurrency(
        FunctionName='scarping_patent',
        ReservedConcurrentExecutions=10
    )

In [29]:
# This creates a Step Function which orchestrates more complicated workflows
## In this case, it runs multiple Lambda functions in parallel
sfn = boto3.client('stepfunctions')

def make_def(lambda_arn):
    definition = {
      "Comment": "My State Machine",
      "StartAt": "Map",
      "States": {
        "Map": {
          "Type": "Map",
          "End": True,
          "MaxConcurrency": 10,
          "Iterator": {
            "StartAt": "Lambda Invoke",
            "States": {
              "Lambda Invoke": {
                "Type": "Task",
                "Resource": "arn:aws:states:::lambda:invoke",
                "OutputPath": "$.Payload",
                "Parameters": {
                  "Payload.$": "$",
                  "FunctionName": lambda_arn
                },
                "Retry": [
                  {
                    "ErrorEquals": [
                      "Lambda.ServiceException",
                      "Lambda.AWSLambdaException",
                      "Lambda.SdkClientException",
                      "Lambda.TooManyRequestsException",
                      "States.TaskFailed"
                    ],
                    "IntervalSeconds": 2,
                    "MaxAttempts": 6,
                    "BackoffRate": 2
                  }
                ],
                "End": True
              }
            }
          }
        }
      }
    }
    return definition

In [30]:
sf_def = make_def(lambda_arn)

# Create new State machine
try:
    response = sfn.create_state_machine(
        name='patent_machine',
        definition=json.dumps(sf_def),
        roleArn=role['Role']['Arn'],
        type='EXPRESS'
    )
except sfn.exceptions.StateMachineAlreadyExists:
    response = sfn.list_state_machines()
    state_machine_arn = [sm['stateMachineArn'] 
                         for sm in response['stateMachines'] 
                         if sm['name'] == 'test_step_machine'][0]
    response = sfn.update_state_machine(
        stateMachineArn=state_machine_arn,
        definition=json.dumps(sf_def),
        roleArn=role['Role']['Arn']
    )

In [31]:
response = sfn.list_state_machines()
response

{'stateMachines': [{'stateMachineArn': 'arn:aws:states:us-east-1:840177592390:stateMachine:patent_machine',
   'name': 'patent_machine',
   'type': 'EXPRESS',
   'creationDate': datetime.datetime(2024, 5, 6, 20, 30, 8, 49000, tzinfo=tzlocal())}],
 'ResponseMetadata': {'RequestId': '953f7aa8-1abf-42b8-9ad6-40e35ea584dd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '953f7aa8-1abf-42b8-9ad6-40e35ea584dd',
   'date': 'Tue, 07 May 2024 01:32:35 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '180',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [32]:
import pandas as pd
import numpy as np

def split_csv_to_json(csv_path, n):
    # 读取 CSV 文件
    data = pd.read_csv(csv_path)
    
    # 去除 'id' 和 'result_link' 列中含有缺失值的行
    data.dropna(subset=['id', 'result link'], inplace=True)
    
    # 计算每个分割的理想大小
    chunk_size = len(data) // n
    remainder = len(data) % n

    # 创建一个空列表来存储最终的 JSON 对象
    json_list = []
    
    # 分割数据并处理每一部分
    for i in range(n):
        # 如果最后一块，包括所有剩余的行
        if i < remainder:
            part_data = data.iloc[i * (chunk_size + 1):(i + 1) * (chunk_size + 1)]
        else:
            start_idx = remainder * (chunk_size + 1) + (i - remainder) * chunk_size
            end_idx = start_idx + chunk_size
            part_data = data.iloc[start_idx:end_idx]
        
        # 从部分数据中提取 'id' 和 'result_link' 列
        ids = part_data['id'].tolist()
        links = part_data['result link'].tolist()
        
        # 创建一个 JSON 对象
        json_obj = {'ids': ids, 'result_links': links}
        
        # 将 JSON 对象转换为字符串并添加到列表中
        json_list.append(json_obj)
    
    return json_list

# 示例用法
json_output = split_csv_to_json('all_patents_link.csv', 5)
print(json_output)


[{'ids': ['US-10171659-B2', 'US-2019012595-A1'], 'result_links': ['https://patents.google.com/patent/US10171659B2/en', 'https://patents.google.com/patent/US20190012595A1/en']}, {'ids': ['US-2019018692-A1', 'US-2019005195-A1'], 'result_links': ['https://patents.google.com/patent/US20190018692A1/en', 'https://patents.google.com/patent/US20190005195A1/en']}, {'ids': ['US-10192163-B2', 'US-10186155-B2'], 'result_links': ['https://patents.google.com/patent/US10192163B2/en', 'https://patents.google.com/patent/US10186155B2/en']}, {'ids': ['US-2019034192-A1', 'US-2019034198-A1'], 'result_links': ['https://patents.google.com/patent/US20190034192A1/en', 'https://patents.google.com/patent/US20190034198A1/en']}, {'ids': ['US-2019034256-A1'], 'result_links': ['https://patents.google.com/patent/US20190034256A1/en']}]


In [25]:
json1 = json_output[0]

In [19]:
r = aws_lambda.invoke(FunctionName='scarping_patent',
                      InvocationType='RequestResponse',
                      Payload=json.dumps(json1))
json.loads(r['Payload'].read()) # print out response

In [33]:
state_machine_arn = [sm['stateMachineArn'] 
                     for sm in response['stateMachines'] 
                     if sm['name'] == 'patent_machine'][0]

# Call Lambda functions 10 times in parallel
sfn_response = sfn.start_sync_execution(
    stateMachineArn=state_machine_arn,
    name='sync_test',
    input=json.dumps(json_output)
)

print(sfn_response['output'])

[null,null,null,null,null]


In [26]:
# Delete each pipeline component if it still exists:
# Lambda
try:
    aws_lambda.delete_function(FunctionName="scraping_patent")
    print("Lambda Function Deleted")
except aws_lambda.exceptions.ResourceNotFoundException:
    print("AWS Lambda Function Already Deleted")
    
event_source_uuids = [mapping['UUID'] for mapping in aws_lambda.list_event_source_mappings()['EventSourceMappings']]
for uuid in event_source_uuids:
    aws_lambda.delete_event_source_mapping(UUID=uuid)
print("Event source mappings deleted")

# State Machine
try:
    sfn.delete_state_machine(stateMachineArn=state_machine_arn)
    print("State Machine Deleted")
except sfn.exceptions.StateMachineDoesNotExist:
    print("State Machine Already Deleted")


AWS Lambda Function Already Deleted
Event source mappings deleted


In [34]:
# Delet S3
bucket = s3.Bucket(bucket_name)
bucket.objects.all().delete()
bucket.delete()

{'ResponseMetadata': {'RequestId': '0E5GWJ78YHVZV299',
  'HostId': '5ZRmMF55A7bGBMR0om9BtNIpnl8e25f7ucxpiYfb+H4+rESq/+Xn/5ugu6uohq/V7ly3N/03m3U=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': '5ZRmMF55A7bGBMR0om9BtNIpnl8e25f7ucxpiYfb+H4+rESq/+Xn/5ugu6uohq/V7ly3N/03m3U=',
   'x-amz-request-id': '0E5GWJ78YHVZV299',
   'date': 'Tue, 07 May 2024 01:37:05 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}