In [1]:
!pip install findspark

Collecting findspark
  Using cached findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Using cached findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [2]:
from typing import List
from datetime import datetime
from pyspark import SparkContext, RDD
from pyspark.sql import SparkSession
import sys
import pyspark
import time
import findspark
findspark.init()

In [3]:
sc = pyspark.SparkContext.getOrCreate()

In [4]:
log_rdd = sc.textFile('data/log.txt')
log_rdd

data/log.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [5]:
log_rdd.take(10)

['192.217.41.39 | [25/Feb/2023:05:16:25] | "PATCH /lists HTTP/1.1" | 300',
 '199.211.153.193 | [25/Feb/2023:05:16:25] | "PATCH /events HTTP/1.1" | 404',
 '44.123.5.41 | [25/Feb/2023:05:16:25] | "PATCH /playbooks HTTP/1.1" | 301',
 '187.116.192.22 | [25/Feb/2023:05:16:25] | "PUT /customers HTTP/1.1" | 503',
 '95.233.74.58 | [25/Feb/2023:05:16:26] | "POST /events HTTP/1.1" | 503',
 '163.182.179.76 | [25/Feb/2023:05:16:26] | "PATCH /customers HTTP/1.1" | 500',
 '136.214.175.83 | [25/Feb/2023:05:16:27] | "POST /alerts HTTP/1.1" | 200',
 '148.87.176.94 | [25/Feb/2023:05:16:27] | "PATCH /fieldsets HTTP/1.1" | 500',
 '10.93.154.10 | [25/Feb/2023:05:16:27] | "GET /alerts HTTP/1.1" | 204',
 '38.32.236.30 | [25/Feb/2023:05:16:27] | "PUT /lists HTTP/1.1" | 204']

In [6]:
# check count
print(f"count of RDD ==> {log_rdd.count()}")

count of RDD ==> 2060


In [8]:
# a) map
# a-1) log.txt 의 각 행을 List[str] 형태로 받아오기.
def parse_line(row: str):
    return row.strip().split(" | ")

parsed_log_rdd = log_rdd.map(parse_line)
parsed_log_rdd.take(10)

[['192.217.41.39', '[25/Feb/2023:05:16:25]', '"PATCH /lists HTTP/1.1"', '300'],
 ['199.211.153.193',
  '[25/Feb/2023:05:16:25]',
  '"PATCH /events HTTP/1.1"',
  '404'],
 ['44.123.5.41',
  '[25/Feb/2023:05:16:25]',
  '"PATCH /playbooks HTTP/1.1"',
  '301'],
 ['187.116.192.22',
  '[25/Feb/2023:05:16:25]',
  '"PUT /customers HTTP/1.1"',
  '503'],
 ['95.233.74.58', '[25/Feb/2023:05:16:26]', '"POST /events HTTP/1.1"', '503'],
 ['163.182.179.76',
  '[25/Feb/2023:05:16:26]',
  '"PATCH /customers HTTP/1.1"',
  '500'],
 ['136.214.175.83',
  '[25/Feb/2023:05:16:27]',
  '"POST /alerts HTTP/1.1"',
  '200'],
 ['148.87.176.94',
  '[25/Feb/2023:05:16:27]',
  '"PATCH /fieldsets HTTP/1.1"',
  '500'],
 ['10.93.154.10', '[25/Feb/2023:05:16:27]', '"GET /alerts HTTP/1.1"', '204'],
 ['38.32.236.30', '[25/Feb/2023:05:16:27]', '"PUT /lists HTTP/1.1"', '204']]

In [9]:
# b) filter
# b-1) status code가 404인 log만 필터링.
def get_only_404(row: List[str]):
    status_code = row[3]
    return status_code == '404'

rdd_404 = parsed_log_rdd.filter(get_only_404)
rdd_404.take(5)

[['199.211.153.193',
  '[25/Feb/2023:05:16:25]',
  '"PATCH /events HTTP/1.1"',
  '404'],
 ['173.43.141.206',
  '[25/Feb/2023:05:16:30]',
  '"POST /collectors HTTP/1.1"',
  '404'],
 ['37.40.113.2', '[25/Feb/2023:05:16:26]', '"PATCH /alerts HTTP/1.1"', '404'],
 ['13.10.183.239', '[25/Feb/2023:05:16:28]', '"POST /users HTTP/1.1"', '404'],
 ['183.235.75.252',
  '[25/Feb/2023:05:16:30]',
  '"GET /collectors HTTP/1.1"',
  '404']]

In [10]:
# b-2) status code가 정상인 경우(2xx)인 log만 필터링.
def get_only_2xx(row: List[str]):
        status_code = row[3]
        return status_code.startswith("2")

rdd_normal = parsed_log_rdd.filter(get_only_2xx)
rdd_normal.take(5)

[['136.214.175.83',
  '[25/Feb/2023:05:16:27]',
  '"POST /alerts HTTP/1.1"',
  '200'],
 ['10.93.154.10', '[25/Feb/2023:05:16:27]', '"GET /alerts HTTP/1.1"', '204'],
 ['38.32.236.30', '[25/Feb/2023:05:16:27]', '"PUT /lists HTTP/1.1"', '204'],
 ['8.39.222.57',
  '[25/Feb/2023:05:16:29]',
  '"DELETE /fieldsets HTTP/1.1"',
  '204'],
 ['160.194.15.244', '[25/Feb/2023:05:16:26]', '"PUT /auth HTTP/1.1"', '204']]

In [11]:
# b-3) post 요청이고 /playbooks api인 log만 필터링.
def get_post_request_and_playbooks_api(row: List[str]):
    log = row[2].replace("\"", "")
    return log.startswith("POST") and "/playbooks" in log

rdd_post_playbooks = parsed_log_rdd \
        .filter(get_post_request_and_playbooks_api)
rdd_post_playbooks.take(5)

[['117.3.196.239',
  '[25/Feb/2023:05:16:27]',
  '"POST /playbooks HTTP/1.1"',
  '200'],
 ['173.214.123.108',
  '[25/Feb/2023:05:16:38]',
  '"POST /playbooks HTTP/1.1"',
  '301'],
 ['36.8.178.226',
  '[25/Feb/2023:05:16:37]',
  '"POST /playbooks HTTP/1.1"',
  '300'],
 ['46.58.250.211',
  '[25/Feb/2023:05:16:38]',
  '"POST /playbooks HTTP/1.1"',
  '400'],
 ['70.52.162.100',
  '[25/Feb/2023:05:16:40]',
  '"POST /playbooks HTTP/1.1"',
  '300']]

In [12]:
# c) reduce
# c-1) API method (POST/GET/PUT/PATCH/DELETE) 별 개수 출력.
def extract_api_method(row: List[str]):
    api_log = row[2].replace("\"", "")
    api_method = api_log.split(" ")[0]
    return api_method, 1

rdd_count_by_api_method = parsed_log_rdd.map(extract_api_method) \
    .reduceByKey(lambda c1, c2: c1 + c2) \
    .sortByKey()
rdd_count_by_api_method.take(5)

[('DELETE', 425), ('GET', 398), ('PATCH', 422), ('POST', 402), ('PUT', 413)]

In [13]:
# c-2) 분 단위 별 요청 횟수 출력.
def extract_hour_and_minute(row: List[str]) -> tuple[str, int]:
    timestamp = row[1].replace("[", "").replace("]", "")
    date_format = "%d/%b/%Y:%H:%M:%S"
    date_time_obj = datetime.strptime(timestamp, date_format)
    return f"{date_time_obj.hour}:{date_time_obj.minute}", 1

rdd_count_by_minute = parsed_log_rdd.map(extract_hour_and_minute) \
    .reduceByKey(lambda c1, c2: c1 + c2) \
    .sortByKey()
rdd_count_by_minute.take(5)

[('5:16', 790), ('5:17', 1270)]

In [14]:
# d) group by
# d-1) status code, api method 별 ip 리스트 출력
def extract_cols(row: List[str]) -> tuple[str, str, str]:
    ip = row[0]
    status_code = row[3]
    api_log = row[2].replace("\"", "")
    api_method = api_log.split(" ")[0]

    return status_code, api_method, ip

In [17]:
# reduceByKey 사용
parsed_log_rdd.map(extract_cols)\
    .map(lambda x: ((x[0], x[1]), x[2]))\
    .reduceByKey(lambda i1, i2: f"{i1},{i2}") \
    .map(lambda row: (row[0], row[1].split(","))) \
    .take(5)

# groupby 사용
# parsed_log_rdd.map(extract_cols) \
#     .map(lambda x: ((x[0], x[1]), x[2])) \
#     .groupByKey().mapValues(list)  
# 큰 데이터 셋에서 groupByKey를 하는 것은 성능적으로 좋지 않음.

[(('404', 'PATCH'),
  ['199.211.153.193',
   '37.40.113.2',
   '218.136.33.248',
   '109.159.6.150',
   '46.50.10.162',
   '237.56.33.66',
   '49.6.197.31',
   '245.75.205.95',
   '134.218.155.104',
   '99.169.72.186',
   '239.63.125.6',
   '172.221.140.80',
   '10.159.229.118',
   '30.164.49.160',
   '133.66.54.193',
   '39.25.215.72',
   '85.230.193.254',
   '87.232.54.105',
   '16.173.170.138',
   '146.86.193.13',
   '104.191.159.183',
   '101.138.249.37',
   '163.140.238.159',
   '35.232.156.24',
   '190.87.101.116',
   '63.149.213.232',
   '161.234.173.246',
   '52.28.187.86',
   '220.145.104.243',
   '121.94.192.236',
   '133.244.241.220',
   '228.143.232.104',
   '250.227.242.128',
   '138.212.74.115']),
 (('301', 'PATCH'),
  ['44.123.5.41',
   '115.35.93.146',
   '15.135.230.12',
   '118.86.206.220',
   '207.0.19.36',
   '207.100.94.56',
   '20.14.135.106',
   '137.155.111.167',
   '112.145.249.114',
   '141.225.137.101',
   '225.172.207.59',
   '29.144.114.89',
   '229.148.230