In [23]:
import csv
import random
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

In [2]:
fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])
    
    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)
        
        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [4]:
spark = SparkSession.builder \
    .master("local[2]") \
    .appName("MyLocalSparkApp") \
    .config("spark.driver.host", "127.0.0.1") \
    .getOrCreate()

In [5]:
df = spark.read.csv('web_server_logs.csv', header=True, inferSchema=True)

In [6]:
df.printSchema()

root
 |-- ip: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- method: string (nullable = true)
 |-- url: string (nullable = true)
 |-- response_code: integer (nullable = true)
 |-- response_size: integer (nullable = true)



In [8]:
df.na.drop().count() == df.count()

True

In [19]:
df.groupBy('ip').agg({'ip':'count'}).orderBy(col('count(ip)').desc()) \
  .withColumnRenamed('count(ip)', 'request_count').show(10)

+--------------+-------------+
|            ip|request_count|
+--------------+-------------+
|  139.5.91.252|            2|
|  52.158.83.23|            1|
| 65.201.178.68|            1|
| 212.3.209.127|            1|
|  152.31.3.179|            1|
|  155.91.95.29|            1|
| 217.79.198.91|            1|
| 135.186.20.41|            1|
|129.105.233.49|            1|
|19.160.216.242|            1|
+--------------+-------------+
only showing top 10 rows


In [20]:
df.groupBy('method').agg({'method': 'count'}).withColumnRenamed('count(method)', 'method_count').show(10)

+------+------------+
|method|method_count|
+------+------------+
|  POST|       24701|
|DELETE|       25121|
|   PUT|       24982|
|   GET|       25196|
+------+------------+



In [21]:
df.filter(col('response_code') == 404).count()

25083

In [26]:
df.groupBy(to_date(col('timestamp'))).agg({'response_size': 'sum'}) \
   .withColumnRenamed('to_date(timestamp)', 'date') \
   .withColumnRenamed('sum(response_size)', 'total_response_size').show()

+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-02-16|            2833020|
|2025-06-08|            2988139|
|2025-02-15|            2584440|
|2025-02-01|            2876680|
|2025-03-23|            2839793|
|2025-04-17|            2778400|
|2025-02-05|            2699617|
|2025-02-13|            2777407|
|2025-05-13|            2958978|
|2025-05-15|            2801883|
|2025-03-16|            2769734|
|2025-02-06|            2839418|
|2025-01-09|            2825174|
|2025-02-12|            2649551|
|2025-05-30|            2946840|
|2025-04-21|            2764348|
|2025-06-05|            2856733|
|2025-03-03|            2766850|
|2025-02-22|            2917399|
|2025-01-14|            2740042|
+----------+-------------------+
only showing top 20 rows
