<a href="https://colab.research.google.com/github/emelyanov777/test/blob/main/final_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Prerequisite: if Faker is not installed in this kernel, run `%pip install faker` once.

In [2]:
import csv
from faker import Faker
import random

# Generate a synthetic web-server access log and store it as a CSV file.
#
# Both random sources (Faker's shared generator and the stdlib `random`
# module) are seeded so that re-running the notebook draws the same sequence
# of values. Timestamps come from `date_time_this_year()`, so they may still
# differ between runs made at different points in time.
SEED = 42

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

# newline='' leaves line-ending handling to the csv module; the explicit
# encoding keeps the file independent of the platform's default locale.
with open(file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row: column names later picked up by the CSV reader.
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [4]:
# Obtain the active Spark session, creating one with default settings if
# none exists yet in this kernel.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [5]:
# Load the generated log from the same path the generator cell wrote to
# (`file_path`), instead of a hard-coded Colab-only absolute path.
# header=True takes column names from the first row; inferSchema=True lets
# Spark derive column types from the data.
df = spark.read.csv(file_path, header=True, inferSchema=True)

In [6]:
# Rank client IPs by the number of requests each one made and show the ten
# busiest. The grouping key is `ip` (not the day of the timestamp), which is
# what the heading announces.
print('Top 10 active IP addreses')
(
    df.groupBy('ip')
    .agg(count('*').alias('request_count'))
    .orderBy('request_count', ascending=False)
    .limit(10)
    .show()
)

Top 10 active IP addreses
+--------------+-------------+
|            ip|request_count|
+--------------+-------------+
| 175.67.147.18|            1|
|212.50.148.101|            1|
| 63.253.52.193|            1|
|173.92.211.179|            1|
|25.251.126.244|            1|
| 217.91.65.246|            1|
|197.192.42.149|            1|
| 99.234.18.124|            1|
| 162.2.161.224|            1|
| 33.106.72.223|            1|
+--------------+-------------+



In [7]:
# Count the requests for each HTTP method and display the totals.
print('Request count by HTTP method:')
requests_per_method = df.groupBy('method').agg(
    count('method').alias('method_count')
)
requests_per_method.show()

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       25063|
|DELETE|       25155|
|   PUT|       24942|
|   GET|       24840|
+------+------------+



In [8]:
# Count the rows whose response code is 404 and report the total.
is_not_found = df['response_code'] == 404
k = df.where(is_not_found).count()
print(f'Number of 404 response codes: {k}')

Number of 404 response codes: 24949


In [17]:
# Sum the response sizes per calendar day (timestamp formatted as
# yyyy-MM-dd) and show the totals in chronological order.
daily_traffic = (
    df.withColumn("date", date_format("timestamp", "yyyy-MM-dd"))
    .groupBy("date")
    .sum("response_size")
    .withColumnRenamed("sum(response_size)", "total_response_size")
    .orderBy("date")
)
daily_traffic.show()

+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|            2343857|
|2025-01-02|            2477297|
|2025-01-03|            2345592|
|2025-01-04|            2398211|
|2025-01-05|            2534128|
|2025-01-06|            2456092|
|2025-01-07|            2531894|
|2025-01-08|            2234878|
|2025-01-09|            2231241|
|2025-01-10|            2503333|
|2025-01-11|            2268805|
|2025-01-12|            2455301|
|2025-01-13|            2349428|
|2025-01-14|            2531524|
|2025-01-15|            2522620|
|2025-01-16|            2640408|
|2025-01-17|            2475075|
|2025-01-18|            2401956|
|2025-01-19|            2415846|
|2025-01-20|            2457923|
+----------+-------------------+
only showing top 20 rows



In [18]:
# Shut down the Spark session now that all queries have run; cells that use
# `spark` or `df` cannot be re-run after this without recreating the session.
spark.stop()