In [1]:
!pip install faker
!pip install pyspark


Collecting faker
  Downloading Faker-26.3.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-26.3.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-26.3.0
Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=69038c15df4a9448c406cbcddfcf1537ceab94a6087db16b314269db5c899c00
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packa

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("IP_analytics").getOrCreate()

df = spark.read.csv("/content/web_server_logs.csv", header=True)
df = df.withColumn("timestamp", to_date(col("timestamp")))
df.createOrReplaceTempView("df")

most_active_ip = spark.sql("""
SELECT ip, count(*) as request_count
FROM df
GROUP BY ip
ORDER BY request_count desc
LIMIT 10
""")

count_by_method = spark.sql("""
SELECT method, count(method) as method_count
FROM df
GROUP BY method
ORDER BY method_count DESC
""")

count_404 = spark.sql("""
SELECT count(response_code) as count
FROM df
WHERE response_code = 404
GROUP BY response_code
""").collect()

sum_by_date = spark.sql("""
SELECT timestamp as date, sum(response_size) OVER (PARTITION BY timestamp) as total_response_size
FROM df
ORDER BY date
""")
sum_by_date = sum_by_date.withColumn("total_response_size", sum_by_date["total_response_size"].cast(IntegerType()))

print("Top 10 active IP addresses:")
most_active_ip.show()
print("Request count by HTTP method")
count_by_method.show()
print("Number of 404 response codes:", count_404[-1][-1])
print("Total response size by day:")
sum_by_date.show()
spark.stop()

Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
|  198.15.43.225|            2|
| 221.91.221.118|            2|
|   64.88.67.182|            2|
| 55.169.155.129|            2|
|183.107.197.133|            2|
|176.156.215.151|            1|
|   46.81.111.63|            1|
| 129.16.217.126|            1|
|  199.182.41.82|            1|
| 49.154.186.213|            1|
+---------------+-------------+

Request count by HTTP method
+------+------------+
|method|method_count|
+------+------------+
|   PUT|       25097|
|  POST|       25084|
|DELETE|       25008|
|   GET|       24811|
+------+------------+

Number of 404 response codes: 24894
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2024-01-01|            2402878|
|2024-01-01|            2402878|
|2024-01-01|            2402878|
|2024-01-01|            2402878|
|2024-01-01|          