In [None]:

!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct

spark = (SparkSession.builder.appName("itoss-ai")
         .config("spark.jars","./postgresql-42.7.3.jar")
         .config("spark.sql.debug.maxToStringFields", "1024")
         .getOrCreate())

In [None]:
import os

# JDBC connection settings for the PostgreSQL source database.
# Each value can be overridden through an environment variable so the notebook
# can be pointed at another database without editing (or committing) secrets:
#   ITOSS_JDBC_URL, ITOSS_DB_USER, ITOSS_DB_PASSWORD
# The literal fallbacks keep the notebook's previous behaviour when the
# variables are not set.
jdbcUrl = os.environ.get(
    "ITOSS_JDBC_URL", "jdbc:postgresql://localhost:5432/itossprd280824"
)
properties = {
    "driver": "org.postgresql.Driver",
    "user": os.environ.get("ITOSS_DB_USER", "itoss"),
    # SECURITY: the fallback password is a local-development default only.
    # Set ITOSS_DB_PASSWORD for any real database and remove this literal
    # before sharing the notebook.
    "password": os.environ.get("ITOSS_DB_PASSWORD", "admin"),
}

In [92]:
# Load one row per `ct` record over JDBC. The parenthesised SELECT is passed as
# the JDBC "table", so the joins and aggregation are part of the query sent to
# the database.
#
# Columns produced:
# - the ct's own columns (id, key, environment, state, name and its *_id
#   foreign keys);
# - environment_group: 'production' when environment = 'PRODUCTION',
#   otherwise 'nonProduction';
# - display names looked up from ct_type, company, workgroup, location,
#   monitoring_profile and collector;
# - users_id: ARRAY_AGG of tennant_users.users_id (cast to BIGINT) reached
#   through tennant_cts -> tennant_users.
#
# Every join is an INNER JOIN, so a ct with no matching row in any joined
# table (including the tennant tables) is absent from the result.
# The GROUP BY lists every non-aggregated column so that ARRAY_AGG collapses
# the per-user rows back to one row per ct.
cts_df = spark.read.jdbc(url=jdbcUrl, table="""
(SELECT
                     c.id,
                     c.key,
                     c.environment,
                     c.state,
                     c.name,
                     c.collector_id,
                     c.company_id,
                     c.contact_id,
                     c.location_id,
                     c.monitoring_profile_id,
                     c.support_user_id,
                     c.type_id,
                     c.workgroup_id,
                     CASE
                         WHEN environment = 'PRODUCTION' THEN 'production'
                         ELSE 'nonProduction'
                     END AS environment_group,
                     ct.name AS type_name,
                     ct.type_path AS type_path,
                     co.name AS company_name,
                     w.name AS workgroup_name,
                     l.name AS location_name,
                     mp.name AS monitoring_profile_name,
                     cl.name AS collector_name,
                     ARRAY_AGG(tu.users_id::BIGINT) AS users_id
                 FROM
                     ct c
                 INNER JOIN
                     workgroup w ON c.workgroup_id = w.id
                 INNER JOIN
                     location l ON c.location_id = l.id
                 INNER JOIN
                     ct_type ct ON c.type_id = ct.id
                 INNER JOIN
                     company co ON c.company_id = co.id
                 INNER JOIN
                     collector cl ON c.collector_id = cl.id
                 INNER JOIN
                     monitoring_profile mp ON c.monitoring_profile_id = mp.id
                 INNER JOIN
                     tennant_cts tc ON c.id = tc.cts_id
                 INNER JOIN
                     tennant_users tu ON tc.tennant_id = tu.tennant_id
                 GROUP BY
                     c.id,
                     c.key,
                     c.environment,
                     c.state,
                     c.name,
                     c.collector_id,
                     c.company_id,
                     c.contact_id,
                     c.location_id,
                     c.monitoring_profile_id,
                     c.support_user_id,
                     c.type_id,
                     c.workgroup_id,
                     ct.name,
                     ct.type_path,
                     co.name,
                     w.name,
                     l.name,
                     mp.name,
                     cl.name
                 ) as subquery""", properties=properties)

In [102]:
from pyspark.sql import functions

# Narrow the loaded cts to those in state 'OPERATIONS' whose users_id array
# contains user id 200000, then keep only the two columns used downstream.
# NOTE: this rebinds `cts_df`, and the select() drops `state` and `users_id`,
# so the cell cannot be re-run without first re-running the cell that loads
# `cts_df`.
cts_df = (
    cts_df
    .filter("state = 'OPERATIONS'")
    .filter(functions.array_contains(functions.col("users_id"), 200000))
    .select("id", "environment_group")
)

In [103]:
# Status rows from ct_status_delta (timestamp, ct_id, status), limited by the
# query itself to rows newer than CURRENT_TIMESTAMP minus 7 months.
status_delta_df = spark.read.jdbc(
    url=jdbcUrl,
    table="""
        (SELECT timestamp, ct_id, status FROM ct_status_delta where timestamp > CURRENT_TIMESTAMP - INTERVAL '7 month') as subquery
        """,
    properties=properties,
)

In [104]:
# Rows of ct_status, reshaped to line up with the delta rows:
# - `timestamp` is the row's last_status_change,
# - `ct_id` is the row's id,
# - `status` is 'down' when the boolean `down` is true, otherwise 'up'.
# modified_at (as last_status), last_status_change and down (as status_down)
# are also kept under their own names.
status_df = spark.read.jdbc(
    url=jdbcUrl,
    table="""
        (SELECT modified_at AS last_status, last_status_change, down AS status_down, 
        last_status_change AS timestamp, id AS ct_id, CASE WHEN down THEN 'down' ELSE 'up' END AS status 
        FROM ct_status) as subquery
        """,
    properties=properties,
)

In [105]:
# Combine the status deltas with the ct_status rows into a single
# (timestamp, ct_id, status) set of events.
# - union() matches columns by position, so status_df is first projected to
#   the same column order as status_delta_df.
# - distinct() runs BEFORE orderBy(): de-duplication redistributes the rows,
#   so a sort applied ahead of it is not guaranteed to survive. Sorting last
#   makes the (ct_id, timestamp) ordering hold for the final result.
# - the duplicated "ct_id" sort key of the earlier version was redundant and
#   has been dropped.
all_status_df = (
    status_delta_df
    .union(status_df.select("timestamp", "ct_id", "status"))
    .distinct()
    .orderBy("ct_id", "timestamp")
)

In [106]:
# Attach the ct_status row to each filtered ct (inner join on ct id), keeping
# only the columns needed for the status counts below.
# NOTE(review): this joins against status_df only; all_status_df is not used
# here - confirm that is intended.
joined_ct_status = (
    cts_df
    .join(status_df, on=cts_df.id == status_df.ct_id, how="inner")
    .select("ct_id", "environment_group", "status", "timestamp")
)

In [107]:

# Number of joined cts per status value.
# DataFrame.show() only prints and returns None, so the aggregated DataFrame
# is bound first and displayed in a separate step; previously
# `total_status_now` always ended up as None.
total_status_now = joined_ct_status.groupBy("status").count()
total_status_now.show()



+------+-----+
|status|count|
+------+-----+
|  down|  922|
|    up|10353|
+------+-----+



In [86]:
from pyspark.sql.functions import expr, col, when, lit, row_number
from pyspark.sql.window import Window

# Cut-off instants, each expressed relative to the query's current_timestamp().
last_hour = expr("current_timestamp() - INTERVAL 1 HOUR")
last_day = expr("current_timestamp() - INTERVAL 1 DAY")
last_week = expr("current_timestamp() - INTERVAL 1 WEEK")
last_month = expr("current_timestamp() - INTERVAL 1 MONTH")

# Bucket each row by how recent its `timestamp` is. The when() branches are
# tried in order and the first match wins, so each branch only needs its lower
# bound: a row reaching a later branch has already failed the earlier, more
# recent cut-offs.
#   newer than 1 hour          -> "now"
#   1 hour .. 1 day old        -> "last_hour"
#   1 day .. 1 week old        -> "last_day"
#   1 week .. 1 month old      -> "last_week"
#   anything else (incl. NULL) -> "last_month"
# NOTE(review): each label names the cut-off the row is older than, not the
# window it falls in - confirm this naming is intended.
periodCol = (
    when(col("timestamp") > last_hour, lit("now"))
    .when(col("timestamp") > last_day, lit("last_hour"))
    .when(col("timestamp") > last_week, lit("last_day"))
    .when(col("timestamp") > last_month, lit("last_week"))
    .otherwise(lit("last_month"))
)

# Tag every joined row with its bucket.
categorizedData = joined_ct_status.withColumn("period", periodCol)

In [91]:
# Preview up to 10 rows from the "last_hour" bucket without truncating values.
categorizedData.where(col("period") == "last_hour").show(n=10, truncate=False)

+------+-----------------+------+-----------------------+---------+
|ct_id |environment_group|status|timestamp              |period   |
+------+-----------------+------+-----------------------+---------+
|216482|production       |up    |2024-08-27 19:09:28.123|last_hour|
|213810|production       |up    |2024-08-28 05:22:31.729|last_hour|
|204155|production       |up    |2024-08-28 05:22:31.816|last_hour|
|213673|production       |up    |2024-08-28 03:22:33.624|last_hour|
|210040|production       |up    |2024-08-28 05:22:31.725|last_hour|
|200620|production       |up    |2024-08-27 23:58:48.427|last_hour|
|204104|production       |up    |2024-08-28 05:22:31.544|last_hour|
|209117|production       |up    |2024-08-28 00:48:47.553|last_hour|
|209056|production       |up    |2024-08-28 10:06:45.664|last_hour|
|212834|production       |up    |2024-08-28 11:03:31.324|last_hour|
+------+-----------------+------+-----------------------+---------+
only showing top 10 rows



In [51]:

# Number each ct's rows from newest to oldest timestamp, so that row_number 1
# is the most recent row for that ct_id.
window_spec = Window.partitionBy("ct_id").orderBy(col("timestamp").desc())
df_with_rn = categorizedData.withColumn(
    "row_number",
    row_number().over(window_spec),
)

# Keep only the newest row per ct_id.
df_filtered = df_with_rn.where(col("row_number") == 1)

# Count the remaining rows per (period, status) pair.
df_filtered.groupBy("period", "status").count().show()



+----------+------+-----+
|    period|status|count|
+----------+------+-----+
| last_week|    up| 3365|
|  last_day|  down|   36|
| last_hour|  down|   12|
|  last_day|    up|  698|
|last_month|    up| 6081|
| last_week|  down|  116|
| last_hour|    up|  211|
|last_month|  down|  758|
+----------+------+-----+



In [53]:
# Show the latest-per-ct rows that fall in the "now" bucket.
df_filtered.where("period = 'now'").show()

                                                                                

+---+---+-----------+-----+----+------------+----------+----------+-----------+---------------------+---------------+-------+------------+-----------------+---------+---------+------------+--------------+-------------+-----------------------+--------------+--------+---------+-----+------+------+----------+
| id|key|environment|state|name|collector_id|company_id|contact_id|location_id|monitoring_profile_id|support_user_id|type_id|workgroup_id|environment_group|type_name|type_path|company_name|workgroup_name|location_name|monitoring_profile_name|collector_name|users_id|timestamp|ct_id|status|period|row_number|
+---+---+-----------+-----+----+------------+----------+----------+-----------+---------------------+---------------+-------+------------+-----------------+---------+---------+------------+--------------+-------------+-----------------------+--------------+--------+---------+-----+------+------+----------+
+---+---+-----------+-----+----+------------+----------+----------+---------