# AIOps Demonstration Notebook


In [1]:
# Step 1: Setup Environment
!pip install numpy pandas matplotlib seaborn scikit-learn statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.2-cp310-cp310-macosx_10_9_x86_64.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting patsy>=0.5.6
  Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.6 statsmodels-0.14.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Step 2: Load and Explore Data
import pandas as pd
import numpy as np

# Load your datasets
server_metrics = pd.read_csv("data/netflix_operational_metrics.csv")
business_metrics = pd.read_csv("data/netflix_business_metrics.csv")

# Display basic information and statistics
print(server_metrics.info())
print(server_metrics.describe())

print(business_metrics.info())
print(business_metrics.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10080 entries, 0 to 10079
Data columns (total 6 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Timestamp                      10080 non-null  object 
 1   CPU Utilization (%)            10080 non-null  float64
 2   Memory Utilization (%)         10080 non-null  float64
 3   Network I/O Throughput (Mbps)  10080 non-null  float64
 4   Disk I/O Throughput (MB/s)     10080 non-null  float64
 5   Server Configuration           10080 non-null  object 
dtypes: float64(4), object(2)
memory usage: 472.6+ KB
None
       CPU Utilization (%)  Memory Utilization (%)  \
count         10080.000000            10080.000000   
mean             69.975567               60.169587   
std              10.973602               10.675025   
min               0.000000                0.000000   
25%              62.757592               53.102970   
50%              69.894974   

In [4]:
# Step 3: Data Preprocessing
# Example: Fill missing values with the mean
server_metrics.fillna(
    server_metrics.select_dtypes(include="number").mean(), inplace=True
)
business_metrics.fillna(
    business_metrics.select_dtypes(include="number").mean(), inplace=True
)


# Normalize/Scale Data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
server_metrics_scaled = scaler.fit_transform(
    server_metrics.select_dtypes(include="number")
)

In [13]:
# Step 4: Data Analysis and Insights

# Ensure timestamps are aligned
server_metrics["Timestamp"] = pd.to_datetime(server_metrics["Timestamp"])
business_metrics["Timestamp"] = pd.to_datetime(business_metrics["Timestamp"])

# Merge dataframes on Timestamp
merged_data = pd.merge(
    server_metrics, business_metrics, on="Timestamp", suffixes=("_server", "_business")
)

# Clustering Analysis
from sklearn.cluster import KMeans

# Selecting numeric columns for clustering (excluding Timestamp and non-numeric columns)
numeric_columns = merged_data.select_dtypes(include=[float]).columns
server_metrics_scaled = scaler.fit_transform(merged_data[numeric_columns])

# Example: KMeans clustering
kmeans = KMeans(n_clusters=3)
server_clusters = kmeans.fit_predict(server_metrics_scaled)

# Add cluster labels to the merged dataframe
merged_data["Cluster"] = server_clusters

# Correlation Analysis
# Select only numeric columns for correlation
numeric_columns_server = [
    col
    for col in merged_data.columns
    if "_server" in col and merged_data[col].dtype in [np.float64, np.int64]
]
numeric_columns_business = [
    col
    for col in merged_data.columns
    if "_business" in col and merged_data[col].dtype in [np.float64, np.int64]
]

# Compute the correlation matrix with all numeric columns from business metrics
correlation_matrix = merged_data[numeric_columns_server].corrwith(
    merged_data[numeric_columns_business].mean()
)
print(correlation_matrix)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Example: Heatmap of correlations
plt.figure(figsize=(10, 8))
sns.heatmap(merged_data[numeric_columns_server].corr(), annot=True, cmap="coolwarm")
plt.title("Correlation between Server Metrics")
plt.show()

# Example: Scatter plot of clusters
plt.figure(figsize=(10, 8))
plt.scatter(
    merged_data["CPU Utilization (%)_server"],
    merged_data["Memory Utilization (%)_server"],
    c=merged_data["Cluster"],
    cmap="viridis",
)
plt.xlabel("CPU Utilization (%)")
plt.ylabel("Memory Utilization (%)")
plt.title("Server Metrics Clustering")
plt.show()

# Example: Scatter plot for correlation
plt.figure(figsize=(10, 8))
plt.scatter(
    merged_data["CPU Utilization (%)_server"],
    merged_data["Customer Satisfaction (CSAT)_business"],
)
plt.xlabel("CPU Utilization (%)")
plt.ylabel("Customer Satisfaction (CSAT)")
plt.title("CPU Utilization vs Customer Satisfaction")
plt.show()

# Example: Scatter plot for correlation
plt.figure(figsize=(10, 8))
plt.scatter(
    merged_data["Memory Utilization (%)_server"],
    merged_data["Customer Satisfaction (CSAT)_business"],
)
plt.xlabel("Memory Utilization (%)")
plt.ylabel("Customer Satisfaction (CSAT)")
plt.title("Memory Utilization vs Customer Satisfaction")
plt.show()

Series([], dtype: float64)


ValueError: zero-size array to reduction operation fmin which has no identity

<Figure size 1000x800 with 0 Axes>

In [5]:
# Step 5: Optimization Recommendations
# Example: Simple recommendations based on cluster analysis
for cluster in server_metrics['Cluster'].unique():
    cluster_data = server_metrics[server_metrics['Cluster'] == cluster]
    print(f"Cluster {cluster} Recommendation: Optimize {cluster_data.columns[cluster_data.mean().idxmin()]} for better performance.")

# Step 6: Summary and Conclusion
## Summary and Recommendations

1. **Clustering Analysis**:
   - Identified X clusters in server metrics data.
   - Cluster 0 shows high usage of CPU and memory, indicating potential for optimization in resource allocation.

2. **Correlation Analysis**:
   - Found significant correlation between `CPU_Usage` and `Revenue`, suggesting that optimizing `CPU_Usage` could improve `Revenue`.

3. **Recommendations**:
   - Focus on optimizing CPU usage in Cluster 0.
   - Investigate further into CPU_Usage to enhance Revenue performance.
