In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [4]:
data = pd.read_csv("data/insights_metrics.csv")

In [8]:
# Handle missing values
data.fillna(data.mean(), inplace=True)

# Normalize data
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [9]:
# Calculate correlation matrix
corr_matrix = data_scaled.corr()

# Select highly correlated features
highly_correlated_features = corr_matrix.columns[abs(corr_matrix) > 0.8]

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
# Perform K-Means clustering
kmeans = KMeans(n_clusters=3)
kmeans.fit(data_scaled)

# Evaluate clusters using Silhouette Score
silhouette_avg = silhouette_score(data_scaled, kmeans.labels_)
print("Silhouette Score:", silhouette_avg)

In [None]:
# Perform PCA
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_scaled)

# Visualize clusters using PCA
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=kmeans.labels_)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("K-Means Clustering with PCA")
plt.show()

In [None]:
# Perform Isolation Forest
from sklearn.ensemble import IsolationForest

iforest = IsolationForest(contamination=0.1)
iforest.fit(data_scaled)

# Identify anomalies
anomalies = iforest.predict(data_scaled)

In [None]:
# Visualize anomalies using scatter plot
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=anomalies)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Anomaly Detection using Isolation Forest")
plt.show()