In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import os

# Mount Google Drive
drive.mount('/content/drive')

# Path of folder
folder_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_01 for all sites kmeans site id wise last 3 months /'

# Get list of all CSV files in the folder
csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# Initialize an empty list to hold the dataframes
dfs = []

# Loop through the CSV files and read each one into a dataframe
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all dataframes into a single dataframe
data = pd.concat(dfs, ignore_index=True)

# Sort by 'First Occurred On' in descending order (latest alarms first)
df = data.sort_values(by='First Occurred On', ascending=False)

# Convert 'First Occurred On' to datetime, handle errors and fill NaT with a default date
df['First Occurred On'] = pd.to_datetime(df['First Occurred On'], errors='coerce')

# Reference time
reference_time = pd.to_datetime('2024-08-01 22:00:00')

# Calculate the relative day index, handle NaT by filling with a large negative number
df['Relative Day Index'] = ((df['First Occurred On'] - reference_time).dt.total_seconds() // 86400).fillna(-9999).astype(int)

# Adjust the index to set the 08:00 to 08:00 of the next day as 0, and previous days as -1, -2, -3, etc.
df['Relative Day Index'] = df['Relative Day Index'].apply(lambda x: x if x < 0 else x)

# Print the updated dataframe
print(df.head(50000))

# Filter the data for HUAWEI and Access domain
#filtered_data = df[(df['Vendor'] == 'HUAWEI')]

# Retain only the specified columns
columns_to_keep = ['Site ID', 'Alarm Name', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index']
df_filtered = df[columns_to_keep]

# # Identify the five most recent Site IDs
# recent_site_ids = df_filtered['Site ID'].value_counts().index[:10]

# # Filter the dataframe to include only the most recent Site IDs
# df_filtered = df_filtered[df_filtered['Site ID'].isin(recent_site_ids)]

# Encode categorical columns except 'Relative Day Index'
categorical_columns = df_filtered.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df_filtered[column] = le.fit_transform(df_filtered[column].astype(str))
    label_encoders[column] = le

# Print the updated dataframe
df_filtered.head(100)


# Clustering to find common patterns
# Determine the optimal number of clusters using the elbow method
# wcss = []
# for i in range(1, 21):
# kmeans = KMeans(n_clusters=i, random_state=42)
# kmeans.fit(df_filtered)
# wcss.append(kmeans.inertia_)

# plt.plot(range(1, 21), wcss)
# plt.title('Elbow Method for Optimal k')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

# Fit the KMeans model with the optimal number of clusters
optimal_clusters = 20  # Increased number of clusters
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
df_filtered['Cluster'] = kmeans.fit_predict(df_filtered[['Alarm Name', 'Site ID']])

# # Now filter for individual site IDs after clustering
# for site_id in most_alarms_site_ids:
#     past_alarms1 = df_filtered[df_filtered['Site ID'] == site_id]

# Analyze the clusters
print(df_filtered.groupby('Cluster').mean())
print(df_filtered.groupby('Cluster').size())

# Print cluster details and alarm patterns
cluster_patterns = {}
for cluster in range(optimal_clusters):
    print(f"Cluster {cluster} details:")
    cluster_data = df_filtered[df_filtered['Cluster'] == cluster]
    alarm_patterns = cluster_data.groupby(['Alarm Name', 'Site ID']).size()
    cluster_patterns[cluster] = alarm_patterns
    print("Alarm Patterns:")
    print(alarm_patterns)
    print("\n")

# Plot the clusters
# sns.pairplot(df_filtered, hue='Cluster')
# plt.show()

# Inverse transform the scaled columns for interpretation
# scaler = MinMaxScaler()
# df_filtered[['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster']] = scaler.inverse_transform(df_filtered[['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster']])

# Convert back to original categorical values using label_encoders
# for column in label_encoders.keys():
#     df_filtered[column] = label_encoders[column].inverse_transform(df_filtered[column].round().astype(int))


# Re-scale the dataframe for LSTM
scaler = MinMaxScaler()
df_filtered[['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster']] = scaler.fit_transform(df_filtered[['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster']])

# Prepare the dataset for LSTM
def create_dataset(data, time_steps=1):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps)])
        y.append(data[i + time_steps])
    return np.array(X), np.array(y)

time_steps = 10
features = df_filtered[['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster']].values
X, y = create_dataset(features, time_steps)

# Reshape input to be [samples, time steps, features]
X = X.reshape(X.shape[0], time_steps, X.shape[2])

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_steps, X.shape[2])))
model.add(LSTM(50))
model.add(Dense(X.shape[2]))
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X, y, epochs=1, batch_size=2048, verbose=1)

# Predict for the next 7 days
predictions = []
current_batch = features[-time_steps:].reshape((1, time_steps, X.shape[2]))

for i in range(24 * 7):  # 24 hours * 7 days
    pred = model.predict(current_batch, verbose=0)[0]
    predictions.append(pred)
    current_batch = np.append(current_batch[:, 1:, :], [[pred]], axis=1)

# Inverse transform the predictions
predictions = scaler.inverse_transform(predictions)

# Create a DataFrame for predictions
relative_day_indices = np.arange(df_filtered['Relative Day Index'].max() + 1, df_filtered['Relative Day Index'].max() + 1 + 24 * 7)
pred_df = pd.DataFrame(predictions, columns=['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster'])
pred_df['Relative Day Index'] = relative_day_indices

# Convert back to original categorical values using label_encoders
for column in label_encoders.keys():
    pred_df[column] = pred_df[column].round().astype(int)
    pred_df[column] = label_encoders[column].inverse_transform(pred_df[column])

# Filter predictions for the next 7 days only
pred_df = pred_df[pred_df['Relative Day Index'] <= 10]

# List of desired hardware faults
desired_faults = [
    'Board Hardware Fault',
    'Radio Signaling Link Disconnected',
    'Cell Capability Degraded',
    'RF Unit CPRI Interface Error',
    'RF Unit VSWR Threshold Crossed',
    'RF Unit Maintenance Link Failure',
    'BBU CPRI Interface Error',
    'RF Unit External Power Supply Insufficient',
    'RF Unit TX Channel Gain Out of Range',
    'RF Unit Hardware Fault',
    'RF Unit DC Input Power Failure',
    'Monitoring Device Maintenance Link Failure',
    'Inter-Board Service Link Failure',
    'BBU Fan Stalled',
    'RF Unit Optical Module Fault',
    'Power Supply Insufficient for Multiple RF Units',
    'RF Unit AC Input Power Failure',
    'Cell RX Channel Interference Noise Power Unbalanced',
    'MAC Excessive Frame Error Rate',
    'Transmission Optical Interface Error',
    'BSL Fault',
    'GNSS Antenna Fault',
    'RF Unit RX Channel RTWP/RSSI Too Low',
    'RF Out of Service',
    'RF Unit Clock Problem',
    'Inter-System Communication Failure',
    'RF Unit Baseband Running Error',
    'BBU Board Maintenance Link Failure',
    'GNSS Locked Satellites Insufficient',
    'Board Temperature Unacceptable',
    'RF Unit Temperature Unacceptable',
    'RHUB CPRI Interface Error',
    'Inter-Board CANBUS Communication Failure',
    'Monitoring Device Power Supply Problem',
    'RF Unit Software Program Error',
    'RF Unit Input Power Out of Range',
    'RHUB-pRRU CPRI Interface Error',
    'RHUB Unit AC Input Power Failure',
    'NR DU Cell TRP Unavailable',
    'BBU CPRI Optical Module Fault',
    'Board Input Voltage Out of Range',
    'Board Powered Off',
    'Board Unavailable',
    'BBU CPRI Line Rate Negotiation Abnormal',
    'Transmission Optical Module Fault'
]

# Filter predictions to include only the desired hardware faults
pred_df = pred_df[pred_df['Alarm Name'].isin(desired_faults)]

# Display sample output of predictions with cluster details
print(pred_df.head(7))

# Print the cluster patterns for the predicted alarms
for index, row in pred_df.iterrows():
    cluster = int(row['Cluster'])
    print(f"Predicted Alarm for Relative Day Index {row['Relative Day Index']}:")
    print(f"Cluster {cluster} Alarm Patterns:")
    alarm_names = cluster_patterns[cluster].index.get_level_values(0).map(lambda x: label_encoders['Alarm Name'].inverse_transform([x])[0])
    print(pd.Series(cluster_patterns[cluster].values, index=alarm_names))
    print("\n")

# Print cluster-wise alarm patterns
for cluster in range(optimal_clusters):
    print(f"Cluster {cluster} Alarm Patterns:")
    alarm_names = cluster_patterns[cluster].index.get_level_values(0).map(lambda x: label_encoders['Alarm Name'].inverse_transform([x])[0])
    print(pd.Series(cluster_patterns[cluster].values, index=alarm_names))
    print("\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file)


            Last Occurred On                   Alarm Source Alarm Severity  \
3864384  2024-08-01 23:03:12               Badulla-BD0001-L          Major   
3864385  2024-08-01 23:02:44  MC1_Mundal_South-PU0271-BDGHP       Critical   
3864386  2024-08-01 23:02:36                         vUSN02          Major   
3864387  2024-08-01 23:02:30                    PILI-AMF-01          Major   
3864388  2024-08-01 23:02:27                    PILI_VUSN01          Major   
...                      ...                            ...            ...   
47722    2024-07-31 12:06:24                       U2020ACC          Major   
47723    2024-07-31 12:06:23                       U2020ACC          Major   
47724    2024-07-31 12:06:21                    PILI-AMF-01          Major   
47725    2024-07-31 12:06:17                    PILI_VUSN01          Major   
47726    2024-07-31 12:06:17                      KALA_SGSN          Major   

                                Alarm Name Alarm ID  \
3864384 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[column] = le.fit_transform(df_filtered[column].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[column] = le.fit_transform(df_filtered[column].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[column] = le.fit_transform(df_filtered[column].as

             Site ID  Alarm Name    Vendor    Domain  Device Type  \
Cluster                                                             
0        4122.751526  820.287212  0.000000  3.322170    27.331960   
1         634.580214  573.160241  0.000000  2.336845    16.958291   
2        2986.207615  532.623538  0.000013  3.393552    20.089433   
3        4102.531612  184.027275  0.000008  2.158018    22.268537   
4        1339.588648  148.601242  0.000131  1.266130    25.849728   
5        3612.966733  783.011030  0.000000  2.663943    21.490390   
6         228.961570  192.636514  0.000030  1.605603    24.566036   
7        2978.795190  162.559887  0.000011  1.441736    25.436575   
8        1360.604562  790.787002  0.000000  2.270422    22.083213   
9         887.059479  581.411333  0.000009  2.304367    16.580215   
10        647.387287  852.052656  0.000000  2.211431    20.846583   
11       1866.080935  821.502907  0.000000  2.591874    17.640964   
12       1840.599678  199.605572  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster']] = scaler.fit_transform(df_filtered[['Alarm Name', 'Site ID', 'Vendor', 'Domain', 'Device Type', 'Relative Day Index', 'Cluster']])
  super().__init__(**kwargs)


[1m1897/1897[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 175ms/step - loss: 0.0615
              Alarm Name Site ID  Vendor Domain Device Type  \
1  RF Unit Clock Problem  KU0445  HUAWEI   NFVI        UPCC   

   Relative Day Index   Cluster  
1                 3.0  3.260628  
Predicted Alarm for Relative Day Index 3.0:
Cluster 3 Alarm Patterns:
Alarm Name
A Certificate Is About to Expire      4
A Service Instance Is Not Ready     579
A Weak Algorithm in SNMP Is Used      1
AC Surge Protector Fault              2
ACPI is in the soft-off state         1
                                   ... 
External Clock Reference Problem      6
External Clock Reference Problem      9
External Clock Reference Problem      3
External Clock Reference Problem      2
External Clock Reference Problem      2
Length: 1646, dtype: int64


Cluster 0 Alarm Patterns:
Alarm Name
NTP Synchronization Failure                        4
NTP connection failed                             30
NTP service abno

In [2]:
# Save the predictions to a CSV file
pred_df.to_csv('/content/drive/MyDrive/Colab Notebooks/2024_08_01 LSTM_predictions.csv', index=False)


In [3]:
import joblib

# File paths
le_alarm_name_path = '/content/drive/MyDrive/Colab Notebooks/label_encoders/le_alarm_name.pkl'
le_domain_path = '/content/drive/MyDrive/Colab Notebooks/label_encoders/le_domain.pkl'
le_site_id_path = '/content/drive/MyDrive/Colab Notebooks/label_encoders/le_site_id.pkl'
le_vendor_path = '/content/drive/MyDrive/Colab Notebooks/label_encoders/le_vendor.pkl'
le_corrective_action_path = '/content/drive/MyDrive/Colab Notebooks/label_encoders/le_corrective_action.pkl'

# Load the label encoders
try:
    le_alarm_name = joblib.load(le_alarm_name_path)
    le_domain = joblib.load(le_domain_path)
    le_site_id = joblib.load(le_site_id_path)
    le_vendor = joblib.load(le_vendor_path)
    le_corrective_action = joblib.load(le_corrective_action_path)
    print("Label encoders loaded successfully.")
except FileNotFoundError as e:
    print(f"FileNotFoundError: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


Label encoders loaded successfully.


In [4]:
# Ensure necessary libraries are installed
!pip install flaml pandas scikit-learn joblib

# Import necessary libraries
import pandas as pd
from flaml import AutoML
import joblib

# Load the CSV file with the prediction data
csv_file_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_01 LSTM_predictions.csv'
prediction_df = pd.read_csv(csv_file_path)

# Display the loaded data
print(prediction_df.head(10))


# Encode the prediction data using the same encoders
prediction_df['Alarm Name'] = le_alarm_name.transform(prediction_df['Alarm Name'])
prediction_df['Domain'] = le_domain.transform(prediction_df['Domain'])

# Define features (X) for prediction
X_prediction = prediction_df[['Alarm Name', 'Domain']]

# Load the saved FLAML model
saved_model_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_01 FLAML_model.pkl'
automl = joblib.load(saved_model_path)

# Predict the Corrective Action for the loaded prediction data
predictions_corrective_actions = automl.predict(X_prediction)

# Decode the predicted labels back to the original categories
predicted_corrective_actions = le_corrective_action.inverse_transform(predictions_corrective_actions)

# Add predictions to the DataFrame
prediction_df['Predicted Corrective Action'] = predicted_corrective_actions

# Display the predictions
prediction_df.head(10)

# Optionally, save the predictions to a CSV file
output_file_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_01_predictions_with_corrective_actions.csv'
prediction_df.to_csv(output_file_path, index=False)





Collecting flaml
  Downloading FLAML-2.1.2-py3-none-any.whl.metadata (15 kB)
Downloading FLAML-2.1.2-py3-none-any.whl (296 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flaml
Successfully installed flaml-2.1.2


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Empty DataFrame
Columns: [Alarm Name, Site ID, Vendor, Domain, Device Type, Relative Day Index, Cluster]
Index: []


ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by SimpleImputer.