In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import os

In [None]:
# Mount Google Drive
from google.colab import drive
import os
import pandas as pd

# Mount Google Drive so the alarm CSV exports are reachable from the runtime
drive.mount('/content/drive')

# Path of folder
folder_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_21/'

# Collect every CSV file in the folder. os.listdir() returns names in
# arbitrary order, so sort the paths to keep the row order of the combined
# dataframe reproducible between runs.
csv_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.csv')
)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; stop early with a message that names the folder instead.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Read each CSV file into its own dataframe
dfs = [pd.read_csv(file) for file in csv_files]

# Concatenate all dataframes into a single dataframe
alarm_data = pd.concat(dfs, ignore_index=True)

# Display the first few rows
alarm_data.head()


Mounted at /content/drive


Unnamed: 0,Last Occurred On,Alarm Source,Alarm Severity,Alarm Name,Alarm ID,Alarm Location Info,Site Name,Cleared On,First Occurred On,Clearance Status,Ticket ID,Site ID,Vendor,Domain,Device Type
0,2024-06-23 02:50:47,AKBAR_UGW,Major,IPPM session fault,2620,"Local IP address=172.27.24.249, VPN=VPN_S1, Pe...",Dummy,2024-06-23 02:52:32,2024-06-23 02:50:47,Cleared,,VV0000,HUAWEI,Core,Unknown
1,2024-06-23 02:50:47,U2020ACC,Major,Parallel Alarm Exceeds the Limit,832,"RuleID=1980, Source Alarm Information=""Alarm N...",,2024-06-23 03:09:40,2024-06-23 02:50:47,Cleared,,,HUAWEI,Others,
2,2024-06-23 02:50:46,Poruwadanda-KL0127-L,Major,User Plane Fault,25954,Service Type=X2,Poruwadanda,2024-06-23 03:37:01,2024-06-23 02:50:46,Cleared,,KL0127,HUAWEI,Access,eNodeB
3,2024-06-23 02:50:45,Kananvila-KL0116-L,Critical,eNodeB S1 Control Plane Transmission Interruption,29213,"eNodeB Function Name=Kananvila-KL0116-L, CN Op...",Kananvila,2024-06-23 03:26:12,2024-06-23 02:50:45,Cleared,,KL0116,HUAWEI,Access,eNodeB
4,2024-06-23 02:50:45,PILI-AMF-01,Major,S1ap Link Down,80589,"Service Instance=LINK_VNFC_999, Mobile Country...",Piliyandala,2024-06-23 03:36:39,2024-06-23 02:50:45,Cleared,,CM0091,HUAWEI,Core,UNC


In [None]:
# Keep only the necessary columns. The explicit .copy() makes `df` an
# independent dataframe rather than a slice of `alarm_data`, so the column
# assignment below no longer raises SettingWithCopyWarning.
df = alarm_data[['First Occurred On', 'Site ID', 'Alarm ID', 'Domain']].copy()

# Convert 'First Occurred On' to datetime
df['First Occurred On'] = pd.to_datetime(df['First Occurred On'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['First Occurred On'] = pd.to_datetime(df['First Occurred On'])


In [None]:
# Report how many values are missing in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Discard every row that lacks a Site ID, Alarm ID, or Domain
required_columns = ['Site ID', 'Alarm ID', 'Domain']
df = df.dropna(subset=required_columns)


First Occurred On          0
Site ID              1210436
Alarm ID                   0
Domain                     0
dtype: int64


In [None]:
# Derive calendar features from the alarm timestamp
occurred_on = df['First Occurred On'].dt
df['Hour'] = occurred_on.hour
df['Day'] = occurred_on.day
df['Month'] = occurred_on.month
df['Year'] = occurred_on.year

# Order the rows chronologically within each site so that shifting walks
# backwards through that site's alarm history
df = df.sort_values(by=['Site ID', 'First Occurred On'])

# Lag features: the Alarm ID of the previous 1 and 2 alarms at the same site.
# shift(k) steps back k rows (alarm events), not k days; the first k rows of
# each site therefore get NaN. Extend the tuple below to add deeper lags.
alarms_by_site = df.groupby('Site ID')['Alarm ID']
lagged_alarms = {f'Alarm_Lag_{lag}': alarms_by_site.shift(lag) for lag in (1, 2)}
df = df.assign(**lagged_alarms)


In [None]:
from sklearn.preprocessing import LabelEncoder

# One dedicated LabelEncoder per categorical column, kept under its own name
# so the fitted encoder stays available after this cell
le_alarm, le_site, le_domain = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Fit each encoder on its column and replace the raw values with integer codes
for column_name, encoder in (
    ('Alarm ID', le_alarm),
    ('Site ID', le_site),
    ('Domain', le_domain),
):
    df[column_name] = encoder.fit_transform(df[column_name])


In [None]:
# The test window is the final 7 days of data; everything up to and including
# the cut-off timestamp is used for training
test_period_end = df['First Occurred On'].max()
train_period_end = test_period_end - pd.Timedelta(days=7)

# Boolean masks over the alarm timestamp select the two partitions
occurred_on = df['First Occurred On']
in_train = occurred_on <= train_period_end
in_test = (occurred_on > train_period_end) & (occurred_on <= test_period_end)

train_data = df[in_train]
test_data = df[in_test]


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Prepare input data for LSTM (past 7 days sequences)
def prepare_lstm_data(df, site_column, alarm_column, n_steps):
    """Build per-site sliding-window samples for next-value prediction.

    For every distinct value of ``site_column`` the values of ``alarm_column``
    are taken in row order. Each run of ``n_steps`` consecutive values becomes
    one input sample, and the value that immediately follows it becomes the
    target. Windows never span two sites.

    Args:
        df: DataFrame containing ``site_column`` and ``alarm_column``. Rows
            are consumed in their existing order.
        site_column: Name of the column identifying the site.
        alarm_column: Name of the column holding the values to window.
        n_steps: Number of consecutive values in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_samples, n_steps)`` and ``y`` has shape ``(n_samples,)``. When no
        site has more than ``n_steps`` rows, the arrays are empty with shapes
        ``(0, n_steps)`` and ``(0,)`` so callers can still reshape ``X``.
    """
    X, y = [], []
    # groupby(sort=False) splits the frame once and visits sites in order of
    # first appearance (the same order unique() gives), instead of re-scanning
    # the whole frame with a boolean mask for every site.
    for _, site_series in df.groupby(site_column, sort=False)[alarm_column]:
        site_data = site_series.values
        # A window starting at `start` needs a target at start + n_steps,
        # so the last valid start index is len(site_data) - n_steps - 1.
        for start in range(len(site_data) - n_steps):
            end_ix = start + n_steps
            X.append(site_data[start:end_ix])
            y.append(site_data[end_ix])
    if not X:
        # np.array([]) would be 1-D and break downstream X.shape[1] lookups.
        return np.empty((0, n_steps)), np.empty((0,))
    return np.array(X), np.array(y)

# Prepare the data
n_steps = 7
X_train, y_train = prepare_lstm_data(train_data, 'Site ID', 'Alarm ID', n_steps)
X_test, y_test = prepare_lstm_data(test_data, 'Site ID', 'Alarm ID', n_steps)

# Reshape input for LSTM to (samples, timesteps, features). Using -1 for the
# sample axis also copes with a split that produced no windows.
X_train = X_train.reshape((-1, n_steps, 1))
X_test = X_test.reshape((-1, n_steps, 1))

# The windows hold label-encoded alarm codes, i.e. raw integers that can run
# into the hundreds. Feeding them unscaled into a relu LSTM trained with MSE
# is poorly conditioned, so inputs and targets are scaled into [0, 1] for
# training only. X_train / y_train / X_test / y_test themselves stay in
# alarm-code units for the cells that follow.
# NOTE(review): alarm codes are nominal categories; regressing on them treats
# them as ordered. A classification head would likely suit the task better -
# confirm the intended modelling approach.
code_scale = float(max(X_train.max(), y_train.max(), 1))

# Define the LSTM model. An explicit Input layer replaces the layer-level
# `input_shape=` argument, which triggers a Keras warning in Sequential models.
model = Sequential([
    tf.keras.Input(shape=(n_steps, 1)),
    LSTM(50, activation='relu'),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# Train the model (the reported loss is in scaled units)
model.fit(X_train / code_scale, y_train / code_scale, epochs=20, verbose=1)

# Make predictions and map them back to alarm-code units
y_pred = model.predict(X_test / code_scale) * code_scale


  super().__init__(**kwargs)


Epoch 1/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 5ms/step - loss: 22913.1562
Epoch 2/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 5ms/step - loss: 21765.9297
Epoch 3/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m451s[0m 5ms/step - loss: 22694.1348
Epoch 4/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 5ms/step - loss: 21827.9883
Epoch 5/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 5ms/step - loss: 21613.8301
Epoch 6/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 5ms/step - loss: 21672.6523
Epoch 7/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m476s[0m 5ms/step - loss: 21357.8496
Epoch 8/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 5ms/step - loss: 21129.9414
Epoch 9/20
[1m77384/77384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 5ms/step - loss: 21362.2324
E

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Prepare the features for Random Forest.
# X_train / X_test hold one row per sliding window, whereas train_data /
# test_data hold one row per alarm, so the raw 'Domain' column has a different
# length and cannot be concatenated directly. Building Domain windows with the
# same helper, data and n_steps yields rows aligned one-to-one with the alarm
# windows; the Domain of the most recent alarm in each window is the feature.
domain_windows_train, _ = prepare_lstm_data(train_data, 'Site ID', 'Domain', n_steps)
domain_windows_test, _ = prepare_lstm_data(test_data, 'Site ID', 'Domain', n_steps)
domain_train = domain_windows_train.reshape(-1, n_steps)[:, -1:]
domain_test = domain_windows_test.reshape(-1, n_steps)[:, -1:]

# Flatten the (samples, timesteps, 1) LSTM input to (samples, timesteps) and
# append the Domain column
X_train_rf = np.concatenate([X_train.reshape(X_train.shape[0], -1), domain_train], axis=1)
X_test_rf = np.concatenate([X_test.reshape(X_test.shape[0], -1), domain_test], axis=1)

# Train Random Forest. random_state makes the fit reproducible and n_jobs=-1
# uses all cores.
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_model.fit(X_train_rf, y_train)

# Make predictions
rf_pred = rf_model.predict(X_test_rf)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 2476275 and the array at index 1 has size 2505449

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# LSTM Model Evaluation
# The LSTM outputs one continuous value per window in alarm-code units.
# Thresholding at 0.5 would turn that into booleans compared against
# multi-class labels, so round each prediction to the nearest valid code.
print("LSTM Model Performance:")
n_alarm_classes = len(le_alarm.classes_)
lstm_pred = np.clip(np.rint(np.ravel(y_pred)), 0, n_alarm_classes - 1).astype(int)
print(classification_report(y_test, lstm_pred, zero_division=0))

# Random Forest Evaluation
print("Random Forest Performance:")
print(classification_report(y_test, rf_pred, zero_division=0))

# Multi-class ROC-AUC needs per-class probabilities and an explicit
# multi_class strategy; hard label predictions are rejected by roc_auc_score.
rf_proba = rf_model.predict_proba(X_test_rf)
try:
    rf_auc = roc_auc_score(y_test, rf_proba, multi_class='ovr', labels=rf_model.classes_)
except ValueError as err:
    # roc_auc_score can still reject the inputs (e.g. when the labels in
    # y_test do not match rf_model.classes_); report the reason instead of
    # aborting the whole evaluation cell.
    rf_auc = f"not computable ({err})"
print("ROC-AUC Score:", rf_auc)


In [None]:
import joblib

# Persist both trained estimators to the current working directory
lstm_path = 'lstm_model.h5'
rf_path = 'random_forest_model.pkl'

model.save(lstm_path)             # Keras LSTM model
joblib.dump(rf_model, rf_path)    # Random Forest, serialised with joblib

# Both files can be loaded back later to run predictions without retraining
