In [31]:
import pandas as pd
import numpy as np
import os
from google.colab import drive
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from mlxtend.frequent_patterns import apriori, association_rules
import joblib
from sklearn.impute import SimpleImputer

# Mount Google Drive
drive.mount('/content/drive')

# Path of folder
folder_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_09/'

# Get list of all CSV files in the folder
csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# Initialize an empty list to hold the dataframes
dfs = []

# Loop through the CSV files and read each one into a dataframe
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all dataframes into a single dataframe
data = pd.concat(dfs, ignore_index=True)
data.head()


  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file)


Unnamed: 0,Last Occurred On,Alarm Source,Alarm Severity,Alarm Name,Alarm ID,Alarm Location Info,Site Name,Cleared On,First Occurred On,Clearance Status,Ticket ID,Site ID,Vendor,Domain,Device Type,Prediction Probability
0,2024-08-09 15:25:42,LOGGALOYA-MWI-CETR-VLL,Critical,Link Down,3,source=LOGGALOYA-MWI-CETR-VLL location=If Inde...,Loggaloya,2024-08-09 15:25:45,2024-08-09 15:25:42,Cleared,,BD0070,HUAWEI,IPCore,Router,
1,2024-08-09 15:25:28,LOGGALOYA-MWI-CETR-VLL,Critical,Link Down,3,source=LOGGALOYA-MWI-CETR-VLL location=If Inde...,Loggaloya,2024-08-09 15:25:30,2024-08-09 15:25:28,Cleared,,BD0070,HUAWEI,IPCore,Router,
2,2024-08-09 15:25:13,LOGGALOYA-MWI-CETR-VLL,Critical,Link Down,3,source=LOGGALOYA-MWI-CETR-VLL location=If Inde...,Loggaloya,2024-08-09 15:25:15,2024-08-09 15:25:13,Cleared,,BD0070,HUAWEI,IPCore,Router,
3,2024-08-09 15:24:57,LOGGALOYA-MWI-CETR-VLL,Critical,Link Down,3,source=LOGGALOYA-MWI-CETR-VLL location=If Inde...,Loggaloya,2024-08-09 15:25:00,2024-08-09 15:24:57,Cleared,,BD0070,HUAWEI,IPCore,Router,
4,2024-08-09 15:24:42,LOGGALOYA-MWI-CETR-VLL,Critical,Link Down,3,source=LOGGALOYA-MWI-CETR-VLL location=If Inde...,Loggaloya,2024-08-09 15:24:45,2024-08-09 15:24:42,Cleared,,BD0070,HUAWEI,IPCore,Router,


In [35]:
# Assuming 'First Occured On' is the column with dates and it's in string format
reference_date = pd.to_datetime('2024-08-09')  # Replace with your reference date

# Check if 'First Occured On' is in the columns, if not handle the error
if 'First Occurred On' in data.columns:
    # Convert 'First Occured On' to datetime objects
    data['First Occurred On'] = pd.to_datetime(data['First Occurred On'])

    # Calculate the difference between each date and the reference date in days
    data['Relative Day Index'] = (data['First Occurred On'] - reference_date).dt.days

    # Now you have a 'Relative Day Index' column with days relative to your reference date
    print(data.head())
else:
    print("Error: 'First Occurred On' column not found in the DataFrame.")
    # Handle the error appropriately, e.g., check your CSV files or data source

  and should_run_async(code)


      Last Occurred On            Alarm Source Alarm Severity Alarm Name  \
0  2024-08-09 15:25:42  LOGGALOYA-MWI-CETR-VLL       Critical  Link Down   
1  2024-08-09 15:25:28  LOGGALOYA-MWI-CETR-VLL       Critical  Link Down   
2  2024-08-09 15:25:13  LOGGALOYA-MWI-CETR-VLL       Critical  Link Down   
3  2024-08-09 15:24:57  LOGGALOYA-MWI-CETR-VLL       Critical  Link Down   
4  2024-08-09 15:24:42  LOGGALOYA-MWI-CETR-VLL       Critical  Link Down   

  Alarm ID                                Alarm Location Info  Site Name  \
0        3  source=LOGGALOYA-MWI-CETR-VLL location=If Inde...  Loggaloya   
1        3  source=LOGGALOYA-MWI-CETR-VLL location=If Inde...  Loggaloya   
2        3  source=LOGGALOYA-MWI-CETR-VLL location=If Inde...  Loggaloya   
3        3  source=LOGGALOYA-MWI-CETR-VLL location=If Inde...  Loggaloya   
4        3  source=LOGGALOYA-MWI-CETR-VLL location=If Inde...  Loggaloya   

            Cleared On   First Occurred On Clearance Status Ticket ID Site ID  \
0  20

In [46]:
# Define target alarm
target_alarm = 'RF Unit TX Channel Gain Out of Range'

# Filter data for the target alarm
target_data = data[data['Alarm Name'] == target_alarm]

# Get the unique Site IDs for the target alarm
site_ids = target_data['Site ID'].unique()

# Filter the original data to include only the alarms for these Site IDs
site_alarm_data = data[data['Site ID'].isin(site_ids)]

# Sort by Site ID and Date
site_alarm_data = site_alarm_data.sort_values(by=['Site ID', 'First Occurred On'])

# Group alarms by Site ID and Relative Day Index
grouped = site_alarm_data.groupby(['Site ID', 'Relative Day Index'])['Alarm Name'].apply(list)

# Create a binary occurrence matrix for alarms
pivot_df = grouped.apply(lambda x: pd.Series({alarm: 1 for alarm in x})).unstack(fill_value=0)

# Apply Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(pivot_df, min_support=0.05, use_colnames=True)  # Adjust min_support as needed

# Generate association rules with the target alarm as the consequent
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
target_rules = rules[rules['consequents'] == frozenset([target_alarm])]

print(target_rules)

  and should_run_async(code)


ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value nan

In [37]:
# Create a binary occurrence matrix for alarms
pivot_df = site_alarm_data.pivot_table(index=['Site ID', 'Relative Day Index'],
                                       columns='Alarm Name',
                                       aggfunc='size', fill_value=0)
pivot_df = (pivot_df > 0).astype(int)

# Apply Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(pivot_df, min_support=0.005, use_colnames=True)
frequent_itemsets.head()

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.010851,((PTP)Time synchronization faulty--2501553)
1,0.020868,((Sync E)The state of clock source is failed--...
2,0.010017,(AM_DOWNSHIFT)
3,0.010851,(Ambient Temperature Unacceptable)
4,0.068447,(BBU CPRI Interface Error)


In [42]:
# Generate association rules with the target alarm as the consequent
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
target_rules = rules[rules['consequents'] == frozenset([target_alarm])]

# You can now work with the 'target_rules' DataFrame directly,
# which contains the association rules related to your target alarm.
# For example, you can display the rules:
print(target_rules)

  and should_run_async(code)


                                             antecedents  \
41                                  (Carrier No Traffic)   
1246                    (Cell Out of Service, Link Down)   
1305           (Cell Out of Service, Physical Port Down)   
1574                (Cell Unavailable, User Plane Fault)   
7497           (Cell Out of Service, Link Down, ETH_LOS)   
7539   (Cell Out of Service, ETH_LOS, Physical Port D...   
8213   (Cell Out of Service, Link Down, Physical Port...   
25658  (Cell Out of Service, Link Down, ETH_LOS, Phys...   

                                  consequents  antecedent support  \
41     (RF Unit TX Channel Gain Out of Range)            0.013356   
1246   (RF Unit TX Channel Gain Out of Range)            0.011686   
1305   (RF Unit TX Channel Gain Out of Range)            0.011686   
1574   (RF Unit TX Channel Gain Out of Range)            0.030050   
7497   (RF Unit TX Channel Gain Out of Range)            0.011686   
7539   (RF Unit TX Channel Gain Out of Range)

In [25]:




# Add lag features for common alarms
if common_alarms:
    for alarm in common_alarms:
        alarm_flag = (site_alarm_data['Alarm Name'] == alarm).astype(int)
        site_alarm_data[alarm] = alarm_flag

        # Align the new column with the existing index
        alarm_lag = site_alarm_data.groupby('Site ID')[alarm].shift(1).fillna(0).astype(int)
        site_alarm_data[alarm + '_lag'] = alarm_lag

# Define features for the model
features = [alarm + '_lag' for alarm in common_alarms] + ['Relative Day Index']

# Create the target variable indicating whether the target alarm occurs within the next 7 days
site_alarm_data['Target Next 7 Days'] = site_alarm_data.groupby('Site ID')['Alarm Name'].apply(
    lambda x: x.shift(-7).apply(lambda y: int(y == target_alarm))
).fillna(0).astype(int)

# Drop rows with missing target values
site_alarm_data.dropna(subset=['Target Next 7 Days'], inplace=True)

# Prepare features and target
X = site_alarm_data[features]
y = site_alarm_data['Target Next 7 Days']

# Handle missing values in features
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Now, X_imputed can be used for modeling


  and should_run_async(code)


TypeError: incompatible index of inserted column with frame index

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Normalize the features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Save the model, scaler, and imputer
joblib.dump(model, 'rf_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(imputer, 'imputer.pkl')

In [None]:
# Load the model and scaler for prediction
model = joblib.load('rf_model.pkl')
scaler = joblib.load('scaler.pkl')
imputer = joblib.load('imputer.pkl')

# Get new data
new_csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]
new_dfs = [pd.read_csv(file) for file in new_csv_files]
new_data = pd.concat(new_dfs, ignore_index=True)

# Convert 'First Occurred On' to datetime and calculate 'Relative Day Index'
new_data['First Occurred On'] = pd.to_datetime(new_data['First Occurred On'])
new_data['Relative Day Index'] = (reference_date - new_data['First Occurred On']).dt.days

# Prepare features for prediction
if common_alarms:
    for alarm in common_alarms:
        new_data[alarm] = (new_data['Alarm Name'] == alarm).astype(int)
        new_data[alarm + '_lag'] = new_data.groupby('Site ID')[alarm].shift(1).fillna(0)

# Define features for prediction
new_data_features = new_data[features]

In [None]:
# Impute missing values and scale the features
new_data_imputed = pd.DataFrame(imputer.transform(new_data_features), columns=features)
new_data_scaled = scaler.transform(new_data_imputed)

# Predict probabilities
probabilities = model.predict_proba(new_data_scaled)[:, 1]
new_data['Probability of Target Alarm'] = probabilities

# Merge past alarms with predictions
result = new_data[['Site ID', 'Relative Day Index', 'Probability of Target Alarm']].copy()
result = result.merge(new_data[['Site ID', 'First Occurred On', 'Alarm Name', 'Relative Day Index']], on='Site ID', how='left')

# Filter and print high probability predictions
high_prob_predictions = result[result['Probability of Target Alarm'] > 0.5]

for site in high_prob_predictions['Site ID'].unique():
    print(f"Site ID: {site}")
    print(high_prob_predictions[high_prob_predictions['Site ID'] == site])
    print()

In [None]:
# import pandas as pd
# import numpy as np
# import os
# from google.colab import drive
# from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
# from mlxtend.frequent_patterns import apriori, association_rules
# import joblib
# from sklearn.impute import SimpleImputer

# # Mount Google Drive
# drive.mount('/content/drive')

# # Path of folder
# folder_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_09/'

# # Get list of all CSV files in the folder
# csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# # Initialize an empty list to hold the dataframes
# dfs = []

# # Loop through the CSV files and read each one into a dataframe
# for file in csv_files:
#     df = pd.read_csv(file)
#     dfs.append(df)

# # Concatenate all dataframes into a single dataframe
# data = pd.concat(dfs, ignore_index=True)

In [None]:
# import pandas as pd
# import numpy as np
# from mlxtend.frequent_patterns import apriori, association_rules
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.ensemble import RandomForestClassifier
# import joblib

# # Assuming data is already loaded
# # Convert the 'Occurred On' to datetime
# data['First Occurred On'] = pd.to_datetime(data['First Occurred On'])

# # Filter data for the specific Alarm Name
# target_alarm = 'RF Unit TX Channel Gain Out of Range'
# filtered_data = data[data['Alarm Name'] == target_alarm]

# # Get the unique Site IDs
# site_ids = filtered_data['Site ID'].unique()

# # Print the Site IDs
# print("Site IDs with the target alarm:", site_ids)

# # Filter the initial dataset to get all alarms for the identified Site IDs
# alarms_for_site_ids = data[data['Site ID'].isin(site_ids)]

# # Print the filtered alarms
# print("Alarms for the identified Site IDs:")
# print(alarms_for_site_ids.head())

# # Sort by site and date
# df = alarms_for_site_ids.sort_values(by=['Site ID', 'First Occurred On'])

# # Filter the target alarm data
# df['Target'] = (df['Alarm Name'] == target_alarm).astype(int)

# # Define a time window for examining past alarms (e.g., 14 days)
# time_window = 14

# # Create a relative day index for each alarm
# df['Relative Day Index'] = df.groupby('Site ID')['First Occurred On'].transform(lambda x: (x - x.max()).dt.days)

# # Pivot the data to get the binary occurrence matrix for alarms
# pivot_df = df.pivot_table(index=['Site ID', 'Relative Day Index'],
#                           columns='Alarm Name',
#                           aggfunc='size', fill_value=0)

# # Binary encode the presence of alarms
# pivot_df = (pivot_df > 0).astype(int)

# # Apply the Apriori algorithm with a min_support
# frequent_itemsets = apriori(pivot_df, min_support=0.005, use_colnames=True)

# # Generate association rules with the target alarm as the consequent, using confidence as the metric
# rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
# target_rules = rules[rules['consequents'] == frozenset([target_alarm])]

# # Print all target rules to see the results
# print("All Target Rules:\n", target_rules)



In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.impute import SimpleImputer
# import joblib

# # Create a DataFrame to store the antecedent and their counts
# antecedent_counts = target_rules['antecedents'].apply(lambda x: list(x)[0] if x else None).value_counts()

# # Extract all common alarms
# common_alarms = antecedent_counts.index.tolist()

# print("All Common Alarms that frequently precede the target alarm:", common_alarms)

# # Create lag features for each common alarm
# if common_alarms:
#     for alarm in common_alarms:
#         # Create lag features only for common alarms that are identified
#         df[alarm + '_lag'] = df.groupby('Site ID')['Alarm Name'].transform(lambda x: (x == alarm).shift(1).fillna(0))

# # Add the Relative Day Index as a feature
# features = [alarm + '_lag' for alarm in common_alarms] + ['Relative Day Index']

# # Create a target variable indicating whether the target alarm occurs within the next 7 days
# df['Target Next 7 Days'] = df.groupby('Site ID')['Target'].shift(-7).fillna(0)

# # Drop rows with missing values in the target variable
# df.dropna(subset=['Target Next 7 Days'], inplace=True)

# # Ensure features list is not empty
# if features:
#     X = df[features]
#     y = df['Target Next 7 Days']

#     # Handle missing values in features
#     imputer = SimpleImputer(strategy='mean')  # You can choose other strategies as needed
#     X_imputed = imputer.fit_transform(X)

#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

#     # Normalize the features
#     scaler = MinMaxScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)

#     # Train a Random Forest model
#     model = RandomForestClassifier(n_estimators=100, random_state=42)
#     model.fit(X_train_scaled, y_train)

#     # Save the model and scaler
#     joblib.dump(model, 'rf_model.pkl')
#     joblib.dump(scaler, 'scaler.pkl')
#     joblib.dump(imputer, 'imputer.pkl')  # Save the imputer for future use

# else:
#     print("No common alarms found. Cannot proceed with model training.")


In [None]:
# import os
# import pandas as pd
# import joblib

# # Load the model and scaler
# model = joblib.load('rf_model.pkl')
# scaler = joblib.load('scaler.pkl')

# # Path of the folder
# folder_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_09/'

# # Get list of all CSV files in the folder
# csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# # Initialize an empty list to hold the dataframes
# dfs = []

# # Loop through the CSV files and read each one into a dataframe
# for file in csv_files:
#     df = pd.read_csv(file)
#     dfs.append(df)

# # Concatenate all dataframes into a single dataframe
# new_data = pd.concat(dfs, ignore_index=True)

In [None]:
# import pandas as pd
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import MinMaxScaler
# import joblib

# # Load the new dataset
# # new_data = pd.read_csv('new_data.csv')  # Example, adjust as necessary

# # Convert 'First Occurred On' to datetime and calculate 'Relative Day Index'
# new_data['First Occurred On'] = pd.to_datetime(new_data['First Occurred On'])
# new_data['Relative Day Index'] = new_data.groupby('Site ID')['First Occurred On'].transform(lambda x: (x - x.max()).dt.days)

# # Get unique site IDs
# unique_sites = new_data['Site ID'].unique()

# # Create a dictionary to store past alarms for each site ID
# site_alarms = {}
# for site in unique_sites:
#     site_data = new_data[new_data['Site ID'] == site]
#     site_alarms[site] = site_data[['First Occurred On', 'Alarm Name', 'Relative Day Index']]
# 2
# # Prepare features for prediction
# # Create lag features for each common alarm
# for alarm in common_alarms:
#     new_data[alarm + '_lag'] = new_data.groupby('Site ID')['Alarm Name'].transform(lambda x: (x == alarm).shift(1).fillna(0))

# # Define the feature list for prediction
# features = [alarm + '_lag' for alarm in common_alarms] + ['Relative Day Index']
# new_data_features = new_data[features]

# # Impute missing values and scale the features
# imputer = SimpleImputer(strategy='mean')
# new_data_imputed = pd.DataFrame(imputer.fit_transform(new_data_features), columns=features)

# scaler = joblib.load('scaler.pkl')
# new_data_scaled = scaler.transform(new_data_imputed)

# # Load the saved model and predict probabilities
# model = joblib.load('rf_model.pkl')
# probabilities = model.predict_proba(new_data_scaled)[:, 1]
# new_data['Probability of Target Alarm'] = probabilities

# # Merge past alarms with predictions
# result = new_data[['Site ID', 'Relative Day Index', 'Probability of Target Alarm']].copy()
# result = result.merge(new_data[['Site ID', 'First Occurred On', 'Alarm Name', 'Relative Day Index']], on='Site ID', how='left')

# # Filter and print the results with probabilities greater than 0.5
# high_prob_predictions = result[result['Probability of Target Alarm'] > 0.5]

# # Print the high probability predictions along with past alarms
# for site in high_prob_predictions['Site ID'].unique():
#     print(f"Site ID: {site}")
#     print(high_prob_predictions[high_prob_predictions['Site ID'] == site])
#     print()


In [None]:
# print([prob for prob in model.predict_proba(X_test_scaled)[:, 1] if prob > 0.5])


In [None]:
# import os
# import pandas as pd
# import joblib

# # Load the model and scaler
# model = joblib.load('rf_model.pkl')
# scaler = joblib.load('scaler.pkl')

# # Path of the folder
# folder_path = '/content/drive/MyDrive/Colab Notebooks/2024_08_09/'

# # Get list of all CSV files in the folder
# csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# # Initialize an empty list to hold the dataframes
# dfs = []

# # Loop through the CSV files and read each one into a dataframe
# for file in csv_files:
#     df = pd.read_csv(file)
#     dfs.append(df)

# # Concatenate all dataframes into a single dataframe
# new_data = pd.concat(dfs, ignore_index=True)

# # Convert 'First Occurred On' to datetime
# new_data['First Occurred On'] = pd.to_datetime(new_data['First Occurred On'])

# # Ensure that the columns in the new data match the features used during model training
# columns_in_model = scaler.feature_names_in_
# available_features = [feature for feature in columns_in_model if feature in new_data.columns]

# # Check if there are any available features
# if not available_features:
#     print("No features from the model are present in the new dataset. Prediction cannot proceed.")
# else:
#     # If some features are missing, inform the user
#     if len(available_features) < len(columns_in_model):
#         print(f"Some features are missing. Proceeding with available features: {available_features}")

#     # Scale the new data based on the available features
#     X_new_scaled = scaler.transform(new_data[available_features])

#     # Predict the probability of the target alarm occurring within the next 7 days
#     pred_prob = model.predict_proba(X_new_scaled)[:, 1]

#     # Add the prediction probability to the new data
#     new_data['Prediction Probability'] = pred_prob

#     # Extract and print unique Site IDs
#     unique_site_ids = new_data['Site ID'].unique()
#     print("Unique Site IDs:")
#     print(unique_site_ids)

#     # Optionally, you can save the predictions and unique Site IDs to a new CSV file
#     output_file = os.path.join(folder_path, 'predictions_with_unique_site_ids.csv')
#     new_data[['Site ID', 'Prediction Probability']].drop_duplicates().to_csv(output_file, index=False)
#     print(f"Predictions and unique Site IDs saved to {output_file}")


In [None]:
# # Print the features used in the model
# features_in_model = scaler.feature_names_in_
# print("Features used in the model:")
# print(features_in_model)