<a href="https://colab.research.google.com/github/divanshu1993/security-anomalies-samples/blob/main/Model_Generation_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Model Generation using the Isolation Forest algorithm**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
import re
import joblib

# Train an Isolation Forest anomaly detector on TF-IDF vectors of log messages
# and persist both the model and the fitted vectorizer.

# Step 1: Load the training data (fetched over the network; the CSV must
# contain a 'message' column).
log_data = pd.read_csv('https://raw.githubusercontent.com/divanshu1993/security-anomalies-samples/main/add_user_logs.csv')

# Extract the log messages from the log_data DataFrame.
# Missing values are dropped and everything else is coerced to str, because
# re.sub raises TypeError when handed NaN or any other non-string value.
log_messages = log_data['message'].dropna().astype(str).tolist()

# Step 2: Remove timestamp, [main], INFO, and file path from log messages.
# The pattern is compiled once instead of being looked up on every iteration.
# Alternatives, in order: "YYYY-MM-DD HH:MM:SS.fff" timestamp, a bracketed
# word such as [main], the literal INFO, and "-" followed by a path/word.
noise_pattern = re.compile(
    r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+|\[\w+\]|INFO|-\s?[\/\w]+'
)
cleaned_log_messages = [
    noise_pattern.sub('', log_message).strip() for log_message in log_messages
]

# Step 3: Convert log messages to TF-IDF vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_log_messages)

# Step 4: Apply Isolation Forest for anomaly detection.
# contamination=0.01 is the assumed share of anomalous rows in the training
# data; random_state is pinned so retraining on the same data is reproducible.
isolation_forest = IsolationForest(contamination=0.01, random_state=42)
isolation_forest.fit(X)

# Step 5: Save the trained Isolation Forest model.
joblib.dump(isolation_forest, "isolation_forest_model.pkl")

# The model only understands vectors produced by THIS fitted vectorizer
# (same vocabulary and IDF weights), so it is saved alongside the model;
# without it the pickle cannot be used to score new log lines.
joblib.dump(vectorizer, "isolation_forest_vectorizer.pkl")

print("Model saved successfully!")


## **Model Generation using the K-Nearest Neighbors (KNN) algorithm**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re
import joblib

# Fit a nearest-neighbours index on TF-IDF vectors of log messages and persist
# both the index and the fitted vectorizer.

# Step 1: Load the training data (fetched over the network; the CSV must
# contain a 'message' column).
log_data = pd.read_csv('https://raw.githubusercontent.com/divanshu1993/security-anomalies-samples/main/add_user_logs.csv')

# Extract the log messages from the log_data DataFrame.
# Missing values are dropped and everything else is coerced to str, because
# re.sub raises TypeError when handed NaN or any other non-string value.
log_messages = log_data['message'].dropna().astype(str).tolist()

# Step 2: Remove timestamp, [main], INFO, and file path from log messages.
# The pattern is compiled once instead of being looked up on every iteration.
# Alternatives, in order: "YYYY-MM-DD HH:MM:SS.fff" timestamp, a bracketed
# word such as [main], the literal INFO, and "-" followed by a path/word.
noise_pattern = re.compile(
    r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+|\[\w+\]|INFO|-\s?[\/\w]+'
)
cleaned_log_messages = [
    noise_pattern.sub('', log_message).strip() for log_message in log_messages
]

# Step 3: Convert log messages to TF-IDF vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_log_messages)

# Step 4: Apply K-nearest neighbors for anomaly detection.
k = 5  # Number of neighbors
knn = NearestNeighbors(n_neighbors=k)
knn.fit(X)

# Step 5: Save the trained K-nearest neighbors model.
joblib.dump(knn, "knn_model.pkl")

# Queries must be vectorized with THIS fitted vectorizer (same vocabulary and
# IDF weights) to land in the same feature space as the indexed points, so it
# is saved alongside the model.
joblib.dump(vectorizer, "knn_vectorizer.pkl")

print("Model saved successfully!")


## **Model Generation using the One-Class SVM (OCSVM) algorithm**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
import re
import joblib

# Train a One-Class SVM anomaly detector on TF-IDF vectors of log messages and
# persist both the model and the fitted vectorizer.

# Step 1: Load the training data (fetched over the network; the CSV must
# contain a 'message' column).
log_data = pd.read_csv('https://raw.githubusercontent.com/divanshu1993/security-anomalies-samples/main/add_user_logs.csv')

# Extract the log messages from the log_data DataFrame.
# Missing values are dropped and everything else is coerced to str, because
# re.sub raises TypeError when handed NaN or any other non-string value.
log_messages = log_data['message'].dropna().astype(str).tolist()

# Step 2: Remove timestamp, [main], INFO, and file path from log messages.
# The pattern is compiled once instead of being looked up on every iteration.
# Alternatives, in order: "YYYY-MM-DD HH:MM:SS.fff" timestamp, a bracketed
# word such as [main], the literal INFO, and "-" followed by a path/word.
noise_pattern = re.compile(
    r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+|\[\w+\]|INFO|-\s?[\/\w]+'
)
cleaned_log_messages = [
    noise_pattern.sub('', log_message).strip() for log_message in log_messages
]

# Step 3: Convert log messages to TF-IDF vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_log_messages)

# Step 4: Apply One-Class Support Vector Machines for anomaly detection.
# nu=0.01 is an upper bound on the fraction of training errors and a lower
# bound on the fraction of support vectors (scikit-learn's OneClassSVM docs).
ocsvm = OneClassSVM(nu=0.01)
ocsvm.fit(X)

# Step 5: Save the trained One-Class SVM model.
joblib.dump(ocsvm, "ocsvm_model.pkl")

# The model only understands vectors produced by THIS fitted vectorizer
# (same vocabulary and IDF weights), so it is saved alongside the model;
# without it the pickle cannot be used to score new log lines.
joblib.dump(vectorizer, "ocsvm_vectorizer.pkl")

print("Model saved successfully!")
