In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Number of records
num_records = 40000
num_users = 10

# Generate user_ids
user_ids = [f'user_{i+1}' for i in range(num_users)]

# Function to generate random timestamps
def generate_random_timestamps(start, end, n):
    start_u = start.timestamp()
    end_u = end.timestamp()
    return [datetime.fromtimestamp(random.uniform(start_u, end_u)) for _ in range(n)]

# Generate data
data = []
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

for _ in range(num_records):
    user_id = random.choice(user_ids)
    timestamp = random.choice(generate_random_timestamps(start_date, end_date, 1))
    location_change = random.randint(0, 1)
    call_duration = random.randint(0, 600)  # Call duration between 0 and 10 minutes
    message_count = random.randint(0, 20)  # Number of messages between 0 and 20
    anomaly = random.choices([0, 1], weights=[0.95, 0.05])[0]  # 5% of records are anomalies

    data.append([user_id, timestamp, location_change, call_duration, message_count, anomaly])

# Create DataFrame
columns = ['user_id', 'timestamp', 'location_change', 'call_duration', 'message_count', 'anomaly']
df = pd.DataFrame(data, columns=columns)

# Save to CSV
df.to_csv('dummy_data.csv', index=False)

print("Dummy dataset generated and saved to 'dummy_data.csv'.")


Dummy dataset generated and saved to 'dummy_data.csv'.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('dummy_data.csv')

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Drop any rows with missing values (if any)
df.dropna(inplace=True)

# Split data into features and labels
X = df[['user_id', 'timestamp', 'location_change', 'call_duration', 'message_count']]
y = df['anomaly']


In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode user_id
label_encoder = LabelEncoder()
X['user_id'] = label_encoder.fit_transform(X['user_id'])

# Extract time-based features
X['hour'] = X['timestamp'].dt.hour
X['day_of_week'] = X['timestamp'].dt.dayofweek

# Drop the original timestamp column
X.drop('timestamp', axis=1, inplace=True)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['user_id'] = label_encoder.fit_transform(X['user_id'])


In [4]:
from sklearn.ensemble import IsolationForest

# Initialize the model
model = IsolationForest(contamination=0.05)

# Train the model
model.fit(X_scaled)


In [5]:
# Predict anomalies on the training data
y_pred = model.predict(X_scaled)

# Convert predictions from -1 (anomaly) and 1 (normal) to 1 (anomaly) and 0 (normal)
y_pred = [1 if p == -1 else 0 for p in y_pred]

# Evaluate the model
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.95      0.95     37993
           1       0.05      0.05      0.05      2007

    accuracy                           0.91     40000
   macro avg       0.50      0.50      0.50     40000
weighted avg       0.90      0.91      0.90     40000

[[36097  1896]
 [ 1903   104]]


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Load the dataset
df = pd.read_csv('dummy_data.csv')

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Drop any rows with missing values (if any)
df.dropna(inplace=True)

# Split data into features and labels
X = df[['user_id', 'timestamp', 'location_change', 'call_duration', 'message_count']]
y = df['anomaly']

# Encode user_id
label_encoder = LabelEncoder()
X['user_id'] = label_encoder.fit_transform(X['user_id'])

# Extract time-based features
X['hour'] = X['timestamp'].dt.hour
X['day_of_week'] = X['timestamp'].dt.dayofweek

# Drop the original timestamp column
X.drop('timestamp', axis=1, inplace=True)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the model
model = IsolationForest(contamination=0.05, random_state=42)

# Train the model
model.fit(X_train)

# Predict anomalies on the test data
y_pred = model.predict(X_test)

# Convert predictions from -1 (anomaly) and 1 (normal) to 1 (anomaly) and 0 (normal)
y_pred = [1 if p == -1 else 0 for p in y_pred]

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Cross-validation
scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} (+/- {np.std(scores):.2f})")

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_samples': ['auto', 0.5, 0.75],
    'contamination': [0.01, 0.05, 0.1],
    'max_features': [1.0, 0.5, 0.75],
}

grid_search = GridSearchCV(IsolationForest(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_scaled, y)

print("Best Parameters:")
print(grid_search.best_params_)

# Train the best model
best_model = grid_search.best_estimator_
best_model.fit(X_train)

# Predict using the best model
y_best_pred = best_model.predict(X_test)
y_best_pred = [1 if p == -1 else 0 for p in y_best_pred]

# Evaluate the best model
print("Best Model Classification Report:")
print(classification_report(y_test, y_best_pred))
print("Best Model Confusion Matrix:")
print(confusion_matrix(y_test, y_best_pred))
best_accuracy = accuracy_score(y_test, y_best_pred)
print(f"Best Model Accuracy: {best_accuracy:.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['user_id'] = label_encoder.fit_transform(X['user_id'])


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      7618
           1       0.07      0.07      0.07       382

    accuracy                           0.91      8000
   macro avg       0.51      0.51      0.51      8000
weighted avg       0.91      0.91      0.91      8000

Confusion Matrix:
[[7216  402]
 [ 354   28]]
Accuracy: 0.91
Cross-Validation Accuracy: 0.05 (+/- 0.00)
Best Parameters:
{'contamination': 0.01, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 200}
Best Model Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      7618
           1       0.07      0.02      0.03       382

    accuracy                           0.94      8000
   macro avg       0.51      0.50      0.50      8000
weighted avg       0.91      0.94      0.93      8000

Best Model Confusion Matrix:
[[7535   83]
 [ 376    6]]
Best Model Accuracy: 0.9