# In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Initialize Faker for generating realistic data.
# NOTE(review): `fake` is not referenced anywhere else in the visible code;
# the text columns are drawn from the predefined phrase lists instead.
# Confirm whether another cell uses it before removing.
fake = Faker()

# Synthetic incident dataset generator.
#
# Builds N_RECORDS fake incident rows (free-text fields, downtime, severity,
# department, platform, accountability, timestamp), injects a few outliers,
# makes downtime depend on severity and on time, then writes 'dataset.csv'.

# Predefined phrases for realistic data.
# NOTE(review): the three phrase lists appear to be positionally parallel
# (entry i of each describes the same kind of incident), but each column is
# sampled independently further down, so a row's summary, resolution and root
# cause usually do not match. Confirm that this is intended.
incident_summaries = [
    'Server outage caused by network failure',
    'Database crash due to hardware issue',
    'High latency observed during peak hours',
    'User login issues reported',
    'Email service outage due to spam filter issue',
    'Network congestion leading to slow internet',
    'Disk failure on primary database server',
    'Power outage affecting data center operations',
    'Software bug causing intermittent crashes',
    'Security breach detected in user accounts',
    'API response times increased during peak hours',
    'Memory leak in application server',
]
resolution_texts = [
    'Rebooted server and reset network configurations',
    'Replaced faulty hardware components',
    'Scaled up resources to handle peak traffic',
    'Fixed user authentication service',
    'Adjusted spam filter settings and restored service',
    'Optimized network traffic routing and cleared congestion',
    'Replaced faulty disk and restored database from backup',
    'Restored power and restarted affected systems',
    'Patched software to fix the bug',
    'Secured user accounts and investigated breach',
    'Scaled up API servers to handle increased load',
    'Fixed memory leak and restarted application server',
]
root_causes = [
    'Network failure',
    'Hardware issue',
    'Resource bottleneck',
    'Authentication service bug',
    'Spam filter issue',
    'Network congestion',
    'Disk failure',
    'Power outage',
    'Software bug',
    'Security breach',
    'High traffic',
    'Memory leak',
]
departments = ['IT', 'Ops', 'DevOps', 'Security']
platform_types = ['Cloud', 'Hardware', 'Web', 'Database']

# Size knobs, hoisted so the record count is defined in exactly one place.
N_RECORDS = 100
N_OUTLIERS = 5

# Ordinal encoding of the severity labels (insertion order is low -> critical).
SEVERITY_CODES = {'low': 1, 'medium': 2, 'high': 3, 'critical': 4}


def severity_downtime_shift(severity_codes):
    """Return a random downtime shift (in hours) for each severity code.

    Args:
        severity_codes: Array-like of integer severity codes as defined in
            ``SEVERITY_CODES`` (1 = low ... 4 = critical).

    Returns:
        A float ``numpy.ndarray`` with the same shape as the input:
        critical rows get a shift in [1, 5), low rows a shift in (-2, -0.5],
        and all other rows (medium/high) a shift in [-1, 3).
    """
    codes = np.asarray(severity_codes)
    # Start with the medium/high distribution everywhere, then overwrite the
    # two extreme classes with their own ranges.
    shift = np.random.uniform(-1, 3, size=codes.shape)
    is_critical = codes == SEVERITY_CODES['critical']
    is_low = codes == SEVERITY_CODES['low']
    shift[is_critical] = np.random.uniform(1, 5, size=int(is_critical.sum()))
    shift[is_low] = -np.random.uniform(0.5, 2, size=int(is_low.sum()))
    return shift


# Generate the raw columns, one random draw per record.
data = {
    'IncidentSummary': [random.choice(incident_summaries) for _ in range(N_RECORDS)],
    'ResolutionText': [random.choice(resolution_texts) for _ in range(N_RECORDS)],
    'RootCause': [random.choice(root_causes) for _ in range(N_RECORDS)],
    'HoursOfDowntime': np.random.normal(4, 2, N_RECORDS).round(2),  # Shift mean to make downtime more realistic
    'OverallSeverity': np.random.choice(list(SEVERITY_CODES), N_RECORDS),
    'Department': np.random.choice(departments, N_RECORDS),
    'PlatformType': np.random.choice(platform_types, N_RECORDS),
    'CompanyAccountability': np.random.choice(['Yes', 'No'], N_RECORDS),
}

# Create DataFrame
df = pd.DataFrame(data)

# Convert categorical variables to numerical.
# This must happen BEFORE the severity-based adjustment below: that step
# compares against the integer codes, and comparing them to the raw string
# labels would never match (every row would be treated as medium/high).
df['OverallSeverity'] = df['OverallSeverity'].map(SEVERITY_CODES)
df['CompanyAccountability'] = df['CompanyAccountability'].map({'Yes': 1, 'No': 0})

# Introduce some outliers by tripling the downtime of a few random rows.
outlier_indices = np.random.choice(
    df.index, size=min(N_OUTLIERS, len(df)), replace=False
)
df.loc[outlier_indices, 'HoursOfDowntime'] *= 3

# Tie downtime to severity with noise, so the relationship exists but is not
# immediately obvious: critical rows trend up, low rows trend down.
df['HoursOfDowntime'] += severity_downtime_shift(df['OverallSeverity'])

# A negative downtime is meaningless; the normal draw plus the shifts above
# can dip below zero, so floor the column at 0 hours.
df['HoursOfDowntime'] = df['HoursOfDowntime'].clip(lower=0)

# Perform one-hot encoding on 'Department' and 'PlatformType'
df = pd.get_dummies(df, columns=['Department', 'PlatformType'])

# Generate random timestamps over three months (whole-day offsets from
# 90 days ago up to and including today).
start_date = datetime.now() - timedelta(days=90)
timestamps = [
    start_date + timedelta(days=random.randint(0, 90)) for _ in range(N_RECORDS)
]
df['Timestamp'] = timestamps

# Introduce a correlation between Timestamps and HoursOfDowntime: after
# sorting chronologically, add a linear ramp from 0 to 5 hours.
df = df.sort_values('Timestamp').reset_index(drop=True)
df['HoursOfDowntime'] = df['HoursOfDowntime'] + np.linspace(0, 5, num=len(df))

# Shuffle DataFrame to randomize timestamp order
df = df.sample(frac=1).reset_index(drop=True)

# Save the DataFrame to a CSV file
df.to_csv('dataset.csv', index=False)

print("Dataset saved as 'dataset.csv'")


# Output: Dataset saved as 'dataset.csv'
