In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the input data set into a DataFrame.
# NOTE: 'sample.csv' is resolved relative to the current working directory;
# change the path here to analyse a different file.
df = pd.read_csv('sample.csv')

# Preview the leading rows (DataFrame.head() returns the first five by
# default) as a quick sanity check of the columns and values.
preview = df.head()
print("Sample data:", preview, sep="\n")

# Select every row whose errorCode equals 1001
is_error_1001 = df['errorCode'] == 1001
error_1001_records = df[is_error_1001]
print(f"\nTotal records with errorCode 1001: {len(error_1001_records)}")

# Distinct IDs that reported errorCode 1001 at least once
ids_with_error_1001 = error_1001_records['id'].unique()
print(f"Unique IDs with errorCode 1001: {len(ids_with_error_1001)}")

# All rows (whatever their error code) that belong to those IDs
relevant_records = df[df['id'].isin(ids_with_error_1001)]

# Rows for the same IDs that carry a *different* error code.
# A missing errorCode (NaN) compares as "not equal" to 1001, so without the
# notna() check blank rows would be reported as other error codes even though
# value_counts()/nunique() ignore them.
has_other_code = (
    relevant_records['errorCode'].notna()
    & (relevant_records['errorCode'] != 1001)
)
other_error_codes = relevant_records[has_other_code]
print(f"\nRecords with other error codes for the same IDs: {len(other_error_codes)}")

# Tally how often each error code occurs among the selected IDs and shape the
# result as a two-column table: errorCode, count (most frequent first).
error_code_counts = (
    relevant_records['errorCode']
    .value_counts()
    .rename_axis('errorCode')
    .reset_index(name='count')
)
print(
    "\nError code distribution for IDs that have error 1001:",
    error_code_counts,
    sep="\n",
)

# For each ID, count how many distinct error codes it reported
error_codes_per_id = relevant_records.groupby('id')['errorCode'].nunique()

# Frequency table: number of distinct codes -> number of IDs with that many,
# ordered by the number of distinct codes.
ids_by_distinct_code_count = error_codes_per_id.value_counts().sort_index()
print(
    "\nDistribution of how many different error codes each ID has:",
    ids_by_distinct_code_count,
    sep="\n",
)

# Create visualizations

# 1. Bar chart: one bar per error code, bar height = number of rows with that
#    code among the IDs that reported error 1001.
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=error_code_counts, x='errorCode', y='count', ax=bar_ax)
bar_ax.set_title('Distribution of Error Codes for IDs with Error 1001')
bar_ax.set_xlabel('Error Code')
bar_ax.set_ylabel('Count')
# Tilt the code labels so that long or numerous codes do not overlap
plt.setp(bar_ax.get_xticklabels(), rotation=45)
bar_fig.tight_layout()
plt.show()

# 2. Pie chart showing proportion of error 1001 vs other errors
#    (both counts are restricted to the IDs that reported error 1001).
error_types = pd.DataFrame({
    'type': ['Error 1001', 'Other Errors'],
    'count': [len(error_1001_records), len(other_error_codes)]
})

# plt.pie normalises the wedges by the sum of the values; when no row has
# errorCode 1001 both counts are zero and the fractions become NaN, so the
# chart cannot be drawn. Skip it with a message instead of failing.
if error_types['count'].sum() > 0:
    plt.figure(figsize=(8, 8))
    plt.pie(error_types['count'], labels=error_types['type'], autopct='%1.1f%%', startangle=90)
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.title('Proportion of Error 1001 vs Other Errors')
    plt.show()
else:
    print("\nNo records with errorCode 1001 found; skipping pie chart.")

# 3. Histogram of number of error codes per ID
# With no matching IDs the series is empty, .max() is NaN and the tick range
# below cannot be built, so skip the plot in that case.
if error_codes_per_id.empty:
    print("\nNo IDs with errorCode 1001 found; skipping histogram.")
else:
    max_distinct_codes = int(error_codes_per_id.max())

    plt.figure(figsize=(10, 6))
    # discrete=True uses unit-width bins centred on each integer, so every bar
    # sits directly above its tick label; binrange keeps the axis starting at 1.
    sns.histplot(
        error_codes_per_id,
        discrete=True,
        binrange=(1, max_distinct_codes),
        kde=False,
    )
    plt.title('Number of Different Error Codes per ID')
    plt.xlabel('Number of Different Error Codes')
    plt.ylabel('Count of IDs')
    plt.xticks(range(1, max_distinct_codes + 1))
    plt.tight_layout()
    plt.show()

# Optional: list a few example IDs that reported more than one distinct
# error code, together with the distinct codes each of them reported.
ids_with_multiple_errors = error_codes_per_id[error_codes_per_id > 1].index
sample_multi_error_ids = ids_with_multiple_errors[:5]  # Take first 5 as examples

if not sample_multi_error_ids.empty:
    print("\nExample IDs with multiple error codes:")
    for id_val in sample_multi_error_ids:
        # IDs were selected by their number of *distinct* codes, so show each
        # code once (in order of first appearance) and leave out missing
        # values, which nunique() did not count either.
        errors = (
            relevant_records.loc[relevant_records['id'] == id_val, 'errorCode']
            .dropna()
            .unique()
        )
        print(f"ID {id_val} has error codes: {errors}")
else:
    print("\nNo IDs with multiple error codes found.")