In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import gzip

# --- Configuration -----------------------------------------------------------
# Everything a reader may need to tune lives here instead of being buried in
# the calls below.
file_path = 'cleaned_crime_data.csv.gz'
# NOTE(review): the recorded run of this cell failed with
# KeyError: "['label'] not found in axis", i.e. the cleaned dataset has no
# column called 'label'. Set TARGET_COLUMN to the real target column name
# (inspect df.columns) before running the split.
TARGET_COLUMN = 'label'
TEST_SIZE = 0.3      # 70% training / 30% testing
RANDOM_STATE = 42    # fixed seed so the split is reproducible

# --- Load the cleaned dataset ------------------------------------------------
# pandas decompresses gzip itself and decodes as UTF-8 on every platform,
# whereas gzip.open(..., 'rt') silently falls back to the locale encoding.
df = pd.read_csv(file_path, compression='gzip')

# Fail early with an actionable message instead of the opaque error raised by
# DataFrame.drop when the target column is missing.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in {file_path!r}. "
        f"Set TARGET_COLUMN to one of the {len(df.columns)} available columns, "
        f"e.g. {list(df.columns[:10])} ..."
    )

# --- Split into training and testing sets using stratified sampling ----------
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,  # keep the class proportions identical in both partitions
    random_state=RANDOM_STATE,
)

# Every record must land in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(df)

# --- Display statistics for both sets to verify the proportions --------------
train_stats = y_train.value_counts(normalize=True)
test_stats = y_test.value_counts(normalize=True)

print("Training set label distribution:")
print(train_stats)
print("\nTest set label distribution:")
print(test_stats)

KeyError: "['label'] not found in axis"

In [3]:
# Diagnostic: list the column names actually present in the loaded frame, to
# find the real target column after the split cell failed on a missing 'label'.
print(df.columns)

Index(['Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No',
       'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes',
       ...
       'Premis Desc_VEHICLE STORAGE LOT (CARS, TRUCKS, RV'S, BOATS, TRAILERS, ETC.)',
       'Premis Desc_VEHICLE, PASSENGER/TRUCK',
       'Premis Desc_VETERINARIAN/ANIMAL HOSPITAL',
       'Premis Desc_VIDEO RENTAL STORE', 'Premis Desc_VISION CARE FACILITY*',
       'Premis Desc_WAREHOUSE', 'Premis Desc_WATER FACILITY',
       'Premis Desc_WEBSITE', 'Premis Desc_YARD (RESIDENTIAL/BUSINESS)',
       'Holiday'],
      dtype='object', length=339)


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

# Cross-validation settings, kept together so they are easy to tune.
N_SPLITS = 5      # number of stratified folds
CV_SEED = 42      # fixed seed so the fold assignment is reproducible
MAX_ITER = 1000   # the default of 100 may stop before the solver converges

# Step 1: Display the total count of records in the training and test sets
print(f"Total records in training set: {len(X_train)}")
print(f"Total records in test set: {len(X_test)}")

# Display the class distribution in both training and test sets
print("\nTraining set class distribution:")
print(y_train.value_counts(normalize=True))
print("\nTest set class distribution:")
print(y_test.value_counts(normalize=True))

# Step 2: Perform stratified k-fold cross-validation on the training set only;
# the held-out test set is never touched here.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)

# One record per fold, so the scores can be summarised after the loop instead
# of being lost once printed.
fold_scores = []

# Iterate through each fold (numbered from 1 for display)
for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train), 1):
    # skf.split yields positional indices, hence .iloc rather than .loc
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Display the number of samples and class distribution for each fold
    print(f"\nFold {fold}:")
    print(f"Number of training samples: {len(X_fold_train)}")
    print(f"Number of validation samples: {len(X_fold_val)}")
    print("Training set class distribution:")
    print(y_fold_train.value_counts(normalize=True))
    print("Validation set class distribution:")
    print(y_fold_val.value_counts(normalize=True))

    # Example: Train a simple model and evaluate (replace with your model and metrics)
    # NOTE(review): the column listing shows descriptive columns such as
    # 'AREA NAME' and 'Crm Cd Desc'; if these are non-numeric, fit() will fail
    # and they must be encoded or dropped first -- TODO confirm dtypes.
    # A fresh estimator per fold guarantees no state leaks between folds.
    model = LogisticRegression(max_iter=MAX_ITER)
    model.fit(X_fold_train, y_fold_train)
    y_pred = model.predict(X_fold_val)

    # Calculate and display evaluation metrics
    accuracy = accuracy_score(y_fold_val, y_pred)
    # 'weighted' averages the per-class F1 weighted by class support
    f1 = f1_score(y_fold_val, y_pred, average='weighted')
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")

    fold_scores.append({'fold': fold, 'accuracy': accuracy, 'f1_weighted': f1})

# Step 3: Summarise the metrics across folds (mean and spread)
cv_summary = pd.DataFrame(fold_scores).set_index('fold')
print("\nCross-validation summary across folds:")
print(cv_summary.agg(['mean', 'std']).round(4))