In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold

years = [2022, 2021, 2020]
# Number of folds
n_folds = 5
# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)



# Initialize an empty list to store DataFrames
dfs = []

# Loop through each year, read the corresponding file and append to the list
for year in years:
    file_path = f'cleaned_datasets/Cleaned_Crimes_{year}.csv'  # Adjust the file path as needed
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all DataFrames in the list into one
combined_df = pd.concat(dfs, ignore_index=True)

# Parse the date column to extract day, month, and year
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df['Day'] = combined_df['Date'].dt.day
combined_df['Month'] = combined_df['Date'].dt.month
combined_df['Year'] = combined_df['Date'].dt.year

combined_df['Location Description'], _ = pd.factorize(combined_df['Location Description'])
combined_df['Primary Type'], _ = pd.factorize(combined_df['Primary Type'])

# Selecting relevant columns
features = ['Community Area', 'Location Description', 'Day', 'Month', 'Year']
target = 'Primary Type'

# Split the dataset
X = combined_df[features]
y = combined_df[target]

# Scale the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# models to train
models = {
    'Multinomial Logistic Regression': LogisticRegression(multi_class='multinomial', max_iter=1000),
    #'Decision Tree': DecisionTreeClassifier(),
    #'Random Forest': RandomForestClassifier(),
    #'Naïve Bayes': MultinomialNB(),
    #'K-nearest Neighbors': KNeighborsClassifier(),
    #'Support Vector Machine': SVC(),
    #'Neural Network': MLPClassifier(max_iter=1000)
}

for name, model in models.items():
    print(f"Training and evaluating model: {name}")

    accuracy_scores = []
    for train_index, val_index in skf.split(X_train, y_train):
        X_traink, X_valk = X_train[train_index], X_train[val_index]
        y_traink, y_valk = y_train[train_index], y_train[val_index]

        try:
            model.fit(X_train, y_train)
            predictions = model.predict(X_val)
            accuracy = accuracy_score(y_val, predictions)
            accuracy_scores.append(accuracy)
            print(f"Fold accuracy: {accuracy}")
        except Exception as e:
            print(f"Error training {name}: {e}")
            continue

    average_accuracy = np.mean(accuracy_scores)
    print(f"Average Accuracy for {name}: {average_accuracy}\n")


Training and evaluating model: Multinomial Logistic Regression


IndexError: index 131315 is out of bounds for axis 0 with size 131310