Preprocess the Data

In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import pickle


# Daten werden eingelesen
crimes_dataset = pd.read_csv(r"Crimes_Dataset.csv")
suspects_dataset = pd.read_csv(r"Suspects_Dataset.csv")

#kleinbuchstaben
crimes_dataset = crimes_dataset.applymap(lambda x: x.lower() if isinstance(x, str) else x)   # Alle Zeichenfolgen in den DataFrame in Kleinbuchstaben umwandeln
crimes_dataset

suspects_dataset = suspects_dataset.applymap(lambda x: x.lower() if isinstance(x, str) else x)    # Alle Zeichenfolgen in den DataFrame in Kleinbuchstaben umwandeln
suspects_dataset

merged_df = pd.merge(crimes_dataset, suspects_dataset, left_on="Index_Crimes", right_on="Index_Monster")
merged_df

merged_df.to_csv('merged_df.csv', index=False)

In [15]:
# Laden des Datensatzes
file_path = "merged_df.csv"  # Pfad zu zusammengeführten Datei
df = pd.read_csv(file_path)

# Ersten Blick auf den Datensatz werfen
print(df.head())

# Datenvorbereitung
# Umwandlung der Datums-Spalte in das richtige Format
df['Date'] = pd.to_datetime(df['Date'])

# Label Encoding für die Spalten mit kategorischen Daten
label_cols = ['Crime Type', 'Region', 'Crime Weapon', 'Time of Day', 'Evidence Found', "Criminal record", "Allergy", "Favorite Food"]
le = LabelEncoder()

for col in label_cols:
    df[col] = le.fit_transform(df[col].astype(str))

# Entfernen von möglichen fehlenden Werten (falls vorhanden)
df = df.dropna()

# Features und Zielvariable definieren
X = df[['Crime Type', 'Region', 'Crime Weapon', 'Time of Day', 'Evidence Found', "Criminal record", "Allergy", "Favorite Food"]]
y = df['Monster involved']

# Zielvariable in numerische Werte umwandeln
y = le.fit_transform(y)


# Daten in Trainings- und Testdaten aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verschiedene Modelle trainieren und vergleichen
models = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Beste Modellbewertung
best_model = None
best_score = 0

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {model_name}: {accuracy:.2f}")
    if accuracy > best_score:
        best_score = accuracy
        best_model = model_name

print(f"\nBest Model: {best_model} with Accuracy: {best_score:.2f}")

# Ausführlicher Bericht des besten Modells
best_model_instance = models[best_model]
y_pred_best = best_model_instance.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

   Index_Crimes        Date Monster involved  Days of Investigation    Region  \
0          6316  2020-02-18         skeleton                   77.0  mountain   
1          4731  2022-09-01         werewolf                   48.0    forest   
2          1750  2022-08-03            ghost                   31.0    castle   
3          4789  2023-10-18            witch                   29.0     swamp   
4          4567  2021-03-25            ghost                   59.0   village   

            Crime Type Crime Weapon Time of Day Evidence Found  Index_Monster  \
0  nightly disturbance          NaN       night          bones           6316   
1              assault       pistol       night          teeth           4731   
2  nightly disturbance          NaN       night        potions           1750   
3                arson       pistol        dawn        potions           4789   
4  nightly disturbance       pistol       night        potions           4567   

    Monster Criminal recor

In [16]:
#Modell abspeichern

with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(best_model_instance, file)



In [17]:
# Laden des Modells
with open('random_forest_model.pkl', 'rb') as file:
    best_model_instance = pickle.load(file)

In [18]:
new_data = pd.DataFrame({
    'Crime Type': ["kidnapping"],  # Beispielwert (z.B. 1 für 'Vampire')      kidnapping 
              
    'Region': ["village"],  # Beispielwert für 'castle'  village
    'Crime Weapon': ["knife"],  # Beispielwert für 'axe'     knife
    'Time of Day': ["day"],  # Beispielwert für 'night'          day
    'Evidence Found': ["bones"] , # Beispielwert für 'blood'         bones 
    "Criminal record": ["yes"]  ,  #Beispielwert für criminal record        ja
    "Allergy":["sunlight"]    ,        #Beispielwert fpr allergie            sonne
    "Favorite Food":["bones"]            #Beispielwert für lieblingsessen  knochen
    
}) 

label_cols = ['Crime Type', 'Region', 'Crime Weapon', 'Time of Day', 'Evidence Found', "Criminal record", "Allergy", "Favorite Food"]
le = LabelEncoder()

for col in label_cols:
    new_data[col] = le.fit_transform(new_data[col].astype(str))

# Vorhersagen mit dem trainierten Modell
new_prediction = best_model_instance.predict(new_data)


# Rücktransformation der Vorhersage in den ursprünglichen Crime Type
#predicted_crime_type = le.inverse_transform([1])

# Ausgabe der tatsächlichen Vorhersage
#print("Vorhergesagter Crime Type für die neuen Daten:", predicted_crime_type)
 

new_prediction 

array([2])

In [19]:
from sklearn.preprocessing import LabelEncoder

# Angenommen, dies ist der ursprüngliche LabelEncoder für 'Crime Type'
label_encoder = LabelEncoder()

# Fitte den Encoder auf die Trainingsdaten (angenommen df['Crime Type'] ist der Trainingsdatensatz)
label_encoder.fit(df['Monster involved'])

# Zeige das Mapping von Klassen (Kategorien) zu numerischen Werten
print("Mapping von Crime Type zu numerischen Werten:")
for index, class_name in enumerate(label_encoder.classes_):
    print(f"'{class_name}' is encoded as {index}")

Mapping von Crime Type zu numerischen Werten:
'ghost' is encoded as 0
'skeleton' is encoded as 1
'vampire' is encoded as 2
'werewolf' is encoded as 3
'witch' is encoded as 4
'zombie' is encoded as 5


Grid search oder random search / cross validation