In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Read the raw records from the Excel workbook
data = pd.read_excel('Banduan_v2.xlsx')

# Integer-encode the categorical columns.
# Each column is cast to str before encoding so the encoder always receives
# string input. One encoder instance is refitted per column, so after the loop
# it only remembers the mapping of the column it was fitted on last.
label_encoder = LabelEncoder()
categorical_columns = [
    'jantina',
    'negeri',
    'bangsa',
    'kategori_hukuman',
    'banduan_berulang_kali',
    'kesalahan',
    'penyakit',
    'program_pemulihan',
    'status',
]
for column_name in categorical_columns:
    column_as_text = data[column_name].astype(str)
    data[column_name] = label_encoder.fit_transform(column_as_text)

# Synthetic 'Gang Affiliation' feature: one random category per row, then
# integer-encoded with the same encoder instance.
np.random.seed(0)  # fixed seed: makes both random draws in this cell repeatable
gang_affiliation_categories = ['SG', 'NG', 'DG', 'G']
n_rows = len(data)
data['Gang Affiliation'] = np.random.choice(gang_affiliation_categories, size=n_rows)
data['Gang Affiliation'] = label_encoder.fit_transform(data['Gang Affiliation'])

# Synthetic target: a random fight-involvement label per row.
# NOTE: these labels are drawn independently of every feature column.
fight_involvement_labels = ['No Fight', 'Fight', 'Alliance']
data['Fight Involvement'] = np.random.choice(fight_involvement_labels, size=n_rows)

In [11]:
# Columns used as model inputs
feature_columns = [
    'umur',
    'jantina',
    'bangsa',
    'tempoh_hukuman',
    'banduan_berulang_kali',
    'kesalahan',
    'penyakit',
    'program_pemulihan',
    'Gang Affiliation',
]
features = data[feature_columns]

# Encode the target labels as integers; this refit leaves `label_encoder`
# holding the target's class mapping for later decoding.
target = label_encoder.fit_transform(data['Fight Involvement'])

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Build and fit the decision tree on the training portion
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions (integer-encoded) for the held-out rows
y_pred = model.predict(X_test)

In [12]:
# Overall accuracy on the held-out rows
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Per-class precision / recall / F1. The class names come from `label_encoder`,
# so it must currently be fitted on the target labels.
print('Classification Report:')
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)

Accuracy: 0.3440740740740741
Classification Report:
              precision    recall  f1-score   support

    Alliance       0.35      0.36      0.35       880
       Fight       0.34      0.34      0.34       916
    No Fight       0.34      0.33      0.34       904

    accuracy                           0.34      2700
   macro avg       0.34      0.34      0.34      2700
weighted avg       0.34      0.34      0.34      2700



In [13]:
# Decode the integer predictions back to their original string labels
predicted_labels = label_encoder.inverse_transform(y_pred)

# Test-set features plus the predicted label, kept as a separate frame for inspection
test_set_with_predictions = X_test.copy()
test_set_with_predictions['Fight'] = predicted_labels

# Write the predictions into the original dataset as a new 'Fight' column.
# DataFrame.update() only overwrites columns that already exist in the target
# frame, so it would silently discard the new 'Fight' column. Assigning the
# Series instead aligns on the row index: rows in the test split receive their
# prediction, all other rows are left as NaN.
data['Fight'] = test_set_with_predictions['Fight']

# Save the dataset, now including the 'Fight' prediction column
data.to_excel('V2_fight_predictions.xlsx', index=False)

print("Predictions for missing values in 'Fight' column added and saved as 'V2_fight_predictions.xlsx'")

Predictions for missing values in 'Fight' column added and saved as 'V2_fight_predictions.xlsx'
