In [1]:
import os
import re

import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Load the dataset; the file is tab-separated despite its .csv extension
dataset_path = "archive/data.csv"
dataset = pd.read_csv(dataset_path, sep='\t')

In [None]:
# Question numbers that make up the Anxiety scale (DASS keys), and the matching
# answer-column names of the form Q<number>A (Q2A, Q4A, Q7A, ...)
ANXIETY_QUESTIONS = [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41]
anxiety_columns = [f'Q{question}A' for question in ANXIETY_QUESTIONS]

In [None]:
# Build the anxiety frame in one chain:
#   1. keep only the anxiety answer columns,
#   2. subtract 1 from every value (turns the 1-4 answer scale into 0-3),
#   3. add the per-respondent total score as 'Total_Count'.
anxiety_data = (
    dataset[anxiety_columns]
    .sub(1)
    .assign(Total_Count=lambda answers: answers.sum(axis=1))
)

In [None]:
# Print a summary of the anxiety frame (columns, non-null counts, dtypes)
anxiety_data.info()

In [None]:
# Count rows flagged as duplicates of an earlier row (True) versus not (False)
anxiety_data.duplicated().value_counts()

In [None]:
# Number of distinct values found in each column
anxiety_data.nunique()

In [None]:
# List every column name of the anxiety frame, one per line
columns = anxiety_data.columns

for column in columns:
    print(column)

In [None]:
# Map a total anxiety score to its severity label
def get_anxiety_level(score):
    """Return the anxiety severity label for a total score.

    Bands (inclusive upper bounds): <= 7 'Normal', <= 9 'Mild',
    <= 14 'Moderate', <= 19 'Severe'; anything that matches none of
    these bounds is 'Extremely Severe'.
    """
    bands = (
        (7, 'Normal'),
        (9, 'Mild'),
        (14, 'Moderate'),
        (19, 'Severe'),
    )
    # Bands are ordered by ascending upper bound, so the first match wins.
    for upper_bound, level in bands:
        if score <= upper_bound:
            return level
    return 'Extremely Severe'

In [None]:
# Derive the severity label of every respondent from their total score
anxiety_data['Label'] = anxiety_data['Total_Count'].map(get_anxiety_level)

# The total score is no longer needed once the label exists, so leave it out
final_anxiety_data = anxiety_data.drop('Total_Count', axis=1)

In [None]:
# Prepare the data for the label-distribution chart.
# No figure is opened here: this cell draws nothing, and a figure created in a
# cell without any plotting is rendered as an empty canvas by the inline backend.

# Display order of the severity levels on the chart
desired_labels = ['Extremely Severe', 'Severe', 'Moderate', 'Mild', 'Normal']
# Number of respondents per severity label
label_counts = final_anxiety_data['Label'].value_counts()

In [None]:
# Bar chart of the number of respondents per anxiety level
colors = ['skyblue', 'green', 'yellow', 'orange', 'gray']
# fill_value=0 keeps a level that has no respondents as a zero-height bar
# (and an integer count) instead of NaN.
label_counts_ordered = label_counts.reindex(desired_labels, fill_value=0)

# Open the figure in the same cell that draws on it so the requested size
# actually applies to this chart.
plt.figure(figsize=(10, 6))
plt.bar(label_counts_ordered.index, label_counts_ordered.values, color=colors)

plt.xlabel('Anxiety Level')
plt.ylabel('Number of Respondents')
plt.title('Distribution of Anxiety Levels')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Show some basic statistics
print("\nDistribusi Label Anxiety:")
print(label_counts_ordered)
print("\nInformasi Dataset:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
final_anxiety_data.info()

In [None]:
# 1. Encode the categorical label as integers
# fit() returns the encoder itself, so it can be created and fitted in one step
label_encoder = LabelEncoder().fit(final_anxiety_data['Label'])
final_anxiety_data['Label_Encoded'] = label_encoder.transform(final_anxiety_data['Label'])

In [None]:
# 2. Separate the target (y) from the features (X)
y = final_anxiety_data['Label_Encoded']  # Target (the encoded label)
X = final_anxiety_data.drop(columns=['Label', 'Label_Encoded'])  # Features (questionnaire answers)

In [None]:
# 3. Standardise the features with StandardScaler
# NOTE(review): the scaler is fitted on every row of X here, i.e. before any
# train/test split has been made.
scaler = StandardScaler()
# Wrap the scaled array back into a DataFrame so the column names are kept
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
# 4. Split the data into training and testing sets (80:20)
# The split is made on the unscaled features and stratified on the label, so
# both subsets keep the class proportions of the full dataset.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Refit the scaler on the training rows only. A scaler fitted on the full
# dataset lets test-set statistics influence the transform (data leakage);
# rebinding `scaler` here also makes this train-only scaler the one that is
# used and saved from this point on.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
# The test rows are only transformed, never used for fitting
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

In [None]:
# 5. Report how the dataset was divided
total_rows = len(final_anxiety_data)
train_rows = len(X_train)
test_rows = len(X_test)

print("Ukuran dataset:")
print(f"Total dataset: {total_rows}")
print(f"Training set: {train_rows} samples")
print(f"Testing set: {test_rows} samples")

In [None]:
# 6. Visualise the encoded-label distribution of both subsets side by side
n_bins = len(label_encoder.classes_)  # one histogram bin per class
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (axes, encoded labels, title) for the left (training) and right (testing) panel
panels = (
    (axes[0], y_train, 'Distribusi Label Training Set'),
    (axes[1], y_test, 'Distribusi Label Testing Set'),
)
for ax, encoded_labels, title in panels:
    ax.hist(encoded_labels, bins=n_bins, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Label (Encoded)')
    ax.set_ylabel('Jumlah Samples')

fig.tight_layout()
plt.show()

In [None]:
# 7. Search for the optimal k
# Every candidate k is scored with 5-fold cross-validation on the training set
# only. Selecting k by its accuracy on the test set would tune the model on the
# very data used for the final evaluation and make that evaluation optimistic.
k_range = range(1, 31, 2)  # odd k values from 1 to 29
scores = []  # mean cross-validated accuracy, one entry per k in k_range

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_accuracies = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    scores.append(fold_accuracies.mean())

In [None]:
# Plot accuracy against k (blue circles joined by a line)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, scores, 'bo-')
ax.set_xlabel('Nilai k')
ax.set_ylabel('Akurasi')
ax.set_title('Akurasi Model vs Nilai k')
ax.grid(True)
plt.show()

In [None]:
# Pick the k with the highest score; max() keeps the first maximum it meets,
# so ties resolve to the smallest k
optimal_k, _best_score = max(zip(k_range, scores), key=lambda pair: pair[1])
print(f"\nNilai k optimal: {optimal_k}")

In [None]:
# Create the KNN model with the optimal k and train it on the training set
# (fit() returns the fitted estimator, so both steps are chained)
knn_model = KNeighborsClassifier(n_neighbors=optimal_k).fit(X_train, y_train)

# Predict the labels of the test set
y_pred = knn_model.predict(X_test)

In [None]:
# Evaluate the model: share of test samples whose predicted label is correct
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

In [None]:
# Classification report (precision / recall / F1 per class)
# `labels` is passed explicitly so each entry of `target_names` is tied to its
# encoded class id. Without it the labels are inferred from the values present
# in y_test / y_pred, and a class missing from both would leave the names
# misaligned with (or mismatched in number to) the reported rows.
class_ids = list(range(len(label_encoder.classes_)))
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

In [None]:
# Confusion matrix heatmap
# `labels` fixes the row/column order to the encoder's class order, so the
# matrix is always n_classes x n_classes and the tick labels below stay
# aligned even if a class is absent from both y_test and y_pred.
class_ids = list(range(len(label_encoder.classes_)))

plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=label_encoder.classes_,
           yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix - KNN')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Save the model, the label encoder and the scaler, each to its own pickle file
import pickle

# Output file name -> object to serialise (written in this order)
artifacts = {
    'knn_anxiety_model.pkl': knn_model,
    'label_encoder.pkl': label_encoder,
    'scaler.pkl': scaler,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)