In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy import stats

import os
import json
import joblib
from datetime import datetime
import traceback
import sys

# Plot appearance: seaborn-flavoured matplotlib style sheet plus the "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Notebook banner, then a confirmation that the import cell ran to the end.
print("=" * 80, "AI LEARNING INSIGHT - MODEL DEVELOPMENT", "=" * 80, sep="\n")
print("Libraries imported successfully", end="\n\n")

AI LEARNING INSIGHT - MODEL DEVELOPMENT
Libraries imported successfully



In [2]:
print("STEP 1: LOADING DATA FROM DATA SPECIALIST")
print("-" * 80)

# Tabular inputs (outputs of the data-specialist stage).
# Note that the basic summary file is semicolon-separated, unlike the other two.
scaled_data = pd.read_csv('04_scaled_features_only_clean.csv')
ml_ready = pd.read_csv('03_ml_ready_dataset_clean.csv')
basic_summary = pd.read_csv('01_basic_user_summary_clean.csv', sep=';')

# Feature documentation (JSON)
with open('feature_documentation.json') as f:
    feature_docs = json.loads(f.read())

# Fitted scalers.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load .pkl
# files that come from a trusted source.
standard_scaler = joblib.load('standard_scaler.pkl')
minmax_scaler = joblib.load('minmax_scaler.pkl')

# Report the shape of each loaded table.
for _title, _frame in (
    ("Scaled data", scaled_data),
    ("ML ready", ml_ready),
    ("Basic summary", basic_summary),
):
    print(f"{_title} shape: {_frame.shape}")
print("Scalers loaded successfully\n")

STEP 1: LOADING DATA FROM DATA SPECIALIST
--------------------------------------------------------------------------------
Scaled data shape: (8, 49)
ML ready shape: (8, 77)
Basic summary shape: (8, 11)
Scalers loaded successfully



In [3]:
print("STEP 2: FEATURE PREPARATION")
print("-" * 80)

# Identifier column, kept aside so results can be attached back to users.
user_ids = scaled_data['user_id'].values

# Every column other than `user_id` is used as a clustering feature.
feature_columns = list(filter(lambda name: name != 'user_id', scaled_data.columns))
X_clustering = scaled_data[feature_columns].values
feature_names = feature_columns  # alias: same list object, used for reporting/metadata

print(
    f"Features selected: {len(feature_names)}",
    f"Feature matrix shape: {X_clustering.shape}",
    f"Sample features: {feature_names[:5]}\n",
    sep="\n",
)

STEP 2: FEATURE PREPARATION
--------------------------------------------------------------------------------
Features selected: 48
Feature matrix shape: (8, 48)
Sample features: ['active_days_week_standard', 'consistency_score_standard', 'unique_tutorials_standard', 'total_accesses_standard', 'completed_count_standard']



In [4]:
print("STEP 3: OPTIMAL K DETERMINATION")
print("-" * 80)

# Candidate cluster counts: 2..6, capped at n_samples - 1 because
# silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
k_range = range(2, min(7, len(X_clustering)))
if len(k_range) == 0:
    # Without this guard the loop below is skipped and np.argmax([]) fails
    # with an opaque "attempt to get argmax of an empty sequence".
    raise ValueError(
        f"Need at least 3 samples to compare cluster counts, got {len(X_clustering)}"
    )

# One list per metric, all indexed in step with metrics['k'].
metrics = {'k': [], 'wcss': [], 'silhouette': [], 'calinski': [], 'davies': []}

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
    labels = kmeans.fit_predict(X_clustering)

    metrics['k'].append(k)
    metrics['wcss'].append(kmeans.inertia_)
    metrics['silhouette'].append(silhouette_score(X_clustering, labels))
    metrics['calinski'].append(calinski_harabasz_score(X_clustering, labels))
    metrics['davies'].append(davies_bouldin_score(X_clustering, labels))

    print(f"k={k}: Silhouette={metrics['silhouette'][-1]:.3f}, "
          f"WCSS={metrics['wcss'][-1]:.2f}")

# Select optimal k based on silhouette score (np.argmax keeps the first
# maximum, i.e. the smallest k wins a tie).
optimal_idx = int(np.argmax(metrics['silhouette']))
optimal_k = metrics['k'][optimal_idx]
print(f"\nOptimal K selected: {optimal_k}")
print(f"Best Silhouette Score: {metrics['silhouette'][optimal_idx]:.3f}\n")

STEP 3: OPTIMAL K DETERMINATION
--------------------------------------------------------------------------------
k=2: Silhouette=0.315, WCSS=96.45
k=3: Silhouette=0.434, WCSS=50.44
k=4: Silhouette=0.364, WCSS=34.26
k=5: Silhouette=0.355, WCSS=19.83
k=6: Silhouette=0.331, WCSS=9.66

Optimal K selected: 3
Best Silhouette Score: 0.434



In [5]:
print("STEP 4: TRAINING FINAL CLUSTERING MODEL")
print("-" * 80)

# Fit KMeans once more with the selected number of clusters.
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10, max_iter=300)
cluster_labels = final_kmeans.fit_predict(X_clustering)

# Quality metrics and basic facts about the final fit (later written to metadata).
final_metrics = dict(
    n_clusters=optimal_k,
    silhouette_score=silhouette_score(X_clustering, cluster_labels),
    calinski_harabasz_score=calinski_harabasz_score(X_clustering, cluster_labels),
    davies_bouldin_score=davies_bouldin_score(X_clustering, cluster_labels),
    inertia=final_kmeans.inertia_,
    n_samples=len(X_clustering),
    n_features=X_clustering.shape[1],
    n_iter=final_kmeans.n_iter_,
)

print(
    "Clustering completed:",
    f"  Silhouette Score: {final_metrics['silhouette_score']:.4f}",
    f"  Calinski-Harabasz: {final_metrics['calinski_harabasz_score']:.2f}",
    f"  Davies-Bouldin: {final_metrics['davies_bouldin_score']:.4f}",
    f"  Iterations: {final_metrics['n_iter']}\n",
    sep="\n",
)

STEP 4: TRAINING FINAL CLUSTERING MODEL
--------------------------------------------------------------------------------
Clustering completed:
  Silhouette Score: 0.4342
  Calinski-Harabasz: 5.47
  Davies-Bouldin: 0.7587
  Iterations: 2



In [6]:
print("STEP 5: CONFIDENCE SCORE CALCULATION")
print("-" * 80)

# Distance from each user to every centroid; keep the smallest per user.
cluster_distances = final_kmeans.transform(X_clustering)
min_distances = cluster_distances.min(axis=1)

# Rescale so the user farthest from its nearest centroid gets 0 and a user
# sitting exactly on a centroid gets 1. If every distance is 0, all users get 1.
max_distance = min_distances.max()
if max_distance > 0:
    confidence_scores = 1 - (min_distances / max_distance)
else:
    confidence_scores = np.ones(len(X_clustering))

print(
    "Confidence scores calculated",
    f"  Range: {confidence_scores.min():.3f} - {confidence_scores.max():.3f}",
    f"  Mean: {confidence_scores.mean():.3f}",
    sep="\n",
)

# Bucket users into high / medium / low confidence bands.
high_conf = np.sum(confidence_scores >= 0.7)
med_conf = np.logical_and(confidence_scores >= 0.5, confidence_scores < 0.7).sum()
low_conf = np.sum(confidence_scores < 0.5)

print(
    f"  High (>=0.7): {high_conf} users ({high_conf/len(confidence_scores)*100:.1f}%)",
    f"  Medium (0.5-0.7): {med_conf} users ({med_conf/len(confidence_scores)*100:.1f}%)",
    f"  Low (<0.5): {low_conf} users ({low_conf/len(confidence_scores)*100:.1f}%)\n",
    sep="\n",
)

STEP 5: CONFIDENCE SCORE CALCULATION
--------------------------------------------------------------------------------
Confidence scores calculated
  Range: 0.000 - 0.983
  Mean: 0.381
  High (>=0.7): 2 users (25.0%)
  Medium (0.5-0.7): 0 users (0.0%)
  Low (<0.5): 6 users (75.0%)



In [10]:
print("STEP 6: LEARNING PATTERN ASSIGNMENT")
print("-" * 80)

def assign_learning_patterns(scaled_data, cluster_labels, optimal_k):
    """Label every cluster with a learning-pattern name and description.

    For each cluster id in ``range(optimal_k)`` the mean of three feature
    columns is computed over the cluster's rows and compared against fixed
    thresholds to choose one of "Consistent Learner", "Fast Learner" or
    "Reflective Learner".

    Args:
        scaled_data: DataFrame of per-user features. It must contain at least
            one column whose name contains 'consistency', one containing
            'completion' and one containing 'intensity' (case-insensitive);
            the first match of each, in column order, is used.
        cluster_labels: One cluster id per row of ``scaled_data``.
        optimal_k: Number of clusters; ids 0..optimal_k-1 are labelled.

    Returns:
        dict mapping cluster id (int) to
        ``{"pattern_name": str, "description": str}``.

    Raises:
        IndexError: If one of the three required feature columns is missing.
            The message names the keyword that could not be matched.
    """

    def _first_column_containing(keyword):
        """Return the first feature column whose name contains `keyword`."""
        for col in scaled_data.columns:
            # str() keeps non-string column labels from breaking the lookup.
            if col != 'cluster' and keyword in str(col).lower():
                return col
        # IndexError is kept (the unguarded `[...][0]` raised it before), but
        # now it says which feature is missing instead of "list index out of range".
        raise IndexError(
            f"No feature column containing '{keyword}' found; "
            f"available columns: {list(scaled_data.columns)}"
        )

    consistency_col = _first_column_containing('consistency')
    completion_col = _first_column_containing('completion')
    intensity_col = _first_column_containing('intensity')

    # Work on a copy so the caller's frame does not gain a 'cluster' column.
    df_analysis = scaled_data.copy()
    df_analysis['cluster'] = cluster_labels

    patterns = {}
    for cluster_id in range(optimal_k):
        cluster_data = df_analysis[df_analysis['cluster'] == cluster_id]

        # For a cluster with no rows these means are NaN, every comparison
        # below is False, and the cluster falls through to "Reflective Learner".
        consistency = cluster_data[consistency_col].mean()
        completion = cluster_data[completion_col].mean()
        intensity = cluster_data[intensity_col].mean()

        # Pattern assignment logic (first matching rule wins)
        if consistency > 0.3 and completion >= 0:
            pattern_name = "Consistent Learner"
            description = (
                "Kamu tipe belajar yang slow but steady! Konsistensi kamu tuh keren banget. "
                "Pelan tapi pasti. Setiap langkah kecil tetap punya impact, dan kamu selalu buktiin itu."
            )
        elif intensity > 0.2 and consistency > -0.5:
            pattern_name = "Fast Learner"
            description = (
                "Wah, kamu tipe speed learner! Materi baru langsung nyantol di otak. "
                "Tapi tetep ya, jangan sampai kecepatan bikin kamu skip kualitas!"
            )
        else:
            pattern_name = "Reflective Learner"
            description = (
                "Ini dia deep thinker! Kamu gak terburu-buru belajar, tapi setiap hal yang kamu pelajari "
                "itu benar-benar kamu cerna sampai paham luar dalam."
            )

        patterns[cluster_id] = {
            "pattern_name": pattern_name,
            "description": description
        }

    return patterns


# Label each cluster, then print a short preview (first 80 characters of the
# description) per cluster.
learning_patterns = assign_learning_patterns(scaled_data, cluster_labels, optimal_k)

print("Learning patterns identified:")
for cluster_id in learning_patterns:
    pattern = learning_patterns[cluster_id]
    print(
        f"\n  Cluster {cluster_id}: {pattern['pattern_name']}",
        f"    Description: {pattern['description'][:80]}...",
        sep="\n",
    )
print()

STEP 6: LEARNING PATTERN ASSIGNMENT
--------------------------------------------------------------------------------
Learning patterns identified:

  Cluster 0: Consistent Learner
    Description: Kamu tipe belajar yang slow but steady! Konsistensi kamu tuh keren banget. Pelan...

  Cluster 1: Fast Learner
    Description: Wah, kamu tipe speed learner! Materi baru langsung nyantol di otak. Tapi tetep y...

  Cluster 2: Fast Learner
    Description: Wah, kamu tipe speed learner! Materi baru langsung nyantol di otak. Tapi tetep y...



In [19]:
def get_consistency_description(score):
    """Translate a consistency score into a category and user-facing text.

    Scores strictly above 0.3 are "High Consistency", scores strictly above
    -0.5 (up to and including 0.3) are "Medium Consistency", and everything
    else -- including NaN, for which both comparisons are False -- is
    "Low Consistency".

    Returns:
        A new dict with the keys 'category' and 'description'.
    """
    # (exclusive lower bound, category, description), checked from the top tier down.
    tiers = (
        (
            0.3,
            "High Consistency",
            "Konsistensi kamu kelas expert! Disiplin banget sampai belajar udah kayak bagian dari rutinitas harian kamu.",
        ),
        (
            -0.5,
            "Medium Consistency",
            "Cukup konsisten! Kadang ada jeda kecil, tapi overall kamu masih on track. Bisa makin stabil kalau dibantu reminder atau mini-habit.",
        ),
    )
    for lower_bound, category, description in tiers:
        if score > lower_bound:
            return {'category': category, 'description': description}

    return {
        'category': "Low Consistency",
        'description': "Belajar kamu masih agak acak nih. Wajar kalau lagi banyak distraksi. Yuk pelan-pelan bangun habit biar ritme makin stabil.",
    }

In [14]:
def get_active_time_description(time_period):
    """Look up the persona name, clock range and blurb for a time period.

    Args:
        time_period: One of 'Morning', 'Afternoon', 'Evening', 'Late Night'.
            Any other value yields the 'Unknown' placeholder entry.

    Returns:
        A new dict with the keys 'period_name', 'time_range', 'description'.
    """
    fields = ('period_name', 'time_range', 'description')

    # period -> (period_name, time_range, description)
    catalogue = {
        'Morning': (
            'The Early Bird',
            '5 AM - 12 PM',
            'Kamu di pagi hari tuh kayak HP habis di-charge semalaman. Full battery dan fokus maksimal. Dunia masih sepi, vibe masih calm, dan itu bikin kamu gampang banget nyerap materi.',
        ),
        'Afternoon': (
            'The Prime-Time Learner',
            '12 PM - 5 PM',
            'Siang hari itu prime time kamu! Kepala udah gak ngantuk pagi, tapi juga belum masuk mode capek. Kombinasi perfect buat belajar dengan santai tapi tetap produktif.',
        ),
        'Evening': (
            'The Sunset Scholar',
            '5 PM - 10 PM',
            'Kamu tipe yang perlu warm-up dulu sebelum masuk learning mode. Pas sore ke malam, kamu baru dapet vibe yang pas lebih tenang, lebih fokus, dan akhirnya produktif banget. Evening study hits different buat kamu!',
        ),
        'Late Night': (
            'The Night Owl',
            '10 PM - 5 AM',
            'Kamu belajar saat orang lain udah tepar. Malem sunyi banget, dan itu bikin fokus kamu meningkat tajam. Cuma jangan lupa, quality sleep tetap penting ya! Biar gak jadi zombie keesokan harinya~',
        ),
    }
    placeholder = ('Unknown', 'Unknown', 'Data time period not available.')

    return dict(zip(fields, catalogue.get(time_period, placeholder)))

In [15]:
print("STEP 7: PREDICTIVE MODEL TRAINING")
print("-" * 80)

# Hold out 30% of the users to compare classifiers that learn to reproduce
# the KMeans cluster labels from the scaled features.
X_train, X_test, y_train, y_test = train_test_split(
    X_clustering, cluster_labels, test_size=0.3, random_state=42
)

# Candidate classifiers
models = {
    'random_forest': RandomForestClassifier(n_estimators=50, random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=3),
    'logistic': LogisticRegression(random_state=42, max_iter=1000)
}

best_score = 0
best_model_name = None
best_model = None

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # The first model is always accepted; after that only a strictly better
    # accuracy replaces it (ties keep the earlier model). Without the
    # `is None` check, a run where every model scores 0.0 would leave
    # best_model_name as None and the lookup below would raise KeyError.
    if best_model is None or accuracy > best_score:
        best_score = accuracy
        best_model_name = name
        best_model = model

    print(f"{name}: Accuracy={accuracy:.3f}")

print(f"\nBest model: {best_model_name} (Accuracy: {best_score:.3f})")

# Train final predictor on all data (re-fits the selected estimator in place,
# so `best_model` and `final_predictor` are the same object afterwards).
final_predictor = models[best_model_name]
final_predictor.fit(X_clustering, cluster_labels)
print("Final predictor trained on full dataset\n")

STEP 7: PREDICTIVE MODEL TRAINING
--------------------------------------------------------------------------------
random_forest: Accuracy=0.333
knn: Accuracy=0.000
logistic: Accuracy=0.333

Best model: random_forest (Accuracy: 0.333)
Final predictor trained on full dataset



In [24]:
print("STEP 8: USER INSIGHTS GENERATION")
print("-" * 80)

# First column whose name contains 'consistency'; its per-user value drives
# the consistency category below.
consistency_col = [col for col in scaled_data.columns if 'consistency' in col.lower()][0]

user_insights = []

for idx in range(len(scaled_data)):
    user_id = user_ids[idx]
    cluster_id = cluster_labels[idx]
    pattern = learning_patterns[cluster_id]

    # Profile row for this user (first match wins if the id is duplicated).
    user_info = basic_summary[basic_summary['User ID'] == user_id]
    if len(user_info):
        display_name = user_info['Display Name'].iloc[0]
        most_active_time = user_info['Most Active Time'].iloc[0]
    else:
        display_name = most_active_time = 'Unknown'

    # An empty CSV cell is read as NaN, which json would emit as a bare `NaN`
    # token (not valid JSON). Use the same 'Unknown' placeholder as for a
    # user with no profile row.
    if pd.isna(display_name):
        display_name = 'Unknown'
    if pd.isna(most_active_time):
        most_active_time = 'Unknown'

    # Consistency score of this user
    consistency_score = float(scaled_data[consistency_col].iloc[idx])

    # Descriptions
    time_description = get_active_time_description(most_active_time)
    consistency_description = get_consistency_description(consistency_score)

    # Final output with ONLY 3 FEATURES
    insight = {
        'user_id': int(user_id),
        'display_name': display_name,

        # 1. Learning Pattern
        'learning_pattern': {
            'name': pattern['pattern_name'],
            'description': pattern['description'],
        },

        # 2. Most Active Time
        'most_active_time': {
            'period': most_active_time,
            'period_name': time_description['period_name'],
            'description': time_description['description']
        },

        # 3. Consistency Score
        'consistency': {
            'category': consistency_description['category'],
            'description': consistency_description['description']
        }
    }

    user_insights.append(insight)

print(f"Generated insights for {len(user_insights)} users")
# Show one example; skipped when there are no users so the cell cannot fail
# on user_insights[0].
if user_insights:
    print("\nUser insight:")
    print(json.dumps(user_insights[0], indent=4))

STEP 8: USER INSIGHTS GENERATION
--------------------------------------------------------------------------------
Generated insights for 8 users

User insight:
{
    "user_id": 96989,
    "display_name": "igihcksn",
    "learning_pattern": {
        "name": "Fast Learner",
        "description": "Wah, kamu tipe speed learner! Materi baru langsung nyantol di otak. Tapi tetep ya, jangan sampai kecepatan bikin kamu skip kualitas!"
    },
    "most_active_time": {
        "period": "Afternoon",
        "period_name": "The Prime-Time Learner",
        "description": "Siang hari itu prime time kamu! Kepala udah gak ngantuk pagi, tapi juga belum masuk mode capek. Kombinasi perfect buat belajar dengan santai tapi tetap produktif."
    },
    "consistency": {
        "category": "High Consistency",
        "description": "Konsistensi kamu kelas expert! Disiplin banget sampai belajar udah kayak bagian dari rutinitas harian kamu."
    }
}


In [27]:
print("STEP 9: TIME PERIOD DISTRIBUTION CALCULATION")
print("-" * 80)

# Load the tracking data from Excel.
trackings_time = pd.read_excel('developer_journey_trackings.xlsx')
print(f"Loaded {trackings_time.shape[0]} tracking records")

# Parse the view timestamp and derive the hour of day from it. The second
# callable sees the already-parsed `last_viewed` column.
trackings_time = trackings_time.assign(
    last_viewed=lambda frame: pd.to_datetime(frame['last_viewed']),
    hour=lambda frame: frame['last_viewed'].dt.hour,
)

def get_time_period(hour):
    """Map an hour of the day to a time-period label.

    05-11 -> 'Morning', 12-16 -> 'Afternoon', 17-21 -> 'Evening', everything
    else -> 'Late Night'. A missing hour (None/NaN) returns None.
    """
    if pd.isna(hour):
        return None

    # (exclusive upper bound, label), scanned in ascending order.
    boundaries = (
        (5, 'Late Night'),
        (12, 'Morning'),
        (17, 'Afternoon'),
        (22, 'Evening'),
    )
    for upper_bound, label in boundaries:
        if hour < upper_bound:
            return label
    return 'Late Night'

# Tag every tracking row with its time-of-day bucket.
trackings_time['time_period'] = trackings_time['hour'].apply(get_time_period)

# The user key may be named `developer_id`; normalise it to `user_id`.
if 'developer_id' in trackings_time.columns:
    trackings_time = trackings_time.rename(columns={'developer_id': 'user_id'})

# Long format: one row per (user, period) with the number of accesses.
time_distribution = (
    trackings_time
    .groupby(['user_id', 'time_period'])
    .size()
    .reset_index(name='access_count')
)

# Wide format: one row per user, one count column per period.
time_dist_pivot = time_distribution.pivot(
    index='user_id', columns='time_period', values='access_count'
)
time_dist_pivot = time_dist_pivot.fillna(0).astype(int).reset_index()

time_periods = ['Morning', 'Afternoon', 'Evening', 'Late Night']

# A period nobody used has no column after the pivot; add it as all zeros.
for period in time_periods:
    if period not in time_dist_pivot.columns:
        time_dist_pivot[period] = 0

# Total accesses per user across the four periods.
time_dist_pivot['total'] = sum(time_dist_pivot[period] for period in time_periods)

# Share of each period in percent, one decimal; an undefined share becomes 0.
for period in time_periods:
    percentage = time_dist_pivot[period] / time_dist_pivot['total'] * 100
    time_dist_pivot[f'{period}_percentage'] = percentage.round(1).fillna(0)

print(f"Time distribution calculated for {len(time_dist_pivot)} users\n")

STEP 9: TIME PERIOD DISTRIBUTION CALCULATION
--------------------------------------------------------------------------------
Loaded 6641 tracking records
Time distribution calculated for 8 users



In [31]:
print("STEP 10: SAVE MODELS & ARTIFACTS")
print("-" * 80)

# Create the output directory (no error if it already exists).
output_dir = 'ml_models_output'
os.makedirs(output_dir, exist_ok=True)

# 1. Save the fitted KMeans clustering model.
joblib.dump(final_kmeans, f'{output_dir}/clustering_model.pkl')
print("Clustering model saved")

# 2. Save the classifier that was re-fitted on the full dataset.
joblib.dump(final_predictor, f'{output_dir}/predictor_model.pkl')
print("Predictor model saved")

# 3. Save the cluster -> learning pattern mapping.
#    The dict keys are integer cluster ids; json.dump writes them as strings,
#    so a consumer has to look patterns up by "0", "1", ...
with open(f'{output_dir}/learning_patterns.json', 'w') as f:
    json.dump(learning_patterns, f, indent=4)
print("Learning patterns saved")

# 4. Save the per-user insights (learning pattern, most active time, consistency).
with open(f'{output_dir}/user_insights.json', 'w') as f:
    json.dump(user_insights, f, indent=4)
print("User insights saved")

# 5. Save one row per user: cluster id, confidence score and pattern name.
results_df = pd.DataFrame({
    'user_id': user_ids,
    'cluster': cluster_labels,
    'confidence_score': confidence_scores,
    'learning_pattern': [learning_patterns[c]['pattern_name'] for c in cluster_labels]
})
results_df.to_csv(f'{output_dir}/clustering_results.csv', index=False)
print("Clustering results saved")

# 6. Save the per-user time-period counts and percentages.
time_dist_pivot.to_csv(f'{output_dir}/time_distribution.csv', index=False)
print("Time distribution saved")

# 7. Save model metadata: when and on what the models were trained, the
#    clustering metrics, the confidence buckets and the selected predictor.
metadata = dict(
    training_date=f"{datetime.now():%Y-%m-%d %H:%M:%S}",
    n_samples=X_clustering.shape[0],
    n_features=X_clustering.shape[1],
    optimal_k=optimal_k,
    feature_names=feature_names,
    metrics=final_metrics,
    # Cast to plain int so the counts are JSON-serialisable.
    confidence_distribution=dict(
        high_confidence=int(high_conf),
        medium_confidence=int(med_conf),
        low_confidence=int(low_conf),
    ),
    predictor_model=best_model_name,
    predictor_accuracy=float(best_score),
)

with open(f'{output_dir}/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=4)
print("Model metadata saved")

# 8. Save API documentation: a machine-readable description of the three
#    endpoints and the models behind them.
api_doc = {
    'version': '1.0',
    'endpoints': {
        'user_insight': {
            'method': 'GET',
            'path': '/api/v1/user-insight/{user_id}',
            'description': 'Get complete user insight with 3 features: Learning Pattern, Most Active Time, and Consistency',
            'output_features': {
                'learning_pattern': {
                    'name': 'string',
                    'description': 'string'
                },
                'most_active_time': {
                    'period': 'string',
                    'period_name': 'string',
                    'description': 'string'
                },
                'consistency': {
                    'category': 'string',
                    'description': 'string'
                }
            }
        },
        'time_distribution': {
            'method': 'GET',
            'path': '/api/v1/time-distribution/{user_id}',
            'description': 'Get time period distribution for a user',
            'output': {
                'user_id': 'integer',
                'Morning': 'integer (count)',
                'Afternoon': 'integer (count)',
                'Evening': 'integer (count)',
                'Late Night': 'integer (count)',
                'Morning_percentage': 'float',
                'Afternoon_percentage': 'float',
                'Evening_percentage': 'float',
                'Late Night_percentage': 'float',
                'total': 'integer'
            }
        },
        'predict_cluster': {
            'method': 'POST',
            'path': '/api/v1/predict-cluster',
            'description': 'Predict cluster for new user based on features',
            # Derived from the training matrix so the documented input size
            # cannot drift from the model when the feature set changes.
            'input': f'Array of {X_clustering.shape[1]} scaled features',
            'output': {
                'predicted_cluster': 'integer',
                'learning_pattern_name': 'string',
                'confidence_score': 'float (0-1)'
            }
        }
    },
    'models': {
        'clustering': 'KMeans',
        'predictor': best_model_name,
        'n_clusters': optimal_k
    }
}

with open(f'{output_dir}/api_documentation.json', 'w') as f:
    json.dump(api_doc, f, indent=4)
print("API documentation saved")

# Closing summary: banner, output location and a numbered list of the
# artefact files written above.
saved_files = [
    "clustering_model.pkl",
    "predictor_model.pkl",
    "learning_patterns.json",
    "user_insights.json",
    "clustering_results.csv",
    "time_distribution.csv",
    "model_metadata.json",
    "api_documentation.json",
]

print("\n" + "=" * 80)
print("ALL MODELS & ARTIFACTS SAVED SUCCESSFULLY")
print("=" * 80)
print(f"Output directory: {output_dir}/")
print(f"Files saved: {len(saved_files)}")
print("\nFiles:")
for position, filename in enumerate(saved_files, start=1):
    print(f"  {position}. {filename}")
print("\nReady for deployment!")

STEP 10: SAVE MODELS & ARTIFACTS
--------------------------------------------------------------------------------
Clustering model saved
Predictor model saved
Learning patterns saved
User insights saved
Clustering results saved
Time distribution saved
Model metadata saved
API documentation saved

ALL MODELS & ARTIFACTS SAVED SUCCESSFULLY
Output directory: ml_models_output/
Files saved: 8

Files:
  1. clustering_model.pkl
  2. predictor_model.pkl
  3. learning_patterns.json
  4. user_insights.json
  5. clustering_results.csv
  6. time_distribution.csv
  7. model_metadata.json
  8. api_documentation.json

Ready for deployment!
