Nama : Anjelia Hidayat

Nim  : 09011182429012

Kelas : SK3A

Mata Kuliah : Data Mining


Proses Exploratory Data Analysis (EDA)

In [None]:
# -----------------------------
# Load Data
# Read the Foodpanda CSV file into a pandas DataFrame.
# -----------------------------

import pandas as pd

csv_path = '/content/drive/MyDrive/dataset.kaggle/Foodpanda Analysis Dataset.csv'
df = pd.read_csv(csv_path)

# Last expression of the cell: the notebook renders the first 10 rows.
df.head(n=10)

In [None]:
# -----------------------------
# Initial Data Exploration
# Preview the first rows and print general information about the frame
# (column dtypes and non-null counts).
# -----------------------------

import pandas as pd

# Reuse `df` when an earlier cell already loaded it; otherwise load it here so
# this cell can also be run on its own.
try:
    df.head()
except NameError:
    df = pd.read_csv('/content/drive/MyDrive/dataset.kaggle/Foodpanda Analysis Dataset.csv')

display(df.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None" line.
df.info()


In [None]:
# -----------------------------
# Data Cleaning
# Report how many values are missing in each column and how many rows are
# exact duplicates. Nothing is removed or imputed in this cell.
# -----------------------------

missing_values = df.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

duplicate_mask = df.duplicated()
duplicate_rows = duplicate_mask.sum()
print("\nTotal number of duplicate rows:", duplicate_rows)

In [None]:
# -----------------------------
# Outlier detection
# Visual check with a box plot, then a numeric check with z-scores.
# -----------------------------

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Numerical columns inspected for outliers
num_cols = ['quantity', 'price', 'order_frequency', 'loyalty_points', 'rating']

# Box plot of all numerical columns side by side
plt.figure(figsize=(8, 5))
sns.boxplot(data=df[num_cols])
plt.title("Deteksi Outlier Menggunakan Boxplot (IQR)")
plt.xticks(rotation=45)
plt.show()

print("\n=== Deteksi Outlier Menggunakan Z-Score ===")
# nan_policy='omit' ignores missing values when computing each column's mean
# and standard deviation. With the default ('propagate') a single NaN turns
# every z-score of that column into NaN, and NaN > threshold is False, so the
# column would silently report no outliers at all.
z_scores = np.abs(stats.zscore(df[num_cols], nan_policy='omit'))
threshold = 3

# Keep every row where at least one column exceeds the threshold
outliers_z = df[(z_scores > threshold).any(axis=1)]
print("Jumlah Outlier (Z-Score):", len(outliers_z))

if not outliers_z.empty:
    print("\n=== Contoh Outlier (Z-Score) ===")
    display(outliers_z.head())
else:
    print("Tidak ada outlier yang terdeteksi menggunakan Z-Score (threshold=3).")

In [None]:
# -----------------------------
# Descriptive Analysis
# Summary statistics produced by DataFrame.describe() with default settings.
# -----------------------------

numeric_summary = df.describe()
display(numeric_summary)

In [None]:
# -----------------------------
# Descriptive Analysis
# Summary statistics restricted to the object-dtype (non-numeric) columns.
# -----------------------------

object_summary = df.describe(include='object')
display(object_summary)

In [None]:
# -----------------------------
# Univariate Analysis
# Plot the distribution of each variable on its own: histograms for the
# numerical columns and count plots for the categorical columns.
# -----------------------------

import matplotlib.pyplot as plt
import seaborn as sns

numerical_cols = ['quantity', 'price', 'order_frequency', 'loyalty_points', 'rating']
categorical_cols = ['gender', 'age', 'city', 'category', 'payment_method', 'churned', 'delivery_status']

# Histograms with a KDE overlay, laid out on a 2x3 grid
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols):
    plt.subplot(2, 3, i + 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# Count plots, laid out on a 3x3 grid
plt.figure(figsize=(15, 10))
for i, col in enumerate(categorical_cols):
    plt.subplot(3, 3, i + 1)
    # Passing `palette` without `hue` is deprecated in seaborn 0.13. Mapping
    # hue to the same column and hiding the redundant legend keeps the colours
    # and matches the form already used by the category/price plots later in
    # this notebook.
    sns.countplot(data=df, x=col, hue=col, palette='viridis', legend=False)
    plt.title(f'Distribution of {col}')
    plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# -----------------------------
# Bivariate Analysis
# Explore relationships between pairs of variables with histograms,
# box plots, scatter plots, bar plots and correlation matrices.
# -----------------------------

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# =====================================================
# Pair 1: price vs loyalty_points
# =====================================================

# Marker style used to draw the group mean inside each box
mean_marker = {"marker": "o", "markerfacecolor": "white", "markeredgecolor": "black"}

# --- Histogram of price ---
plt.figure(figsize=(6, 4))
sns.histplot(df['price'], kde=True, color='skyblue')
plt.title('Histogram of Price')
plt.show()

# --- Box plot: one box of price per distinct loyalty_points value ---
plt.figure(figsize=(8, 5))
sns.boxplot(
    x='loyalty_points', y='price', data=df,
    palette='Set2', showmeans=True, meanprops=mean_marker,
)
plt.xlabel('Loyalty Points')
plt.ylabel('Price')
plt.title('Boxplot of Price by Loyalty Points')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- Scatter plot ---
plt.figure(figsize=(6, 4))
sns.scatterplot(x='price', y='loyalty_points', data=df)
plt.title('Scatterplot of Price vs Loyalty Points')
plt.show()

# --- Bar plot: bar height is the mean price per loyalty_points value ---
plt.figure(figsize=(8, 5))
sns.barplot(x='loyalty_points', y='price', data=df, palette='husl', errorbar=None)
plt.xlabel('Loyalty Points')
plt.ylabel('Average Price')
plt.title('Average Price by Loyalty Points')
plt.tight_layout()
plt.show()

# --- Correlation matrix of the two columns ---
pair_corr = df[['price', 'loyalty_points']].corr()
plt.figure(figsize=(5, 4))
sns.heatmap(pair_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix: Price & Loyalty Points')
plt.show()



In [None]:
# =====================================================
# Pair 2: price vs rating
# =====================================================

# Marker style used to draw the group mean inside each box
mean_marker = {"marker": "o", "markerfacecolor": "white", "markeredgecolor": "black"}

# --- Histogram of rating ---
plt.figure(figsize=(6, 4))
sns.histplot(df['rating'], kde=True, color='lightgreen')
plt.title('Histogram of Rating')
plt.show()

# --- Box plot: price spread for each rating value ---
plt.figure(figsize=(8, 5))
sns.boxplot(
    x='rating', y='price', data=df,
    palette='Set3', showmeans=True, meanprops=mean_marker,
)
plt.title('Boxplot of Price by Rating')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- Scatter plot ---
plt.figure(figsize=(6, 4))
sns.scatterplot(x='price', y='rating', data=df)
plt.title('Scatterplot of Price vs Rating')
plt.show()

# --- Bar plot: mean price per rating value, no error bars ---
plt.figure(figsize=(8, 5))
sns.barplot(x='rating', y='price', data=df, palette='husl', errorbar=None)
plt.title('Average Price by Rating')
plt.tight_layout()
plt.show()

# --- Correlation matrix of the two columns ---
pair_corr = df[['price', 'rating']].corr()
plt.figure(figsize=(5, 4))
sns.heatmap(pair_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix: Price & Rating')
plt.show()

In [None]:
# =====================================================
# Pair 3: rating vs loyalty_points
# =====================================================

# Marker style used to draw the group mean inside each box
mean_marker = {"marker": "o", "markerfacecolor": "white", "markeredgecolor": "black"}

# --- Histogram of loyalty_points ---
plt.figure(figsize=(6, 4))
sns.histplot(df['loyalty_points'], kde=True, color='orange')
plt.title('Histogram of Loyalty Points')
plt.show()

# --- Box plot: loyalty_points spread for each rating value ---
plt.figure(figsize=(8, 5))
sns.boxplot(
    x='rating', y='loyalty_points', data=df,
    palette='Set2', showmeans=True, meanprops=mean_marker,
)
plt.title('Boxplot of Loyalty Points by Rating')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- Scatter plot ---
plt.figure(figsize=(6, 4))
sns.scatterplot(x='rating', y='loyalty_points', data=df)
plt.title('Scatterplot of Rating vs Loyalty Points')
plt.show()

# --- Bar plot: mean loyalty_points per rating value, no error bars ---
plt.figure(figsize=(8, 5))
sns.barplot(x='rating', y='loyalty_points', data=df, palette='husl', errorbar=None)
plt.title('Average Loyalty Points by Rating')
plt.tight_layout()
plt.show()

# --- Correlation matrix of the two columns ---
pair_corr = df[['rating', 'loyalty_points']].corr()
plt.figure(figsize=(5, 4))
sns.heatmap(pair_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix: Rating & Loyalty Points')
plt.show()


In [None]:
# =====================================================
# Pair 4: category vs price
# =====================================================

# Marker style used to draw the group mean inside each box
mean_marker = {"marker": "o", "markerfacecolor": "white", "markeredgecolor": "black"}

# --- Histogram of price ---
plt.figure(figsize=(6, 4))
sns.histplot(df['price'], kde=True, color='pink')
plt.title('Histogram of Price')
plt.show()

# --- Box plot: price spread per category ---
# hue repeats the x variable so the palette applies per category; the legend
# would only duplicate the x tick labels, so it is switched off.
plt.figure(figsize=(8, 5))
sns.boxplot(
    x='category', y='price', hue='category', data=df,
    palette='Set3', legend=False,
    showmeans=True, meanprops=mean_marker,
)
plt.title('Boxplot of Price by Category')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- Strip plot: individual prices per category, jittered horizontally ---
plt.figure(figsize=(8, 4))
sns.stripplot(
    x='category', y='price', hue='category', data=df,
    palette='Set2', legend=False, jitter=True,
)
plt.title('Scatter-like Plot of Category vs Price')
plt.xticks(rotation=45)
plt.show()

# --- Bar plot: mean price per category, no error bars ---
plt.figure(figsize=(8, 5))
sns.barplot(
    x='category', y='price', hue='category', data=df,
    palette='husl', legend=False, errorbar=None,
)
plt.title('Average Price by Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- Correlation heatmap ---
# NOTE(review): a single-column correlation matrix is a 1x1 matrix (price with
# itself), so this figure carries no information about category.
price_corr = df[['price']].corr()
plt.figure(figsize=(5, 4))
sns.heatmap(price_corr, annot=True, cmap='coolwarm')
plt.title('Correlation (Price Only)')
plt.show()

In [None]:
# =====================================================
# Pair 5: churned vs loyalty_points
# =====================================================

# --- Histogram of loyalty_points ---
plt.figure(figsize=(6,4))
sns.histplot(df['loyalty_points'], kde=True, color='violet')
plt.title('Histogram of Loyalty Points')
plt.show()

# --- Box plot: loyalty_points spread per churned status ---
# `palette` without `hue` is deprecated in seaborn 0.13; hue is mapped to the
# x variable and the redundant legend hidden, the same form the category/price
# plots in this notebook already use.
plt.figure(figsize=(8,5))
sns.boxplot(
    data=df, x='churned', y='loyalty_points', hue='churned',
    palette='Set2', showmeans=True, legend=False,
    meanprops={"marker": "o", "markerfacecolor": "white", "markeredgecolor": "black"}
)
plt.title('Boxplot of Loyalty Points by Churned Status')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- Strip plot: individual loyalty_points values per churned status ---
plt.figure(figsize=(6,4))
sns.stripplot(data=df, x='churned', y='loyalty_points', jitter=True,
              hue='churned', palette='Set3', legend=False)
plt.title('Scatter-like Plot of Churned vs Loyalty Points')
plt.show()

# --- Bar plot: mean loyalty_points per churned status, no error bars ---
plt.figure(figsize=(8,5))
sns.barplot(
    data=df, x='churned', y='loyalty_points', hue='churned',
    palette='husl', errorbar=None, legend=False
)
plt.title('Average Loyalty Points by Churned Status')
plt.tight_layout()
plt.show()

# --- Correlation heatmap ---
# NOTE(review): a single-column correlation matrix is a 1x1 matrix
# (loyalty_points with itself), so this figure says nothing about churned.
plt.figure(figsize=(5,4))
sns.heatmap(df[['loyalty_points']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation (Loyalty Points Only)')
plt.show()

In [None]:
# ============================================================
# Correlation Heatmap
# Pairwise correlation of all numeric columns of the raw dataset.
# ============================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the dataset from disk so the heatmap reflects the file contents.
try:
    df = pd.read_csv('/content/drive/MyDrive/dataset.kaggle/Foodpanda Analysis Dataset.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down (discarding all state) rather than surfacing the problem,
    # whereas an exception stops this cell and shows the message.
    raise FileNotFoundError(
        "Error: Dataset file not found. Please check the file path."
    ) from err

plt.figure(figsize=(8,6))
# numeric_only=True leaves non-numeric columns out of the correlation matrix
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Heatmap Korelasi Variabel Numerik")
plt.show()

Pre-Processing data


In [None]:
# ======================================
# Foodpanda Dataset Preprocessing
# ======================================

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 1: read the dataset again so preprocessing starts from the raw file
dataset_path = "/content/drive/MyDrive/dataset.kaggle/Foodpanda Analysis Dataset.csv"
df = pd.read_csv(dataset_path)




In [None]:
# ======================================
# Identify missing values, if any.
# Only reports the per-column counts; nothing is imputed or dropped here.
# ======================================

missing_values = df.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

In [None]:
# ======================================
# Identify duplicate rows, if any.
# Only reports the count of fully duplicated rows; nothing is dropped here.
# ======================================

duplicate_mask = df.duplicated()
duplicate_rows = duplicate_mask.sum()
print("\nTotal number of duplicate rows:", duplicate_rows)

In [None]:
# ======================================
# Drop the identifier columns
# ======================================

# Strip surrounding whitespace from the header names first, so the names
# below match even if the CSV header contains stray spaces.
df.columns = df.columns.str.strip()

# errors='ignore' makes the drop a no-op for any column that is not present
id_columns = ['customer_id', 'order_id']
df = df.drop(columns=id_columns, errors='ignore')

In [None]:
# ======================================
# Convert the date columns to datetime
# ======================================

date_cols = ['signup_date', 'order_date', 'last_order_date', 'rating_date']

# errors='coerce' turns values that cannot be parsed into NaT instead of raising
parsed_dates = {
    name: pd.to_datetime(df[name], errors='coerce')
    for name in date_cols
}
for name, parsed in parsed_dates.items():
    df[name] = parsed

In [None]:
# ======================================
# Encode the categorical columns
# Each column is replaced by integer codes from its own LabelEncoder.
# ======================================

cat_cols = ['gender', 'age', 'city', 'restaurant_name', 'dish_name',
            'category', 'payment_method', 'churned']

# One fitted encoder per column, keyed by column name. Refitting a single
# shared encoder would overwrite its classes_ on every iteration, leaving no
# way to map the codes of earlier columns back to their original labels
# (e.g. label_encoders['city'].inverse_transform(...)).
label_encoders = {}

for col in cat_cols:
    # `label_enc` is kept as a name; after the loop it refers to the encoder of
    # the last column, exactly as it did with the single shared encoder.
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

In [None]:
# ======================================
# Encode the target column 'delivery_status' before it is split off into y
# ======================================

# A dedicated encoder is kept in `target_enc` so its classes_ (the original
# labels) remain available to later cells.
target_enc = LabelEncoder()
encoded_target = target_enc.fit_transform(df['delivery_status'])
df['delivery_status'] = encoded_target


In [None]:
# ======================================
# Min-max scaling of the numeric columns
# (MinMaxScaler rescales each column to its feature range, [0, 1] by default;
# this is normalization, not z-score standardization.)
# ======================================

from sklearn.preprocessing import MinMaxScaler

num_cols = ['quantity', 'price', 'order_frequency', 'loyalty_points', 'rating']

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in this notebook.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

# Last expression of the cell: the notebook renders the first 10 rows.
df.head(n=10)

In [None]:
# ======================================
# Outlier detection with the IQR rule
# ======================================

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Numerical columns inspected for outliers
num_cols = ['quantity', 'price', 'order_frequency', 'loyalty_points', 'rating']

print("\n=== Deteksi Outlier Menggunakan IQR ===")

# Maps each column name to the index labels of its outlier rows
outlier_indices = {}

for col in num_cols:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1

    # Fences sit 1.5 * IQR outside the quartiles
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # A row is an outlier for this column when its value lies outside the fences
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    outliers = df[is_outlier]
    outlier_indices[col] = outliers.index.tolist()

    print(f"Outlier pada kolom {col}: {len(outliers)} data")


In [None]:
# ======================================
# Box plot of the numerical columns, followed by a z-score outlier check
# ======================================

plt.figure(figsize=(8, 5))
sns.boxplot(data=df[num_cols])
plt.title("Deteksi Outlier Menggunakan Boxplot (IQR)")
plt.xticks(rotation=45)
plt.show()

print("\n=== Deteksi Outlier Menggunakan Z-Score ===")
# nan_policy='omit' ignores missing values when computing each column's mean
# and standard deviation. With the default ('propagate') a single NaN turns
# every z-score of that column into NaN, and NaN > threshold is False, so the
# column would silently report no outliers at all.
z_scores = np.abs(stats.zscore(df[num_cols], nan_policy='omit'))
threshold = 3

# Keep every row where at least one column exceeds the threshold
outliers_z = df[(z_scores > threshold).any(axis=1)]
print("Jumlah Outlier (Z-Score):", len(outliers_z))

if not outliers_z.empty:
    print("\n=== Contoh Outlier (Z-Score) ===")
    display(outliers_z.head())
else:
    print("Tidak ada outlier yang terdeteksi menggunakan Z-Score (threshold=3).")

In [None]:
# ======================================
# Check the preprocessing result (first 10 rows) and save it
# ======================================

preview = df.head(10)
print("\n=== 10 Baris Pertama Setelah Preprocessing ===")
print(preview)

# Write the preprocessed frame to a CSV file in the working directory,
# leaving the index out of the file.
output_path = "Foodpanda_Preprocessed.csv"
df.to_csv(output_path, index=False)
print("\n✅ File 'Foodpanda_Preprocessed.csv' berhasil disimpan.")

Modelling

In [None]:
# ======================================
# Separate features (X) and target (y), then split into train and test sets
# ======================================

from sklearn.model_selection import train_test_split

target_col = 'delivery_status'
# The datetime columns are left out of the feature matrix together with the target
date_feature_cols = ['signup_date', 'order_date', 'last_order_date', 'rating_date']

y = df[target_col]
X = df.drop(columns=[target_col, *date_feature_cols])

# 80/20 split; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# ======================================
# KNeighborsClassifier
# ======================================

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Build the 5-neighbour classifier and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out test split
y_pred_knn = knn_model.predict(X_test)

# Per-class precision / recall / F1
knn_report = classification_report(y_test, y_pred_knn)
print("=== Classification Report (KNN) ===")
print(knn_report)

# Raw confusion matrix (rows: true labels, columns: predicted labels)
knn_confusion = confusion_matrix(y_test, y_pred_knn)
print("\n=== Confusion Matrix (KNN) ===")
print(knn_confusion)

In [None]:
# ==================================================
# Confusion matrix heatmap for KNeighborsClassifier
# ==================================================

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# LabelEncoder assigns the codes 0..n_classes-1 in the order of classes_.
# Passing them explicitly as `labels` guarantees one row/column per class, so
# the matrix always lines up with the tick labels below even when some class
# is absent from both y_test and the predictions.
class_ids = list(range(len(target_enc.classes_)))
cm_knn = confusion_matrix(y_test, y_pred_knn, labels=class_ids)

# Heatmap annotated with the integer counts
plt.figure(figsize=(8, 6))
sns.heatmap(cm_knn, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_enc.classes_, yticklabels=target_enc.classes_)
plt.title('Confusion Matrix for K-Nearest Neighbors (KNN) Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# ======================================
# DecisionTreeClassifier
# ======================================

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Build the tree (fixed seed for reproducibility) and fit it on the training
# split; fit() returns the estimator itself, so the calls can be chained
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test split
y_pred_dt = dt_model.predict(X_test)

# Per-class precision / recall / F1
dt_report = classification_report(y_test, y_pred_dt)
print("=== Classification Report (Decision Tree) ===")
print(dt_report)

# Raw confusion matrix (rows: true labels, columns: predicted labels)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
print("\n=== Confusion Matrix (Decision Tree) ===")
print(dt_confusion)

In [None]:
# ====================================================
# Confusion matrix heatmap for DecisionTreeClassifier
# ====================================================

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# LabelEncoder assigns the codes 0..n_classes-1 in the order of classes_.
# Passing them explicitly as `labels` guarantees one row/column per class, so
# the matrix always lines up with the tick labels below even when some class
# is absent from both y_test and the predictions.
class_ids = list(range(len(target_enc.classes_)))
cm_dt = confusion_matrix(y_test, y_pred_dt, labels=class_ids)

# Heatmap annotated with the integer counts
plt.figure(figsize=(8, 6))
sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_enc.classes_, yticklabels=target_enc.classes_)
plt.title('Confusion Matrix for Decision Tree Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# ==========================================
# Plot the fitted DecisionTreeClassifier
# ===========================================

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Node labels: feature names from X, class names from the target encoder
feature_labels = X.columns.tolist()
class_labels = [str(label) for label in target_enc.classes_]

plt.figure(figsize=(20, 10))
plot_tree(
    dt_model,
    feature_names=feature_labels,
    class_names=class_labels,
    filled=True,
    rounded=True,
)
plt.title('Decision Tree Visualization')
plt.show()

In [None]:
# =========================
# RandomForestClassifier
# =========================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Build the forest (fixed seed for reproducibility) and fit it on the training
# split; fit() returns the estimator itself, so the calls can be chained
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Per-class precision / recall / F1
rf_report = classification_report(y_test, y_pred_rf)
print("=== Classification Report (Random Forest) ===")
print(rf_report)

# Raw confusion matrix (rows: true labels, columns: predicted labels)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print("\n=== Confusion Matrix (Random Forest) ===")
print(rf_confusion)

In [None]:
# ====================================================
# Confusion matrix heatmap for RandomForestClassifier
# ====================================================

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# LabelEncoder assigns the codes 0..n_classes-1 in the order of classes_.
# Passing them explicitly as `labels` guarantees one row/column per class, so
# the matrix always lines up with the tick labels below even when some class
# is absent from both y_test and the predictions.
class_ids = list(range(len(target_enc.classes_)))
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=class_ids)

# Heatmap annotated with the integer counts
plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_enc.classes_, yticklabels=target_enc.classes_)
plt.title('Confusion Matrix for Random Forest Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# ===============================
# Accuracy of the three models
# ===============================

import pandas as pd
from sklearn.metrics import accuracy_score

# The accuracy values are computed here from the test-set predictions made by
# the model cells; no earlier cell defines accuracy_knn / accuracy_dt /
# accuracy_rf, so referencing them without this step raises NameError.
accuracy_knn = accuracy_score(y_test, y_pred_knn)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# One row per model, in the order the models were trained
model_accuracies = pd.DataFrame({
    'Model': ['K-Nearest Neighbors', 'Decision Tree', 'Random Forest'],
    'Accuracy': [accuracy_knn, accuracy_dt, accuracy_rf]
})

display(model_accuracies)

In [None]:
# =================================================
# Bar chart comparing the accuracy of the three models
# =================================================

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn 0.13; hue is mapped to the
# x variable and the redundant legend hidden, the same form the category/price
# plots in this notebook already use.
sns.barplot(x='Model', y='Accuracy', hue='Model', data=model_accuracies,
            palette='viridis', legend=False)
plt.ylim(0, 1)  # accuracy lies between 0 and 1
plt.title('Perbandingan Akurasi Model Klasifikasi')
plt.xlabel('Model')
plt.ylabel('Akurasi')
plt.show()
