# Pre-processing Dataset

In [None]:
!pip install pandas
!pip install scikit-learn



In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
file_path = 'fitness_tracker_dataset.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,user_id,date,steps,calories_burned,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood
0,468,2023-01-01,4530,2543.02,16.1,613,1.5,176,Walking,Clear,Park,Tired
1,879,2023-01-01,11613,1720.76,8.1,352,6.3,128,Cycling,Fog,Park,Happy
2,152,2023-01-01,27335,1706.35,3.57,236,6.7,134,Yoga,Snow,Park,Neutral
3,311,2023-01-01,13459,2912.38,6.41,1329,11.6,116,Swimming,Rain,Office,Tired
4,759,2023-01-01,15378,3344.51,17.88,52,7.4,84,Swimming,Rain,Office,Neutral


Hapus Kolom tidak Relevan

In [None]:
data.drop(columns=['user_id', 'date'], inplace=True)

Handle Missing Values

In [None]:
for column in data.columns:
    if data[column].dtype in ['int64', 'float64']:
        data[column] = data[column].fillna(data[column].mean())
    else:
        data[column] = data[column].fillna(data[column].mode()[0])

Tambah Fitur

In [None]:
data['calories_per_minute'] = data['calories_burned'] / (data['active_minutes'] + 1e-6)
data.head()

Unnamed: 0,steps,calories_burned,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood,calories_per_minute
0,4530,2543.02,16.1,613,1.5,176,Walking,Clear,Park,Tired,4.148483
1,11613,1720.76,8.1,352,6.3,128,Cycling,Fog,Park,Happy,4.888523
2,27335,1706.35,3.57,236,6.7,134,Yoga,Snow,Park,Neutral,7.230297
3,13459,2912.38,6.41,1329,11.6,116,Swimming,Rain,Office,Tired,2.191407
4,15378,3344.51,17.88,52,7.4,84,Swimming,Rain,Office,Neutral,64.317499


Encode

In [None]:
categorical_cols = ['workout_type', 'weather_conditions', 'mood']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col + '_encoded'] = le.fit_transform(data[col])
    label_encoders[col] = le

Normalisasi

In [None]:
numerical_cols = ['steps', 'calories_burned', 'distance_km', 'active_minutes', 'sleep_hours', 'heart_rate_avg', 'calories_per_minute']
scaler = MinMaxScaler()
for col in numerical_cols:
    data[col + '_normalized'] = scaler.fit_transform(data[[col]])
cleaned_file_path = 'cleaned_dataset.csv'
data.to_csv(cleaned_file_path, index=False)

data.head()

Unnamed: 0,steps,calories_burned,distance_km,active_minutes,sleep_hours,heart_rate_avg,workout_type,weather_conditions,location,mood,...,workout_type_encoded,weather_conditions_encoded,mood_encoded,steps_normalized,calories_burned_normalized,distance_km_normalized,active_minutes_normalized,sleep_hours_normalized,heart_rate_avg_normalized,calories_per_minute_normalized
0,4530,2543.02,16.1,613,1.5,176,Walking,Clear,Park,Tired,...,4,0,3,0.151005,0.417208,0.805,0.42599,0.125,0.97479,7.767436e-10
1,11613,1720.76,8.1,352,6.3,128,Cycling,Fog,Park,Happy,...,0,1,0,0.387113,0.088304,0.405,0.244614,0.525,0.571429,9.618438e-10
2,27335,1706.35,3.57,236,6.7,134,Yoga,Snow,Park,Neutral,...,5,3,1,0.911197,0.08254,0.1785,0.164003,0.558333,0.621849,1.547573e-09
3,13459,2912.38,6.41,1329,11.6,116,Swimming,Rain,Office,Tired,...,3,2,3,0.448648,0.564952,0.3205,0.923558,0.966667,0.470588,2.872361e-10
4,15378,3344.51,17.88,52,7.4,84,Swimming,Rain,Office,Neutral,...,3,2,1,0.512617,0.737804,0.894,0.036136,0.616667,0.201681,1.582633e-08


Bagi Dataset Train (80%) dan Test (20%)

In [None]:
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

train_file_path = 'train_dataset.csv'
test_file_path = 'test_dataset.csv'
train_data.to_csv(train_file_path, index=False)
test_data.to_csv(test_file_path, index=False)

print("Dataset telah dibagi dan disimpan.")

Dataset telah dibagi dan disimpan.


# Implementasi Decision Tree

Membagi data berdasarkan atribut dan nilai tertentu.

In [None]:
def split_data(data, attribute, value):
    if data[attribute].dtype in ['int64', 'float64']:
        # Jika atribut numerik, pakai pembagian <= dan >
        subset1 = data[data[attribute] <= value]
        subset2 = data[data[attribute] > value]
    else:
        # Jika atribut kategorikal, pakai pembagian berdasarkan kesamaan
        subset1 = data[data[attribute] == value]
        subset2 = data[data[attribute] != value]

    return subset1, subset2

In [None]:
# Cek rentang nilai kolom calories_burned_normalized
print("Range calories_burned_normalized:",
      train_data['calories_burned_normalized'].min(), "to", train_data['calories_burned_normalized'].max())

# Lihat distribusi data untuk nilai 0.5
subset1, subset2 = split_data(train_data, 'calories_burned_normalized', 0.5)

print("Subset 1 size:", len(subset1))
print("Subset 2 size:", len(subset2))

# Statistik subset1 dan subset2
print("Subset1 - Max:", subset1['calories_burned_normalized'].max(),
      ", Min:", subset1['calories_burned_normalized'].min())
print("Subset2 - Max:", subset2['calories_burned_normalized'].max(),
      ", Min:", subset2['calories_burned_normalized'].min())

Range calories_burned_normalized: 0.0 to 1.0
Subset 1 size: 400648
Subset 2 size: 399352
Subset1 - Max: 0.499996 , Min: 0.0
Subset2 - Max: 1.0 , Min: 0.5000000000000001


Implementasikan fungsi perhitungan impurity seperti Gini Index

In [None]:
def gini_index(groups, classes):
    # Hitung total sampel di semua grup
    total_samples = sum([len(group) for group in groups])

    # Inisialisasi Gini Index
    gini = 0.0

    # Iterasi untuk setiap grup
    for group in groups:
        size = len(group)
        # Hindari pembagian oleh nol
        if size == 0:
            continue

        # Hitung proporsi setiap kelas dalam grup
        score = 0.0
        for class_val in classes:
            proportion = (group['workout_type_encoded'] == class_val).sum() / size
            score += proportion ** 2

        # Gini Index untuk grup
        gini += (1.0 - score) * (size / total_samples)

    return gini

In [None]:
# Bagi data berdasarkan attribute tertentu
subset1, subset2 = split_data(train_data, 'calories_burned_normalized', 0.5)

# Hitung Gini Index
workout_classes = train_data['workout_type_encoded'].unique()
gini = gini_index([subset1, subset2], workout_classes)

print("Gini Index:", gini)

Gini Index: 0.8160975945782634


Fungsi untuk membuat node split di decision tree.

In [None]:
def find_best_split(data, target_column):
    best_attribute, best_value, best_score, best_groups = None, None, float('inf'), None
    classes = data[target_column].unique()

    for attribute in data.columns:
        if attribute == target_column or '_encoded' not in attribute and '_normalized' not in attribute:
            # Abaikan kolom target atau kolom yang tidak relevan (tidak diencode/normalisasi)
            continue

        # Periksa setiap nilai unik di atribut
        for value in data[attribute].unique():
            subset1, subset2 = split_data(data, attribute, value)
            gini = gini_index([subset1, subset2], classes)

            if gini < best_score:
                best_attribute, best_value, best_score, best_groups = attribute, value, gini, (subset1, subset2)

    return best_attribute, best_value, best_score, best_groups

In [None]:
# Cari split terbaik di data train
train_sample = train_data.sample(10000, random_state=42)
best_attr, best_val, best_score, best_groups = find_best_split(train_sample, 'workout_type_encoded')

print("Best Attribute:", best_attr)
print("Best Value:", best_val)
print("Best Gini Index:", best_score)
print("Subset Sizes:", len(best_groups[0]), "and", len(best_groups[1]))

Best Attribute: steps_normalized
Best Value: 0.8434281142704757
Best Gini Index: 0.8176119226784817
Subset Sizes: 8418 and 1582


In [None]:
print("Distribusi kelas di Subset 1:")
print(best_groups[0]['workout_type_encoded'].value_counts())

print("Distribusi kelas di Subset 2:")
print(best_groups[1]['workout_type_encoded'].value_counts())

Distribusi kelas di Subset 1:
workout_type_encoded
0    2400
3    1250
4    1248
2    1196
1    1195
5    1129
Name: count, dtype: int64
Distribusi kelas di Subset 2:
workout_type_encoded
0    391
2    286
1    238
5    234
3    217
4    216
Name: count, dtype: int64


In [None]:
total_rows = len(best_groups[0]) + len(best_groups[1])
print("Total baris di kedua subset:", total_rows)
print("Jumlah baris di dataset awal:", len(train_sample))

Total baris di kedua subset: 10000
Jumlah baris di dataset awal: 10000


Bangun decision tree rekursif hingga mencapai kondisi leaf

In [None]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=5, min_size=5, max_features=None):
        self.max_depth = max_depth
        self.min_size = min_size
        self.max_features = max_features  # Jumlah fitur acak yang akan dipertimbangkan
        self.tree = None

    def fit(self, data, target_column):
        """
        Membangun tree dari data training.
        """
        self.tree = self._split(data, target_column, depth=1)

    def _split(self, data, target_column, depth):
        """
        Fungsi rekursif untuk membagi node.
        """
        # Hitung distribusi kelas
        classes = data[target_column].unique()

        # Kondisi leaf: semua kelas sama, data kecil, atau kedalaman maksimum tercapai
        if len(classes) == 1 or len(data) <= self.min_size or depth >= self.max_depth:
            leaf_value = data[target_column].mode()[0]  # Mayoritas kelas sebagai hasil leaf
            return {'leaf': True, 'value': leaf_value}

        features = data.drop(columns=[target_column]).columns.tolist()
        if self.max_features is not None:
            features = np.random.choice(features, self.max_features, replace=False)

        # Cari split terbaik dengan subset fitur acak
        best_attr, best_val, best_score, best_groups = self.find_best_split(data, target_column, features)

        # Jika tidak ada split yang valid
        if best_score == float('inf'):
            leaf_value = data[target_column].mode()[0]
            return {'leaf': True, 'value': leaf_value}

        # Rekursi untuk child nodes
        left_child = self._split(best_groups[0], target_column, depth + 1)
        right_child = self._split(best_groups[1], target_column, depth + 1)

        # Simpan informasi node
        return {
            'leaf': False,
            'attribute': best_attr,
            'value': best_val,
            'left': left_child,
            'right': right_child
        }

    def find_best_split(self, data, target_column, features):
        """
        Cari split terbaik menggunakan subset fitur yang dipilih secara acak.
        """
        best_score = float('inf')
        best_attr = None
        best_val = None
        best_groups = None

        for feature in features:
            values = data[feature].unique()
            for value in values:
                left_group = data[data[feature] <= value]
                right_group = data[data[feature] > value]

                if len(left_group) >= self.min_size and len(right_group) >= self.min_size:
                    score = self.gini_index(left_group, right_group, target_column)

                    if score < best_score:
                        best_score = score
                        best_attr = feature
                        best_val = value
                        best_groups = (left_group, right_group)

        return best_attr, best_val, best_score, best_groups

    def gini_index(self, left_group, right_group, target_column):
        """
        Menghitung Gini index untuk evaluasi split.
        """
        left_size = len(left_group)
        right_size = len(right_group)
        total_size = left_size + right_size

        left_score = self._gini_score(left_group, target_column)
        right_score = self._gini_score(right_group, target_column)

        gini = (left_size / total_size) * left_score + (right_size / total_size) * right_score
        return gini

    def _gini_score(self, group, target_column):
        """
        Menghitung Gini score untuk sebuah grup.
        """
        size = len(group)
        if size == 0:
            return 0
        class_counts = group[target_column].value_counts()
        proportions = class_counts / size
        return 1 - sum(proportions ** 2)

    def print_tree(self, node=None, depth=0):
        """
        Menampilkan tree secara hierarkis.
        """
        if node is None:
            node = self.tree

        if node['leaf']:
            print(f"{'|  ' * depth}Leaf: {node['value']}")
        else:
            print(f"{'|  ' * depth}[{node['attribute']} <= {node['value']}]")
            self.print_tree(node['left'], depth + 1)
            self.print_tree(node['right'], depth + 1)

    def predict(self, row):
        """
        Membuat prediksi untuk satu instance data dengan decision tree.
        """
        node = self.tree
        while not node['leaf']:
            if row[node['attribute']] <= node['value']:
                node = node['left']
            else:
                node = node['right']
        return node['value']

In [None]:
# Bangun Tree
tree = DecisionTree(max_depth=5, min_size=10, max_features=3)
tree.fit(train_sample, 'workout_type_encoded')
tree.print_tree()

[sleep_hours_normalized <= 0.8583333333333334]
|  [workout_type <= Cycling]
|  |  Leaf: 0
|  |  [active_minutes_normalized <= 0.12439193884642112]
|  |  |  [calories_burned_normalized <= 0.7610200000000001]
|  |  |  |  Leaf: 4
|  |  |  |  Leaf: 3
|  |  |  [calories_per_minute_normalized <= 1.989814072616758e-10]
|  |  |  |  Leaf: 3
|  |  |  |  Leaf: 1
|  [distance_km <= 2.15]
|  |  [workout_type <= Cycling]
|  |  |  Leaf: 0
|  |  |  [calories_per_minute <= 2.0957461001160955]
|  |  |  |  Leaf: 4
|  |  |  |  Leaf: 4
|  |  [calories_per_minute <= 124.12564677714577]
|  |  |  [workout_type <= Cycling]
|  |  |  |  Leaf: 0
|  |  |  |  Leaf: 5
|  |  |  [active_minutes <= 16]
|  |  |  |  Leaf: 4
|  |  |  |  Leaf: 3


# Implementasi Random Forest

Bangun beberapa decision tree dengan data acak (bootstrapping)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, mean_absolute_error

class RandomForest:
    def __init__(self, n_trees=10, max_depth=5, min_size=5, sample_size=0.8):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_size = min_size
        self.sample_size = sample_size
        self.trees = []

    def _bootstrap_sample(self, data):
        """
        Membuat bootstrap sample dari dataset.
        """
        n_samples = int(len(data) * self.sample_size)
        return data.sample(n=n_samples, replace=True)

    def fit(self, data, target_column):
        """
        Melatih random forest dengan membuat beberapa decision tree.
        """
        self.trees = []
        for i in range(self.n_trees):
            print(f"Building tree {i + 1}/{self.n_trees}...")
            sample = self._bootstrap_sample(data)
            tree = DecisionTree(max_depth=self.max_depth, min_size=self.min_size)
            tree.fit(sample, target_column)
            self.trees.append(tree)

    def predict_single(self, row):
        """
        Membuat prediksi untuk satu data point berdasarkan voting mayoritas dari semua trees.
        """
        predictions = [tree.predict(row) for tree in self.trees]
        return max(set(predictions), key=predictions.count)  # Voting mayoritas

    def predict(self, data):
        """
        Membuat prediksi untuk seluruh dataset.
        """
        return data.apply(self.predict_single, axis=1)

Latih Model

In [None]:
# Latih model random forest
subset_train_data = train_data.sample(n=10000, random_state=42)
X_subset = subset_train_data.drop(columns='workout_type')
y_subset = subset_train_data['workout_type']
rf = RandomForest(n_trees=5, max_depth=5, min_size=10, sample_size=0.8)
rf.fit(subset_train_data, 'workout_type')

Building tree 1/5...
Building tree 2/5...
Building tree 3/5...
Building tree 4/5...
Building tree 5/5...


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Langkah 1: Prediksi pada data test
test_predictions = rf.predict(X_subset)

# Langkah 2: Hitung akurasi
accuracy = accuracy_score(y_subset, test_predictions)
print(f"Akurasi pada Data Test: {accuracy:.4f}")

# Langkah 3: Confusion Matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_subset, test_predictions))

# Langkah 4: Classification Report
print("\nClassification Report:\n", classification_report(y_subset, test_predictions))

# Langkah 5: Prediksi untuk regresi (MSE dan MAE)
mse = mean_squared_error(y_subset, test_predictions)
mae = mean_absolute_error(y_subset, test_predictions)
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")


Akurasi pada Data Test: 1.0000

Confusion Matrix:
 [[2791    0    0    0    0    0]
 [   0 1433    0    0    0    0]
 [   0    0 1482    0    0    0]
 [   0    0    0 1467    0    0]
 [   0    0    0    0 1464    0]
 [   0    0    0    0    0 1363]]

Classification Report:
               precision    recall  f1-score   support

     Cycling       1.00      1.00      1.00      2791
 Gym Workout       1.00      1.00      1.00      1433
     Running       1.00      1.00      1.00      1482
    Swimming       1.00      1.00      1.00      1467
     Walking       1.00      1.00      1.00      1464
        Yoga       1.00      1.00      1.00      1363

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



ValueError: could not convert string to float: 'Cycling'

Simpan

In [None]:
import joblib

# Simpan model
joblib.dump(rf, 'random_forest_model.pkl')
rf_loaded = joblib.load('random_forest_model.pkl')

Simulasi

In [None]:
!pip install scikit-learn --upgrade



In [None]:
def simulate_workout():
    calories_burned = float(input("Berapa kalori yang ingin Anda bakar? "))
    sleep_hours = float(input("Berapa jam Anda tidur semalam? "))
    heart_rate_avg = int(input("Berapa detak jantung Anda sekarang (BPM)? "))
    weather = input("Bagaimana cuaca saat ini? (Clear/Fog/Rain/Snow): ").lower()
    mood = input("Bagaimana mood Anda saat ini? (Happy/Neutral/Stressed/Tired): ").lower()

    # Mapping untuk weather dan mood ke nilai yang sesuai
    weather_map = {'clear': 0, 'fog': 1, 'rain': 2, 'snow': 3}
    mood_map = {'happy': 0, 'neutral': 1, 'stressed': 2, 'tired': 3}

    input_data = {
        "calories_burned": calories_burned,
        "sleep_hours": sleep_hours,
        "heart_rate_avg": heart_rate_avg,
        "weather_conditions_encoded": weather_map.get(weather, -1),  
        "mood_encoded": mood_map.get(mood, -1)  
    }

    # Mengubah input menjadi DataFrame
    input_df = pd.DataFrame([input_data])

    # Menggunakan model untuk prediksi
    predicted_workout = rf.predict(input_df)[0]  # Mengambil hasil prediksi dari Random Forest

    # Output hasil prediksi
    print(f"Rekomendasi workout Anda: {predicted_workout}")

# Simulasi rekomendasi workout
simulate_workout()

Berapa kalori yang ingin Anda bakar? 3709.75
Berapa jam Anda tidur semalam? 8
Berapa detak jantung Anda sekarang (BPM)? 86
Bagaimana cuaca saat ini? (Clear/Fog/Rain/Snow): fog
Bagaimana mood Anda saat ini? (Happy/Neutral/Stressed/Tired): stressed


KeyError: 'workout_type_encoded'

In [None]:
# Cek kolom-kolom dalam dataset
print(subset_train_data.columns)

Index(['steps', 'calories_burned', 'distance_km', 'active_minutes',
       'sleep_hours', 'heart_rate_avg', 'workout_type', 'weather_conditions',
       'location', 'mood', 'calories_per_minute', 'workout_type_encoded',
       'weather_conditions_encoded', 'mood_encoded', 'steps_normalized',
       'calories_burned_normalized', 'distance_km_normalized',
       'active_minutes_normalized', 'sleep_hours_normalized',
       'heart_rate_avg_normalized', 'calories_per_minute_normalized'],
      dtype='object')


In [None]:
# Cek kolom-kolom dalam data pelatihan
print(train_data.columns)

Index(['steps', 'calories_burned', 'distance_km', 'active_minutes',
       'sleep_hours', 'heart_rate_avg', 'workout_type', 'weather_conditions',
       'location', 'mood', 'calories_per_minute', 'workout_type_encoded',
       'weather_conditions_encoded', 'mood_encoded', 'steps_normalized',
       'calories_burned_normalized', 'distance_km_normalized',
       'active_minutes_normalized', 'sleep_hours_normalized',
       'heart_rate_avg_normalized', 'calories_per_minute_normalized'],
      dtype='object')
