In [1]:
%pip install pandas numpy

Collecting pandas
  Downloading https://files.pythonhosted.org/packages/b2/56/f886ed6f1777ffa9d54c6e80231b69db8a1f52dcc33f5967b06a105dcfe0/pandas-1.3.5-cp37-cp37m-win_amd64.whl (10.0MB)
Collecting pytz>=2017.3 (from pandas)
  Downloading https://files.pythonhosted.org/packages/9c/3d/a121f284241f08268b21359bd425f7d4825cffc5ac5cd0e1b3d82ffd2b10/pytz-2024.1-py2.py3-none-any.whl (505kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.3.5 pytz-2024.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [4]:
import pandas as pd
import numpy as np
import os

# Resolve the input CSV relative to the directory the notebook is running from
directory = os.getcwd()
file_path = os.path.join(directory, 'temperature_humidity_data_contoh.csv')

# Read the recorded measurements and tag every row in one pass: Label is 1 only
# when Temperature is above 37 AND Humidity is above 90, otherwise 0
existing_df = pd.read_csv(file_path).assign(
    Label=lambda frame: (frame['Temperature'].gt(37) & frame['Humidity'].gt(90)).astype(int)
)

# Function to generate synthetic data
def generate_synthetic_data(num_rows, label_proportion=0.5, seed=None):
    """Build a labelled synthetic temperature/humidity dataset.

    Label-1 ("smoking") rows are drawn with temperature in [38, 45) and
    humidity in [91, 100), so both values exceed the 37 / 90 thresholds.
    Label-0 ("not smoking") rows are drawn with temperature in [20, 37) and
    humidity in [30, 90).  All label-1 rows come first, followed by all
    label-0 rows.

    Parameters
    ----------
    num_rows : int
        Total number of rows to generate. Must be non-negative.
    label_proportion : float, default 0.5
        Fraction of rows that receive ``Label == 1``; must lie in [0, 1].
        The label-1 count is ``int(num_rows * label_proportion)`` (truncated)
        and the remaining rows are label 0.
    seed : int or None, default None
        Seed for a private ``numpy.random.default_rng`` so that repeated calls
        return identical data.  When ``None`` the values are drawn from
        NumPy's global random state, so ``np.random.seed`` still governs them.

    Returns
    -------
    pandas.DataFrame
        ``num_rows`` rows with columns ``Timestamp``, ``Temperature``,
        ``Humidity`` and ``Label``.

    Raises
    ------
    ValueError
        If ``num_rows`` is negative or ``label_proportion`` is outside [0, 1].
    """
    # Fail early with a clear message instead of an opaque numpy/pandas error
    if num_rows < 0:
        raise ValueError(f"num_rows must be non-negative, got {num_rows}")
    if not 0 <= label_proportion <= 1:
        raise ValueError(
            f"label_proportion must be between 0 and 1, got {label_proportion}"
        )

    # Both the np.random module and a Generator expose uniform(low, high, size)
    rng = np.random if seed is None else np.random.default_rng(seed)

    num_label_1 = int(num_rows * label_proportion)
    num_label_0 = num_rows - num_label_1

    # (row count, temperature bounds, humidity bounds) per class; label 1 first
    class_specs = [
        (num_label_1, (38, 45), (91, 100)),  # smoking: temperature > 37, humidity > 90
        (num_label_0, (20, 37), (30, 90)),   # not smoking: temperature <= 37, humidity <= 90
    ]

    timestamps, temperatures, humidities = [], [], []
    for count, temp_bounds, hum_bounds in class_specs:
        # Each class restarts at 2023-01-01 with one-second spacing, so the same
        # timestamps appear in both classes. 's' replaces the deprecated 'S' alias.
        timestamps.append(pd.date_range(start='2023-01-01', periods=count, freq='s'))
        # Draw order (temperature, then humidity, class by class) is kept stable
        # so a given seed always produces the same values
        temperatures.append(rng.uniform(*temp_bounds, count))
        humidities.append(rng.uniform(*hum_bounds, count))

    return pd.DataFrame({
        'Timestamp': np.concatenate(timestamps),
        'Temperature': np.concatenate(temperatures),
        'Humidity': np.concatenate(humidities),
        # Explicit int64 keeps the dtype stable across platforms and for 0 rows
        'Label': np.repeat(np.array([1, 0], dtype=np.int64), [num_label_1, num_label_0]),
    })

# Desired size of the combined (existing + synthetic) dataset
TARGET_ROWS = 10000
# One seed shared by synthetic generation and the train/test split
RANDOM_SEED = 42

# Rows still missing to reach the target. Clamped at zero so that an input file
# already holding TARGET_ROWS or more rows does not request a negative count.
rows_needed = max(0, TARGET_ROWS - len(existing_df))

# The generator draws from NumPy's global random state; seeding it here makes
# the synthetic rows (and therefore train.csv / test.csv) identical on re-runs.
np.random.seed(RANDOM_SEED)
synthetic_df = generate_synthetic_data(rows_needed)

# Combine existing and synthetic data under a fresh 0..n-1 index
combined_df = pd.concat([existing_df, synthetic_df], ignore_index=True)

# Split into training and testing sets (80/20 split): sample 80% for training,
# then everything not sampled becomes the test set
train_df = combined_df.sample(frac=0.8, random_state=RANDOM_SEED)
test_df = combined_df.drop(train_df.index)

# The two splits must partition the combined data with no overlap or loss
assert len(train_df) + len(test_df) == len(combined_df)

# Save to CSV files next to the input data
train_file_path = os.path.join(directory, 'train.csv')
test_file_path = os.path.join(directory, 'test.csv')
train_df.to_csv(train_file_path, index=False)
test_df.to_csv(test_file_path, index=False)

print("Data preparation complete. Train and test datasets saved.")


Data preparation complete. Train and test datasets saved.
