# Homework Assignment
 02.06.2025, Marcus Teichtmeister, Lucas Fink

## 1) Data Ingestion

In [128]:
import os, time, pandas as pd,random
from datetime import datetime, timedelta

# Directory that receives every generated CSV file; created on first run.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Column layout of each sensor's CSV file: every sensor shares the
# (sensor_id, timestamp) prefix followed by its own measurement column.
header_fuel_level, header_engine_temperature, header_speed = (
    ["sensor_id", "timestamp", measurement]
    for measurement in ("fuel_level", "engine_temperature", "speed")
)

#### Initial Values

In [129]:
# Seed the global RNG so that re-running this cell (or "Restart & Run All")
# regenerates the same simulated sensor values. Timestamps still start at the
# wall-clock time of execution.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Mutable simulation state shared by the generator functions below.
speed = 0.0                 # Initial speed in km/h
engine_temperature = 20     # Initial temperature in Celsius
fuel_level = 50             # Initial fuel level in liters
timestamp = datetime.now()  # Initial timestamp

# Driving profile: the speed generator steers toward each target in turn.
speed_targets = [0, 30, 50, 70, 100, 50, 0]         # target speeds in km/h
# Drawn once; the same duration is then used for every phase.
phase_duration = round(random.uniform(20, 100), 2)  # seconds per phase
current_phase = 0           # index into speed_targets
phase_time = 0              # ticks spent in the current phase

#### Functions for generating data

In [None]:
def generate_engine_temperature_sensor_data(sensor_id=1, ambient_temperature=20,
                                            max_temperature=90, failure_rate=0.2):
    """Advance the simulated engine temperature by one tick and return a reading.

    Reads the module-level ``speed`` and ``timestamp`` and updates the
    module-level ``engine_temperature``.

    Args:
        sensor_id: Identifier written into the reading.
        ambient_temperature: Lower bound in Celsius; the engine never cools
            below this value.
        max_temperature: Upper bound in Celsius; the engine never heats above
            this value.
        failure_rate: Probability that the reading is reported as ``None``
            to simulate a sensor failure.

    Returns:
        dict with keys ``sensor_id``, ``timestamp`` (ISO-8601 string) and
        ``engine_temperature`` (number, or ``None`` on a simulated failure).
    """
    global engine_temperature

    # Heat up while the vehicle is moving.
    if speed > 0:
        engine_temperature += round(random.uniform(1, 1.5), 2)
    if engine_temperature > max_temperature:
        engine_temperature = max_temperature

    # Cool down while stopped, but never below ambient: subtracting the full
    # 3.5-5.0 step from a value just above ambient would otherwise undershoot.
    if speed == 0 and engine_temperature > ambient_temperature:
        cooled = engine_temperature - round(random.uniform(3.5, 5.0), 2)
        engine_temperature = max(ambient_temperature, cooled)

    # Repeated float additions accumulate representation error; keep the
    # state at sensor precision so the CSV does not contain 21.340000000000003.
    engine_temperature = round(engine_temperature, 2)

    # Simulate occasional sensor failure by dropping the value.
    reading = engine_temperature if random.random() > failure_rate else None

    return {
        "sensor_id": sensor_id,
        "timestamp": timestamp.isoformat(),
        "engine_temperature": reading,
    }

def generate_speed_sensor_data(sensor_id=2):
    """Advance the simulated vehicle by one tick and return a speed reading.

    Updates the module-level ``speed``, ``current_phase``, ``phase_time`` and
    ``timestamp`` (which is moved forward by one second on every call).

    Returns:
        dict with keys ``sensor_id``, ``timestamp`` (ISO-8601 string) and
        ``speed`` (rounded to two decimals).
    """
    global speed, current_phase, phase_time, timestamp

    # Move on to the next target (wrapping around) once the phase has run out.
    phase_elapsed = phase_time >= phase_duration
    if phase_elapsed:
        current_phase, phase_time = (current_phase + 1) % len(speed_targets), 0
    phase_time += 1

    # Drift toward the target of the current phase.
    goal = speed_targets[current_phase]
    if speed < goal:
        speed = speed + random.uniform(2.5, 4.5)   # accelerate
    elif speed > goal:
        speed = speed - random.uniform(1.0, 4.0)   # brake
    speed = max(0, min(speed, 130))                # keep within 0-130 km/h

    # Advance the shared simulation clock.
    timestamp = timestamp + timedelta(seconds=1)

    # Phase 0 only ever lasts a single tick: jump straight to phase 1.
    current_phase = current_phase or 1

    reading = {
        "sensor_id": sensor_id,
        "timestamp": timestamp.isoformat(),
        "speed": round(speed, 2),
    }
    return reading

def generate_fuel_level_sensor_data(sensor_id=3):
    """Burn a small random amount of fuel and return the new level.

    Updates the module-level ``fuel_level`` (never below zero) and stamps the
    reading with the module-level ``timestamp``.

    Returns:
        dict with keys ``sensor_id``, ``timestamp`` (ISO-8601 string) and
        ``fuel_level``.
    """
    global fuel_level

    # Consumption for this tick, between 0 and 0.002 liters.
    consumed = round(random.uniform(0, 0.002), 5)
    # An empty tank stays at zero.
    fuel_level = max(fuel_level - consumed, 0)

    reading = {
        "sensor_id": sensor_id,
        "timestamp": timestamp.isoformat(),
        "fuel_level": fuel_level,
    }
    return reading

In [131]:
# Save data as CSV
def save_data_csv(filename, data):
    """Write ``data`` as a CSV file inside ``output_dir`` and report the path.

    Args:
        filename: Name of the file to create (or overwrite) in ``output_dir``.
        data: Anything accepted by ``pd.DataFrame`` (e.g. a list of dicts or
            an existing DataFrame).
    """
    target = os.path.join(output_dir, filename)
    # mode='w' overwrites any previous file; the index is not written.
    pd.DataFrame(data).to_csv(target, mode='w', index=False, header=True)
    print(f"Saved sensor data as CSV: {target}")

In [132]:
num_iterations = int(input("Wie viele Sekunden sollen simuliert werden? "))
# Fail fast: zero or negative iterations would produce empty frames that only
# break later, when the transformation step looks up a missing column.
if num_iterations < 1:
    raise ValueError("num_iterations must be a positive number of seconds")

# Collected readings, one list per sensor
engine_temperature_sensor_data = []
speed_sensor_data = []
fuel_level_sensor_data = []

for i in range(num_iterations):
    # The speed generator advances the shared clock and updates the current
    # speed, so it must run first: that way all three readings of one
    # iteration carry the same timestamp and the engine reacts to the
    # current speed rather than the previous one.
    speed_data = generate_speed_sensor_data()
    speed_sensor_data.append(speed_data)

    engine_temperature_data = generate_engine_temperature_sensor_data()
    engine_temperature_sensor_data.append(engine_temperature_data)

    fuel_level_data = generate_fuel_level_sensor_data()
    fuel_level_sensor_data.append(fuel_level_data)

    # Simulate duplicate data for engine temperature every 3 iterations
    if i % 3 == 0 and i != 0:
        engine_temperature_sensor_data.append(engine_temperature_data)

# Create DataFrames
df_engine = pd.DataFrame(engine_temperature_sensor_data)
df_speed = pd.DataFrame(speed_sensor_data)
df_fuel = pd.DataFrame(fuel_level_sensor_data)

# Save the CSV files
save_data_csv("engine_temperature_data.csv", df_engine)
save_data_csv("speed_data.csv", df_speed)
save_data_csv("fuel_level_data.csv", df_fuel)


Saved sensor data as CSV: data\engine_temperature_data.csv
Saved sensor data as CSV: data\speed_data.csv
Saved sensor data as CSV: data\fuel_level_data.csv


## 2) Data Transformation

#### Remove duplicate rows

In [133]:
# Drop exact duplicate rows from each sensor frame (only the engine
# temperature data has duplicates injected during ingestion).
df_engine, df_speed, df_fuel = (
    frame.drop_duplicates() for frame in (df_engine, df_speed, df_fuel)
)

#### Fill missing Values in the data

In [134]:
def impute_missing_values(df, column='engine_temperature'):
    """Return a copy of ``df`` with missing values in ``column`` mean-imputed.

    Missing entries are replaced by the mean of the available values and the
    whole column is rounded to two decimal places. The input frame is left
    untouched, so re-running the calling cell always starts from the same
    data and no write lands on a frame derived from another one.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the numeric column to impute.

    Returns:
        A new DataFrame with the imputed, rounded column.
    """
    result = df.copy()
    values = result[column]
    # If every value is missing the mean is NaN and the column stays NaN.
    result[column] = values.fillna(values.mean()).round(2)
    return result

#### save the transformed data

In [135]:
# Fill the gaps left by simulated sensor failures, then persist the result.
# Note: this overwrites the CSV written during ingestion with the cleaned data.
df_engine = impute_missing_values(df_engine)
save_data_csv(filename="engine_temperature_data.csv", data=df_engine)

Saved sensor data as CSV: data\engine_temperature_data.csv


## 3) Predictive Modeling

#### Merge speed and fuel level into one dataframe

In [136]:
# Reload the persisted sensor data and parse the ISO timestamps.
df_speed = pd.read_csv("data/speed_data.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)
df_fuel = pd.read_csv("data/fuel_level_data.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

# Pair every fuel reading with the speed reading closest in time
# (merge_asof requires both sides to be sorted on the key), then keep
# only the columns needed for modeling.
df = pd.merge_asof(
    left=df_fuel.sort_values("timestamp"),
    right=df_speed.sort_values("timestamp"),
    on="timestamp",
    direction="nearest",
)[["timestamp", "speed", "fuel_level"]]


#### calculate distance traveled

In [137]:
# Distance per second = (km/h) * (1/3600); this assumes one row per second,
# which is the step the simulation clock uses.
# `df` is a column subset of the merged frame, so writing columns into it
# in place can trigger pandas' SettingWithCopyWarning; `assign` builds a new
# frame instead and keeps this cell safe to re-run.
df = df.assign(distance_delta=df['speed'] * (1 / 3600))             # km per second
df = df.assign(cumulative_distance=df['distance_delta'].cumsum())  # total km since start


#### train the model

In [138]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Single feature (distance driven so far) -> target (remaining fuel).
X, y = df[['cumulative_distance']], df['fuel_level']

# Hold out 20 % of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")


Mean Squared Error: 0.0007


#### make a prediction

In [None]:
distance_km = 100

# Build a one-row frame with the same feature name the model was trained on,
# then read the single prediction out of the returned array.
# NOTE(review): if the simulated trip is shorter than distance_km this is an
# extrapolation beyond the training range - interpret with care.
input_data = pd.DataFrame([[distance_km]], columns=['cumulative_distance'])
predicted_fuel = model.predict(input_data)[0]
print(f"Predicted fuel level after {distance_km} km: {predicted_fuel:.2f} liters")


Predicted fuel level after 100 km: 43.49 liters
