In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

# Read dht22_data.csv into a DataFrame
csv_path = "dht22_data.csv"
df = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check
df.head()


Unnamed: 0,Timestamp,Temperature,Humidity
0,20250207215808,21.9,69.2
1,20250207215828,21.7,69.3
2,20250207215848,21.5,68.7
3,20250207215908,21.4,67.4
4,20250207215928,21.3,68.3


In [54]:
# List the column labels to confirm what was loaded before modelling
column_labels = df.columns
print(column_labels)


Index(['Timestamp', 'Temperature', 'Humidity'], dtype='object')


In [55]:
# Feature matrix and target vector for the regression.
# Selecting with a *list* of columns keeps the features two-dimensional
# (one row per sample, one column per feature), so no reshape is required
# before handing them to scikit-learn.
X = df[['Temperature']].to_numpy()
y = df['Humidity'].to_numpy()

# Fit a linear model of Humidity as a function of Temperature
model = LinearRegression().fit(X, y)

# Report the fitted intercept and slope
print(f"Intercept: {model.intercept_}, Coefficient: {model.coef_[0]}")


Intercept: 115.71734940872277, Coefficient: -2.4075095968007396


In [56]:
# Observed temperature range in the full dataset
temps = df['Temperature']
min_temp = temps.min()
max_temp = temps.max()

# 100 evenly spaced temperatures spanning that range, shaped as a column
# vector (one feature per row) so they can be passed directly to the model
test_temps = np.linspace(min_temp, max_temp, num=100)[:, np.newaxis]

# Humidity estimated by the fitted model at each grid temperature
predicted_humidity = model.predict(test_temps)

# Print everything (one item per line)
print(
    f"Min temperature: {min_temp}, Max temperature: {max_temp}",
    f"Test temperatures: {test_temps}",
    f"Predicted humidity: {predicted_humidity}",
    sep="\n",
)


Min temperature: 19.2, Max temperature: 31.7
Test temperatures: [[19.2       ]
 [19.32626263]
 [19.45252525]
 [19.57878788]
 [19.70505051]
 [19.83131313]
 [19.95757576]
 [20.08383838]
 [20.21010101]
 [20.33636364]
 [20.46262626]
 [20.58888889]
 [20.71515152]
 [20.84141414]
 [20.96767677]
 [21.09393939]
 [21.22020202]
 [21.34646465]
 [21.47272727]
 [21.5989899 ]
 [21.72525253]
 [21.85151515]
 [21.97777778]
 [22.1040404 ]
 [22.23030303]
 [22.35656566]
 [22.48282828]
 [22.60909091]
 [22.73535354]
 [22.86161616]
 [22.98787879]
 [23.11414141]
 [23.24040404]
 [23.36666667]
 [23.49292929]
 [23.61919192]
 [23.74545455]
 [23.87171717]
 [23.9979798 ]
 [24.12424242]
 [24.25050505]
 [24.37676768]
 [24.5030303 ]
 [24.62929293]
 [24.75555556]
 [24.88181818]
 [25.00808081]
 [25.13434343]
 [25.26060606]
 [25.38686869]
 [25.51313131]
 [25.63939394]
 [25.76565657]
 [25.89191919]
 [26.01818182]
 [26.14444444]
 [26.27070707]
 [26.3969697 ]
 [26.52323232]
 [26.64949495]
 [26.77575758]
 [26.9020202 ]
 [27.0

In [57]:
# Scatter the raw readings and overlay the fitted regression line (Plotly).
# In plotly.express the `labels` dict is keyed by *column name*; keys such as
# 'x' / 'y' match no column and are silently ignored, so the column names are
# used as keys here. A title is set so the figure stands on its own.
fig = px.scatter(
    df,
    x='Temperature',
    y='Humidity',
    opacity=0.65,
    labels={'Temperature': "Temperature", 'Humidity': "Humidity"},
    title="Temperature vs Humidity with Trend Line",
)

# Add the trend line: model predictions over the evenly spaced temperature grid
# (add_trace is the single-trace form of add_traces)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=predicted_humidity,
        mode='lines',
        name='Regression Fit',
        line=dict(color='red'),
    )
)

# Show the plot
fig.show()

In [58]:
# Cut-off temperatures: 10th and 90th percentiles of the full dataset
temp_lower_bound = df['Temperature'].quantile(0.10)
temp_upper_bound = df['Temperature'].quantile(0.90)

# Keep only rows whose temperature lies within the cut-offs
# (Series.between is inclusive on both ends by default)
within_bounds = df['Temperature'].between(temp_lower_bound, temp_upper_bound)
filtered_df = df[within_bounds]

# Report the temperature range that survives the filter
print(f"Filtered Data Range: {filtered_df['Temperature'].min()} to {filtered_df['Temperature'].max()}")


Filtered Data Range: 21.3 to 30.8


In [59]:
# Feature matrix (2-D) and target vector taken from the filtered rows
X_filtered = filtered_df[['Temperature']].to_numpy()
y_filtered = filtered_df['Humidity'].to_numpy()

# Fit a fresh linear model on the filtered data only
filtered_model = LinearRegression().fit(X_filtered, y_filtered)

# Evaluate the new model on the existing test_temps grid (the same grid used
# for the original model), so the two trend lines can be compared point by point
filtered_predicted_humidity = filtered_model.predict(test_temps)

# Report the refitted intercept and slope
print(f"New Intercept: {filtered_model.intercept_}, New Coefficient: {filtered_model.coef_[0]}")


New Intercept: 118.01217732426146, New Coefficient: -2.5003959542227228


In [60]:
# Scatter the filtered readings and overlay the trend line of the model
# fitted on them.
# In plotly.express the `labels` dict is keyed by *column name*; keys such as
# 'x' / 'y' match no column and are silently ignored, so the column names are
# used as keys here.
fig = px.scatter(
    filtered_df,
    x='Temperature',
    y='Humidity',
    opacity=0.65,
    labels={'Temperature': "Temperature", 'Humidity': "Humidity"},
    title="Filtered Temperature vs Humidity with Trend Line",
)

# Add the new trend line (add_trace is the single-trace form of add_traces)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=filtered_predicted_humidity,
        mode='lines',
        name='New Trend Line',
        line=dict(color='red'),
    )
)

# Show the interactive plot
fig.show()

In [61]:
# Compare the original and filtered fits on one figure.
# Base layer: all original readings. In plotly.express the `labels` dict is
# keyed by *column name*; keys such as 'x' / 'y' match no column and are
# silently ignored, so the column names are used as keys here.
fig = px.scatter(
    df,
    x='Temperature',
    y='Humidity',
    opacity=0.3,
    color_discrete_sequence=['blue'],
    labels={'Temperature': "Temperature", 'Humidity': "Humidity"},
    title="Comparison of Trend Lines Before and After Filtering",
)

# Trend line of the model fitted on the original data (dashed)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=predicted_humidity,
        mode='lines',
        name='Original Trend Line',
        line=dict(color='red', dash='dash'),
    )
)

# Readings that survived the filter, drawn on top of the original points
fig.add_trace(
    go.Scatter(
        x=filtered_df['Temperature'],
        y=filtered_df['Humidity'],
        mode='markers',
        marker=dict(color='green', opacity=0.5),
        name='Filtered Data',
    )
)

# Trend line of the model fitted on the filtered data
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=filtered_predicted_humidity,
        mode='lines',
        name='Filtered Trend Line',
        line=dict(color='black'),
    )
)

# Show the interactive plot
fig.show()

In [62]:
# Residuals of the original model on the data it was fitted to
residuals = y - model.predict(X)

# Count points whose absolute residual exceeds two standard deviations
two_sigma = 2 * np.std(residuals)
outlier_count = (np.abs(residuals) > two_sigma).sum()

print(f"Number of outliers detected: {outlier_count}")
# The fit is reported as following the data when fewer than 10% of the points are outliers
print(f"Does the trend line follow the data? {'Yes' if outlier_count < len(y) * 0.1 else 'Some deviations exist'}")


Number of outliers detected: 7
Does the trend line follow the data? Yes


In [63]:
# Second-stage cut-offs: 5th and 95th percentiles of the *already filtered*
# data, so this trims further on top of the first filter
temp_lower_bound_2 = filtered_df['Temperature'].quantile(0.05)
temp_upper_bound_2 = filtered_df['Temperature'].quantile(0.95)

# Keep rows inside the second-stage cut-offs
# (Series.between is inclusive on both ends by default)
within_bounds_2 = filtered_df['Temperature'].between(temp_lower_bound_2, temp_upper_bound_2)
filtered_df_2 = filtered_df[within_bounds_2]

# Feature matrix (2-D) and target vector from the second filtered dataset
X_filtered_2 = filtered_df_2[['Temperature']].to_numpy()
y_filtered_2 = filtered_df_2['Humidity'].to_numpy()

# Fit a new linear model on the further filtered data
filtered_model_2 = LinearRegression().fit(X_filtered_2, y_filtered_2)

# Evaluate it on the shared test_temps grid for comparison with earlier models
filtered_predicted_humidity_2 = filtered_model_2.predict(test_temps)

# Report the final intercept and slope
print(f"Final Model Intercept: {filtered_model_2.intercept_}, Final Coefficient: {filtered_model_2.coef_[0]}")


Final Model Intercept: 116.32771510741215, Final Coefficient: -2.44293512168533


In [64]:
# Scatter the second-stage filtered readings and overlay the trend line of
# the model fitted on them.
# In plotly.express the `labels` dict is keyed by *column name*; keys such as
# 'x' / 'y' match no column and are silently ignored, so the column names are
# used as keys here.
fig = px.scatter(
    filtered_df_2,
    x='Temperature',
    y='Humidity',
    opacity=0.65,
    labels={'Temperature': "Temperature", 'Humidity': "Humidity"},
    title="Second Filtered Temperature vs Humidity with Trend Line",
)

# Add the trend line for the second filtered data
# (add_trace is the single-trace form of add_traces)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=filtered_predicted_humidity_2,
        mode='lines',
        name='Final Trend Line',
        line=dict(color='green'),
    )
)

# Show the interactive plot
fig.show()


In [65]:
# Compare the trend lines of all three models against the original readings.
# In plotly.express the `labels` dict is keyed by *column name*; keys such as
# 'x' / 'y' match no column and are silently ignored, so the column names are
# used as keys here.
fig = px.scatter(
    df,
    x='Temperature',
    y='Humidity',
    opacity=0.3,
    color_discrete_sequence=['blue'],
    labels={'Temperature': "Temperature", 'Humidity': "Humidity"},
    title="Comparison of Trend Lines Across Filtering Stages",
)

# Original trend line (model fitted on all data)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=predicted_humidity,
        mode='lines',
        name='Original Trend Line',
        line=dict(color='red', dash='dash'),
    )
)

# First filtered trend line (10th-90th percentile of the original data)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=filtered_predicted_humidity,
        mode='lines',
        name='Filtered Trend Line (10-90%)',
        line=dict(color='black'),
    )
)

# Second filtered trend line (5th-95th percentile taken on the first filtered set)
fig.add_trace(
    go.Scatter(
        x=test_temps.flatten(),
        y=filtered_predicted_humidity_2,
        mode='lines',
        name='Filtered Trend Line (5-95%)',
        line=dict(color='green'),
    )
)

# Show the final comparison plot
fig.show()


In [66]:
# Residuals of the final model on the second filtered dataset
residuals_final = y_filtered_2 - filtered_model_2.predict(X_filtered_2)

# Count points whose absolute residual exceeds two standard deviations
two_sigma_final = 2 * np.std(residuals_final)
final_outlier_count = (np.abs(residuals_final) > two_sigma_final).sum()

print(f"Final Number of Outliers Detected: {final_outlier_count}")
# The fit is reported as following the data when fewer than 10% of the points are outliers
print(f"Does the final trend line follow the data? {'Yes' if final_outlier_count < len(y_filtered_2) * 0.1 else 'Some deviations exist'}")

# Slopes of the three models, for side-by-side comparison
print(f"Original Model Slope: {model.coef_[0]}")
print(f"First Filtered Model Slope: {filtered_model.coef_[0]}")
print(f"Final Model Slope (Stricter Filtering): {filtered_model_2.coef_[0]}")

# Did the learned pattern change? Compare the absolute slope difference
# between the final and original models with a fixed threshold.
slope_shift = abs(filtered_model_2.coef_[0] - model.coef_[0])
print(
    "The model learned a different pattern after stricter filtering."
    if slope_shift > 0.1  # Adjust threshold if needed
    else "The trend remained mostly the same after stricter filtering."
)


Final Number of Outliers Detected: 5
Does the final trend line follow the data? Yes
Original Model Slope: -2.4075095968007396
First Filtered Model Slope: -2.5003959542227228
Final Model Slope (Stricter Filtering): -2.44293512168533
The trend remained mostly the same after stricter filtering.
