In [None]:
import kagglehub

# Fetch the latest version of the HVAC dataset from Kaggle and remember where
# kagglehub placed the downloaded files.
DATASET_HANDLE = "satpassion/hvac-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the data.
# The workbook is read from the Colab upload location when it exists there.
# Otherwise (this cell previously failed with FileNotFoundError) fall back to
# the first .xlsx file found under `path`, the kagglehub download directory
# set by the download cell at the top of the notebook.
# NOTE(review): assumes the Kaggle dataset contains this workbook — confirm.
from pathlib import Path

data_file = Path('/content/hvac.xlsx')
if not data_file.is_file():
    workbooks = sorted(Path(path).rglob('*.xlsx'))
    if not workbooks:
        raise FileNotFoundError(
            f"No such file: '{data_file}', and no .xlsx file found under '{path}'"
        )
    data_file = workbooks[0]
df = pd.read_excel(data_file)

# Select features to use (you can expand this list)
features = ['Description - Cleaned', 'Consolidated - Element Classificaition',
            'Manufacturer', 'Model Number', 'Consolidated - Refrigerant']

# Target column; defined once because the name contains a double space that is
# easy to mistype when repeated.
target = 'Consolidated  - Tonnage'

# Drop rows with missing target (Tonnage)
df = df.dropna(subset=[target])

# Fill missing values in selected features with an explicit 'Unknown' category
df[features] = df[features].fillna('Unknown')

# One-hot encode categorical features
X = pd.get_dummies(df[features])

# Target
y = df[target]

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Evaluate: RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)


FileNotFoundError: [Errno 2] No such file or directory: '/content/hvac.xlsx'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load data.
# Prefer the Colab upload location; when the workbook is not there, fall back
# to the first .xlsx file found under `path`, the kagglehub download directory
# set by the download cell at the top of the notebook, instead of failing on a
# hard-coded absolute path.
# NOTE(review): assumes the Kaggle dataset contains this workbook — confirm.
from pathlib import Path

data_file = Path('/content/hvac.xlsx')
if not data_file.is_file():
    workbooks = sorted(Path(path).rglob('*.xlsx'))
    if not workbooks:
        raise FileNotFoundError(
            f"No such file: '{data_file}', and no .xlsx file found under '{path}'"
        )
    data_file = workbooks[0]
df = pd.read_excel(data_file)

# Select features (you can adjust or add more)
features = ['Description - Cleaned', 'Consolidated - Element Classificaition',
            'Manufacturer', 'Model Number', 'Consolidated - Refrigerant']
# The target column name contains a double space; keep it in one place.
target = 'Consolidated  - Tonnage'

# Drop rows with missing target
df = df.dropna(subset=[target])

# Fill missing values in features with an explicit 'Unknown' category
df[features] = df[features].fillna('Unknown')

# Split dataset (80/20, fixed seed for reproducibility)
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing for categorical data.
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present in the training rows.
categorical_features = features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Column transformer: every selected feature goes through the one-hot encoder
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)]
)

# Define the model: encoding and regressor are fitted together as one pipeline,
# so the encoder only learns categories from the training rows.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42))
])

# Train model
model.fit(X_train, y_train)

# Predict and evaluate on the held-out rows
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Improved RMSE:", rmse)
print("Improved R² Score:", r2)


Improved RMSE: 6.255446866095601
Improved R² Score: 0.7124210057820031


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load and prepare data.
# Prefer the Colab upload location; when the workbook is not there, fall back
# to the first .xlsx file found under `path`, the kagglehub download directory
# set by the download cell at the top of the notebook, instead of failing on a
# hard-coded absolute path.
# NOTE(review): assumes the Kaggle dataset contains this workbook — confirm.
from pathlib import Path

from sklearn.impute import SimpleImputer

data_file = Path('/content/hvac.xlsx')
if not data_file.is_file():
    workbooks = sorted(Path(path).rglob('*.xlsx'))
    if not workbooks:
        raise FileNotFoundError(
            f"No such file: '{data_file}', and no .xlsx file found under '{path}'"
        )
    data_file = workbooks[0]
df = pd.read_excel(data_file)

# Target column; the name contains a double space, so keep it in one place.
target = 'Consolidated  - Tonnage'
df = df.dropna(subset=[target])

# Feature engineering
# Combine useful text columns into one feature. Missing values are replaced
# BEFORE the string cast: astype(str) would otherwise turn NaN into the literal
# text 'nan', which a later fillna('Unknown') can no longer detect.
df['Description Combined'] = (
    df['Description - Cleaned'].fillna('Unknown').astype(str)
    + " | "
    + df['System Type'].fillna('Unknown').astype(str)
)

# Installation year; unparseable or missing dates become NaN here and are
# imputed inside the model pipeline (see the 'num' transformer below), so the
# median is learned from the training rows only rather than from all rows.
df['Installation Year'] = pd.to_datetime(df['Installation Date'], errors='coerce').dt.year

# Categorical columns: fill missing with 'Unknown'
categorical_cols = ['Description Combined', 'Consolidated - Element Classificaition',
                    'Manufacturer', 'Model Number', 'Consolidated - Refrigerant']
for col in categorical_cols:
    df[col] = df[col].fillna('Unknown')

# Select features: the categorical columns followed by the numeric year.
# No blanket fillna('Unknown') over `features` here: the categorical columns
# are already filled above, and a string must never be written into the
# numeric 'Installation Year' column.
categorical_features = list(categorical_cols)
numeric_features = ['Installation Year']
features = categorical_features + numeric_features

X = df[features]
y = df[target]

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the categoricals (categories unseen during
# fitting are ignored at predict time) and median-impute the year.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', SimpleImputer(strategy='median'), numeric_features)
    ])

# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=250, max_depth=8, learning_rate=0.1, random_state=42))
])

# Fit model
model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("XGBoost RMSE:", rmse)
print("XGBoost R² Score:", r2)


XGBoost RMSE: 5.512657595193009
XGBoost R² Score: 0.7766620539321027


In [None]:
# Example prediction for a single piece of equipment.
# NOTE(review): predict_xgb is not defined in any cell visible here; it must be
# defined and executed before this cell can run on a fresh kernel.
sample_unit = {
    "description": "CHILLER - CH-2",
    "desc_cleaned": "chiller",
    "element_classification": "110 - CHILLER - RECIPROCATING",
    "manufacturer": "CARRIER",
    "model_number": "30HK050-640",
    "refrigerant": "R22",
    "year": 1990,
}
result = predict_xgb(**sample_unit)

print("Predicted Tonnage:", result)


Predicted Tonnage: 47.85
