In [1]:
!pip install pandas numpy scikit-learn xgboost



In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the three CSV inputs from the working directory: the labelled training
# rows, the test rows to predict, and the sample submission file.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [4]:
# Encoding Categorical Features
# Each categorical column is replaced, in both train_df and test_df, by integer
# codes from a per-column LabelEncoder. The fitted encoders are kept in
# encoder_dict so the codes can be mapped back to labels later.
cat_cols = ['string_id', 'error_code', 'installation_type']
encoder_dict = {}

for col in cat_cols:
    # Cast to str so missing values and mixed-type entries become ordinary
    # labels instead of tripping the encoder's uniform-type requirement.
    train_labels = train_df[col].astype(str)
    test_labels = test_df[col].astype(str)

    encoder = LabelEncoder()
    # Fit on the labels of BOTH frames: an encoder fitted on train only raises
    # ValueError in transform() for any label that occurs only in the test set.
    encoder.fit(pd.concat([train_labels, test_labels], ignore_index=True))

    train_df[col] = encoder.transform(train_labels)
    test_df[col] = encoder.transform(test_labels)
    encoder_dict[col] = encoder

In [8]:
# Separate the target from the predictors. The "id" column is left out of both
# feature matrices; "efficiency" is the value being predicted.
y = train_df["efficiency"]
X = train_df.drop(["id", "efficiency"], axis="columns")
X_test = test_df.drop(["id"], axis="columns")

# Detect non-numeric values in numeric columns and coerce them to NaN
def clean_numeric(df):
    """Coerce every text-typed column of ``df`` to a numeric dtype, in place.

    Columns whose dtype is ``object`` or a pandas string dtype are passed
    through ``pd.to_numeric(..., errors="coerce")``, so any entry that cannot
    be parsed as a number becomes ``NaN``. Columns of any other dtype are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be used in an
        assignment.
    """
    for col in df.columns:
        col_dtype = df[col].dtype
        # Comparing against 'object' alone skips columns stored with a
        # dedicated string dtype, which would then stay as text.
        is_text = (
            pd.api.types.is_object_dtype(col_dtype)
            or pd.api.types.is_string_dtype(col_dtype)
        )
        if is_text:
            # Go through object dtype so string-dtype columns are parsed the
            # same way as object columns.
            df[col] = pd.to_numeric(df[col].astype(object), errors='coerce')
    return df

# Coerce any text left in the feature columns to numbers (unparseable -> NaN)
X = clean_numeric(X)
X_test = clean_numeric(X_test)

# Handling missing values
# The column means are taken once from the training features, before filling,
# and reused for the test features so both frames get the same fill values.
train_means = X.mean()
X = X.fillna(train_means)
X_test = X_test.fillna(train_means)

# Rechecking categorical columns
# train_df/test_df may already hold integer codes for these columns; casting
# to str lets the same code path handle raw labels and existing codes alike.
for col in ['string_id', 'error_code', 'installation_type']:
    if col in train_df.columns:
        train_labels = train_df[col].astype(str)
        test_labels = test_df[col].astype(str)

        le = LabelEncoder()
        # Fit on the labels of both frames: fitting on train only makes
        # transform() raise ValueError for a label that occurs only in test.
        le.fit(pd.concat([train_labels, test_labels], ignore_index=True))

        X[col] = le.transform(train_labels)
        X_test[col] = le.transform(test_labels)


In [9]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the partition the same from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Model training: fit both regressors on the training split, then predict the
# validation split with each of them.

# Random forest with 200 trees and a fixed seed.
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# XGBoost with 200 boosting rounds, learning rate 0.1 and depth-6 trees.
xgb_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
).fit(X_train, y_train)

# Validation-split predictions, used for scoring below.
rf_preds = rf_model.predict(X_valid)
xgb_preds = xgb_model.predict(X_valid)

In [12]:
# Evaluation Function
def get_score(y_true, y_pred):
    """Return ``100 * (1 - RMSE)`` for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    float
        ``100 * (1 - sqrt(mean squared error))``; higher is better, and a
        perfect prediction scores 100.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return 100 * (1 - rmse)

# Report the validation score of each model, one line per model.
for label, valid_preds in (("RF", rf_preds), ("XGB", xgb_preds)):
    print(f"{label} Score:", get_score(y_valid, valid_preds))

RF Score: 89.07342713548911
XGB Score: 89.12593182087551


In [13]:
# Final training: refit both models on all labelled rows (train + validation).
for estimator in (rf_model, xgb_model):
    estimator.fit(X, y)

In [14]:
# Predict the test features with each fitted model.
rf_test_preds, xgb_test_preds = (
    estimator.predict(X_test) for estimator in (rf_model, xgb_model)
)

In [15]:
# Blend the two models with equal weight: element-wise mean of their
# test-set predictions.
ensemble_preds = np.mean([rf_test_preds, xgb_test_preds], axis=0)

In [16]:
# Submission: pair each test id with its blended prediction and write the
# two-column frame to CSV without the index.
final_submission = test_df[["id"]].assign(efficiency=ensemble_preds)

output_path = "final_submission.csv"
final_submission.to_csv(output_path, index=False)
print(f"Submission file saved as {output_path}")

Submission file saved as final_submission.csv
