In [31]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load data
train_data = pd.read_csv("/Users/dhanalakshmijothi/Desktop/Kaggle/russian-car-plates-prices-prediction/train.csv")
test_data = pd.read_csv("/Users/dhanalakshmijothi/Desktop/Kaggle/russian-car-plates-prices-prediction/test.csv")

# Data preprocessing
train_data['region_code'] = train_data['plate'].str[6:]
train_data['region_code_letters'] = train_data['plate'].str[:3]
test_data['region_code'] = test_data['plate'].str[6:]
test_data['region_code_letters'] = test_data['plate'].str[:3]

# Label encoding for region_code and region_code_letters
label_encoder = LabelEncoder()
train_data['region_code'] = label_encoder.fit_transform(train_data['region_code'])
test_data['region_code'] = label_encoder.transform(test_data['region_code'])
train_data['region_code_letters'] = label_encoder.fit_transform(train_data['region_code_letters'])
test_data['region_code_letters'] = label_encoder.transform(test_data['region_code_letters'])

# Log transformation for target variable
train_data['log_price'] = np.log1p(train_data['price'])

# Convert date to ordinal
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])

# Extract useful features from the 'date' column
train_data['year'] = train_data['date'].dt.year
train_data['month'] = train_data['date'].dt.month
train_data['day'] = train_data['date'].dt.day
train_data['day_of_week'] = train_data['date'].dt.dayofweek  # 0=Monday, 6=Sunday
train_data['is_weekend'] = (train_data['day_of_week'] >= 5).astype(int)  # 1 if Saturday/Sunday, else 0
train_data['day_of_year'] = train_data['date'].dt.dayofyear
train_data['week_of_year'] = train_data['date'].dt.isocalendar().week

test_data['year'] = test_data['date'].dt.year
test_data['month'] = test_data['date'].dt.month
test_data['day'] = test_data['date'].dt.day
test_data['day_of_week'] = test_data['date'].dt.dayofweek
test_data['is_weekend'] = (test_data['day_of_week'] >= 5).astype(int)
test_data['day_of_year'] = test_data['date'].dt.dayofyear
test_data['week_of_year'] = test_data['date'].dt.isocalendar().week

# Prepare features and target variable
X = train_data.drop(columns=['id', 'plate', 'price', 'log_price', 'date'])
y = train_data['log_price']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Base model: XGBoost
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, max_depth=4)
xgb_model.fit(X_train, y_train)

# Make predictions from the base model
xgb_pred_train = xgb_model.predict(X_train)
xgb_pred_valid = xgb_model.predict(X_valid)

# Train the meta-model (Linear Regression) using the base model's predictions
meta_model = LinearRegression()
meta_model.fit(xgb_pred_train.reshape(-1, 1), y_train)  # Training Linear Regression with base model predictions

# Make final predictions using the meta-model (Linear Regression)
final_predictions = meta_model.predict(xgb_pred_valid.reshape(-1, 1))

# Calculate SMAPE for validation set
def smape(y_true, y_pred):
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return 100 * np.mean(numerator / denominator)

# Calculate SMAPE on validation predictions
smape_score = smape(y_valid, final_predictions)
print(f'Validation SMAPE: {smape_score}')

# Now make predictions on the test set
xgb_pred_test = xgb_model.predict(test_data.drop(columns=['id', 'plate', 'price', 'date']))
final_test_predictions = meta_model.predict(xgb_pred_test.reshape(-1, 1))

# Convert back from log transformation
final_test_predictions = np.expm1(final_test_predictions)

# Prepare submission
submission = pd.DataFrame({
    'id': test_data['id'],
    'price': final_test_predictions
})

submission.to_csv('/Users/dhanalakshmijothi/Desktop/Kaggle/russian-car-plates-prices-prediction/submission.csv', index=False)

print('Submission file created successfully.')



Validation SMAPE: 5.44202620915723
Submission file created successfully.
