# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Data

In [None]:
# Read the competition CSVs into DataFrames
train_data = pd.read_csv(filepath_or_buffer='train.csv')  # Replace with your training data file path
test_data = pd.read_csv(filepath_or_buffer='test.csv')    # Replace with your test data file path

# Preview the top rows of the training frame
train_data.head(n=5)

# Handling missing values

In [None]:
# Per-column count of missing entries, first for the training frame, then for the test frame
print("Missing values in training data:", train_data.isnull().sum(), sep="\n")
print("\nMissing values in test data:", test_data.isnull().sum(), sep="\n")

In [None]:
# Fill missing values in the categorical columns with each column's mode.
#
# The filled Series is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `frame[column]`: the in-place form operates on
# an intermediate object, raises a FutureWarning on pandas 2.x and does not
# update the DataFrame at all once Copy-on-Write is enabled.
categorical_columns = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

# Training data: each column is filled with its own most frequent value
for column in categorical_columns:
    train_data[column] = train_data[column].fillna(train_data[column].mode()[0])

# NOTE: the numerical column is deliberately left unfilled here (mean fill disabled)
#train_data['Weight Capacity (kg)'].fillna(train_data['Weight Capacity (kg)'].mean(), inplace=True)  # Fill with mean for numerical column

# Test data: each column is filled with the test set's own most frequent value
for column in categorical_columns:
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

#test_data['Weight Capacity (kg)'].fillna(test_data['Weight Capacity (kg)'].mean(), inplace=True)  # Fill with mean for numerical column

# Check for missing values again
print("Missing values in training data after filling:")
print(train_data.isnull().sum())

print("\nMissing values in test data after filling:")
print(test_data.isnull().sum())

# Inspect Columns Before Encoding

In [None]:
# Show the column index of each frame before any renaming or encoding
print("Training Data Columns:", train_data.columns, sep="\n")
print("\nTest Data Columns:", test_data.columns, sep="\n")

In [None]:
# Normalize the column labels of both frames: lower-case, surrounding whitespace removed
for frame in (train_data, test_data):
    frame.columns = frame.columns.str.lower().str.strip()

# Encoding Categorical Variables

In [None]:
# Integer-encode each listed column. The encoder is fitted on the training
# column only and then applied to both frames; fitted encoders are kept in
# `label_encoders`, keyed by column name.
columns_to_encode = ['brand', 'material', 'size', 'compartments', 'laptop compartment', 'waterproof', 'style', 'color']
label_encoders = {}
for column in columns_to_encode:
    le = LabelEncoder().fit(train_data[column])
    train_data[column] = le.transform(train_data[column])
    test_data[column] = le.transform(test_data[column])
    label_encoders[column] = le

# Splitting the dataset

In [None]:
# Target is the price column; features are everything except the row id and the target
y = train_data['price']
X = train_data.drop(columns=['id', 'price'])

# Hold out 20% of the rows for validation (seeded so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Training the Model

In [None]:
# Baseline random forest: 100 trees, fixed seed, fitted on the training split
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the baseline on the held-out validation rows
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

# Hyperparameter Tuning (Random Forest)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the random forest (3 x 4 x 3 = 36 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# The forest is seeded so the search is reproducible and is scored under the
# same seed that is used when the winning configuration is refitted afterwards.
# n_jobs=-1 spreads the 36 x 5 fits over all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

# Train the Model with Best Parameters

In [None]:
# Build a seeded random forest from the grid-search winners
# (n_estimators, max_depth, min_samples_split)
best_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)

# Train it on the training split
best_model.fit(X_train, y_train)

# Evaluate the Model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Predict the validation rows with the tuned model
y_val_pred = best_model.predict(X_val)

# MAE, MSE and R² of those predictions against the true validation prices
mae, mse, r2 = (
    metric(y_val, y_val_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the three metrics, one per line
print(
    f'Mean Absolute Error (MAE): {mae:.2f}',
    f'Mean Squared Error (MSE): {mse:.2f}',
    f'R-squared (R²): {r2:.2f}',
    sep='\n',
)

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the baseline `model` over the full X / y.
# The scorer returns negated MSE, so the sign is flipped before printing.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Cross-validated MSE: ", -scores.mean())

In [None]:
# Importances reported by the tuned model, paired with the feature names
importances = best_model.feature_importances_
features = X.columns

# One row per feature, most important first
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importances')
plt.show()

# Visualization

In [None]:
# Same chart as for the tuned model, but for the baseline `model`
feature_importances = model.feature_importances_
features = X.columns

# Rank the features by importance, highest first
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importances')
plt.show()

# Confusion Matrix

In [None]:
# Price bands used to turn the regression target into four ordered classes
labels = ['Low', 'Medium', 'High', 'Very High']
bins = [0, 50, 100, 150, 200]  # Adjust these values based on your data

# Band the true prices: once as a new column on the training frame,
# once for the validation targets
train_data['price_category'] = pd.cut(train_data['price'], bins, labels=labels)
y_val_category = pd.cut(y_val, bins, labels=labels)

In [None]:
# Baseline-model predictions for the validation rows
y_pred = model.predict(X_val)

# Band the predicted prices with the same edges and names as the true prices
y_pred_category = pd.cut(y_pred, bins, labels=labels)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

# Confusion matrix of true vs. predicted price bands.
# `labels=labels` fixes the row/column order to Low, Medium, High, Very High;
# without it the classes are sorted alphabetically (High, Low, Medium, Very High),
# which scrambles the natural price order and makes the matrix hard to read.
conf_matrix = confusion_matrix(y_val_category, y_pred_category, labels=labels)

# Weighted F1: per-class F1 averaged by class support
f1 = f1_score(y_val_category, y_pred_category, average='weighted')

# Fraction of validation rows whose predicted band equals the true band
accuracy = accuracy_score(y_val_category, y_pred_category)

# Print the results
print("Confusion Matrix:")
print(conf_matrix)
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Predict the test set with the baseline `model`, using every column except the row id
test_features = test_data.drop(columns=['id'])
test_predictions = model.predict(test_features)

# Pair each id with its predicted price
predictions_df = pd.DataFrame(
    {'id': test_data['id'], 'predicted_price': test_predictions}
)

# Write the result without the DataFrame index
predictions_df.to_csv('predicted_prices.csv', index=False)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [None]:
from sklearn.preprocessing import LabelEncoder

# Show the dtype of every feature column and of the target
print(X.dtypes)
print(y.dtypes)

# A target stored as object dtype is label-encoded to integers; any other dtype is left alone
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# One-hot encode the categorical feature columns, dropping the first level of each
X = pd.get_dummies(X, drop_first=True)

# Load and Preprocess Data

In [None]:
# Load your dataset
# train_data = pd.read_csv('train_data.csv')  # Uncomment and replace with your actual data path

# 'price' is the target; 'id' is a row identifier, not a feature.
# 'price_category' (added to train_data when the price bins were built) is
# derived directly from the target, so it must be excluded as well, otherwise
# the model is trained on leaked target information. It is dropped only when
# present so the cell also runs on a freshly loaded frame.
non_feature_columns = ['id', 'price']
if 'price_category' in train_data.columns:
    non_feature_columns.append('price_category')

X = train_data.drop(columns=non_feature_columns)
y = train_data['price']

# Split the data into training and test sets (80/20, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree Regressor

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Regression target: the actual price column
y = train_data['price']

# Features: everything except the id, the target and the target-derived band
X = train_data.drop(columns=['id', 'price', 'price_category'])

# Inspect the dtypes of what is left
print(X.dtypes)
print(y.dtypes)

# One-hot encode the categorical feature columns, dropping the first level of each
X = pd.get_dummies(X, drop_first=True)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a seeded decision tree on the training split
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Score it on the held-out split
dt_y_pred = dt_model.predict(X_test)
print(
    "Decision Tree Regressor:",
    f"MAE: {mean_absolute_error(y_test, dt_y_pred):.2f}",
    f"MSE: {mean_squared_error(y_test, dt_y_pred):.2f}",
    f"R²: {r2_score(y_test, dt_y_pred):.2f}",
    sep="\n",
)

# Gradient Boosting Regressor

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation for the remaining missing feature values
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'

# Fit on the training split only, then apply the learned means to both splits
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Wrap the imputed arrays back into DataFrames. The original row index is
# passed explicitly: without it the frames get a fresh 0..n-1 index while
# y_train / y_test keep their original labels, so any later label-based
# alignment between features and targets would silently pair the wrong rows.
X_train = pd.DataFrame(X_train_imputed, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_imputed, columns=X_test.columns, index=X_test.index)

In [None]:
# Seeded gradient boosting regressor with otherwise default settings
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Score it on the held-out split
gb_y_pred = gb_model.predict(X_test)
print(
    "\nGradient Boosting Regressor:",
    f"MAE: {mean_absolute_error(y_test, gb_y_pred):.2f}",
    f"MSE: {mean_squared_error(y_test, gb_y_pred):.2f}",
    f"R²: {r2_score(y_test, gb_y_pred):.2f}",
    sep="\n",
)

# Support Vector Regression

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# RBF-kernel SVR wrapped in a scaling pipeline. SVMs are not scale invariant:
# on raw features the kernel distance is dominated by whichever column has the
# largest numeric range. The scaler is fitted inside the pipeline, so it only
# ever sees the data passed to `fit`.
svr_model = make_pipeline(StandardScaler(), SVR(kernel='rbf'))  # You can also try 'linear' or 'poly'
svr_model.fit(X_train, y_train)

# Make predictions and evaluate on the held-out split
svr_y_pred = svr_model.predict(X_test)
print("\nSupport Vector Regression:")
print(f"MAE: {mean_absolute_error(y_test, svr_y_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, svr_y_pred):.2f}")
print(f"R²: {r2_score(y_test, svr_y_pred):.2f}")

# Hyperparameter Tuning

Decision Trees

In [None]:
# Depth and split-size candidates for the decision tree
dt_param_grid = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive 5-fold search over the grid, scored by negated MSE
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
dt_grid_search.fit(X_train, y_train)

print("\nBest parameters for Decision Tree:", dt_grid_search.best_params_)

Gradient Boosting Hyperparameter Tuning

In [None]:
# Tree-count, depth and learning-rate candidates for gradient boosting
gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10],
    learning_rate=[0.01, 0.1, 0.2],
)

# Exhaustive 5-fold search over the grid, scored by negated MSE
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=gb_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
gb_grid_search.fit(X_train, y_train)

print("Best parameters for Gradient Boosting:", gb_grid_search.best_params_)

Support Vector Regression Hyperparameter Tuning

In [None]:
# Candidate values of the SVR regularization parameter C
svr_param_grid = dict(C=[0.1, 1])

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values that RandomizedSearchCV samples from, one list per hyperparameter
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Randomized search around the gradient boosting model: 100 sampled settings,
# 5-fold CV each, scored by negated MSE, seeded, using every available core
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    scoring='neg_mean_squared_error',
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Run the randomized hyperparameter search on the training split
random_search.fit(X=X_train, y=y_train)


In [None]:
# Take the estimator that scored best in the randomized search.
# NOTE: this rebinds `best_model`, which earlier in the notebook held the
# tuned random forest; from here on it refers to the gradient boosting model.
best_model = random_search.best_estimator_


In [None]:
# Score the tuned model on the held-out split (X_test / y_test)
y_val_pred = best_model.predict(X_test)
mae, mse, r2 = (
    metric(y_test, y_val_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report the winning hyperparameters followed by the held-out metrics
print("Best parameters found: ", random_search.best_params_)
print(
    "Evaluation on Test Set:",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)

In [None]:
# Reload the competition test set and run it through the SAME preprocessing
# that was applied to the training features. A raw reload has the original
# column names and string categories, none of which match the columns the
# model was trained on; aligning such a frame to the training columns would
# replace every feature with the fill value and yield constant predictions.
test_data = pd.read_csv('test.csv')  # Replace with your actual test data path

# 1. Same column-name normalization as the training frame
test_data.columns = test_data.columns.str.lower().str.strip()

# 2. Same mode fill for the categorical columns (test set's own mode, as before)
for column in ['brand', 'material', 'size', 'laptop compartment', 'waterproof', 'style', 'color']:
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

# 3. Same integer encoding, using the encoders fitted on the training data
for column, encoder in label_encoders.items():
    test_data[column] = encoder.transform(test_data[column])

# 4. Same feature set and column order as the training matrix, then the same
#    mean imputation (means learned from the training split)
X_test_final = test_data.drop(columns=['id'])[list(X_train.columns)]
X_test_final = pd.DataFrame(
    imputer.transform(X_test_final),
    columns=X_test_final.columns,
    index=X_test_final.index,
)

In [None]:
# One-hot encode the training features, dropping the first level of each
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the final test features the same way, then force them onto the
# training column layout (columns absent from the test frame are filled with 0)
X_test_final = pd.get_dummies(X_test_final, drop_first=True).reindex(
    columns=X_train.columns,
    fill_value=0,
)

# Predict prices for the final test features
y_test_pred = best_model.predict(X_test_final)

# Write id / price pairs to the submission file, without the DataFrame index
sample_submission = pd.DataFrame({'id': test_data['id'], 'price': y_test_pred})  # Adjust based on your test data
sample_submission.to_csv('sample_submission.csv', index=False)

In [None]:
# Persist the tuned model to disk with joblib
joblib.dump(value=best_model, filename='best_gradient_boosting_model.pkl')