In [None]:
# All the libraries used in this project

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from xgboost import XGBRegressor

In [None]:
# Load the training and test CSV files from the Kaggle input directory.
TRAIN_CSV = '/kaggle/input/diamond-price-prediciton-2024/train.csv'
TEST_CSV = '/kaggle/input/diamond-price-prediciton-2024/test.csv'
df, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))
df.head()

In [None]:
# Preview the last rows of the training frame (counterpart to the head() preview).
df.tail()

In [None]:
# Report (rows, columns) for the training and the test frame.
for shape_label, frame in (('df_shape :', df), ('test_shape :', test)):
    print(shape_label, frame.shape)

In [None]:
# Data preprocessing: start by inspecting the training frame's column summary.
df.info()

In [None]:
# Count missing values per column of the training frame.
df.isna().sum()
# Author's observation from the cell output: there are no missing values.

In [None]:
# Summary statistics of the training frame.
df.describe()

In [None]:
# Scatter each listed feature against price (coloured by cut) on a grid of subplots.
columns_to_plot = ['carat', 'depth', 'table', 'x', 'y', 'z']
columns_per_row = 3
num_columns = len(columns_to_plot)
num_rows = -(-num_columns // columns_per_row)  # ceiling division
fig, axes = plt.subplots(
    num_rows,
    columns_per_row,
    figsize=(columns_per_row * 6, num_rows * 8),
)
# Row-major flattening: cell k corresponds to axes[k // columns_per_row, k % columns_per_row].
grid_cells = axes.ravel()
for ax, column in zip(grid_cells, columns_to_plot):
    sns.scatterplot(data=df, x=column, y='price', hue='cut', palette='viridis', ax=ax)
    ax.set_title(f'Scatter Plot of {column} vs Price')
    ax.set_xlabel(column)
    ax.set_ylabel('Price')
# Remove any grid cells left over when the features do not fill the last row.
for unused_ax in grid_cells[num_columns:]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.show()

In [None]:
# Show the rows where any of the 'x', 'y' or 'z' columns is exactly zero.
dimension_is_zero = df[['x', 'y', 'z']].eq(0)
df[dimension_is_zero.any(axis=1)]

In [None]:
# Keep only the rows whose 'x', 'y' and 'z' values are all non-zero.
all_dimensions_nonzero = df[['x', 'y', 'z']].ne(0).all(axis=1)
df = df[all_dimensions_nonzero]
df.shape

In [None]:
# Re-plot x, y and z against price to check the data after the zero rows were removed.
columns_without_zero = ['x', 'y', 'z']
num_columns = len(columns_without_zero)
fig, axes = plt.subplots(1, num_columns, figsize=(num_columns * 5, 6))
for ax, column in zip(axes, columns_without_zero):
    sns.scatterplot(data=df, x=column, y='price', hue='cut', palette='viridis', ax=ax)
    ax.set(
        title=f'Scatter Plot of {column} vs Price',
        xlabel=column,
        ylabel='Price',
    )
plt.tight_layout()
plt.show()

In [None]:
# Histogram (30 bins, with a KDE overlay) of the price column in df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Price')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of price for each cut category.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='cut', y='price', data=df, hue='cut', ax=ax)
ax.set_title('Price Distribution by Cut Quality')
ax.set_xlabel('Cut')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Pairwise relationships between carat, depth, table and price.
pairplot_columns = ['carat', 'depth', 'table', 'price']
sns.pairplot(df.loc[:, pairplot_columns])
plt.show()

In [None]:
# Remove rows with out-of-range dimensions: x, y and z must be below the upper
# bound, and z must additionally be above the lower bound.
DIMENSION_UPPER_BOUND = 30
Z_LOWER_BOUND = 2
within_bounds = (
    (df["x"] < DIMENSION_UPPER_BOUND)
    & (df["y"] < DIMENSION_UPPER_BOUND)
    & (df["z"] < DIMENSION_UPPER_BOUND)
    & (df["z"] > Z_LOWER_BOUND)
)
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding columns to it later does not trigger SettingWithCopyWarning.
df = df[within_bounds].copy()
df.shape

In [None]:
# Feature engineering on the training frame:
#   combined_column - the cut, color and clarity labels joined with '_'
#   volume          - the product x * y * z
columns_to_merge = ['cut', 'color', 'clarity']
first_column, *other_columns = columns_to_merge
# Vectorised string concatenation; avoids calling a Python lambda once per row.
# A missing value in any of the three columns yields NaN for that row.
df['combined_column'] = df[first_column].str.cat(df[other_columns], sep='_')
df['volume'] = df['x'] * df['y'] * df['z']
df.head()

In [None]:
# Collect the names of the object-dtype (categorical) columns.
object_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical variables:")
print(object_cols)

In [None]:
# For each categorical column, print its distinct values followed by their frequencies.
for categorical_column in ('cut', 'color', 'clarity', 'combined_column'):
    levels = df[categorical_column]
    print(levels.unique())
    print(levels.value_counts())

In [None]:
# Convert the categorical columns of the training frame to integer codes,
# using one LabelEncoder per column.
cut_encoder = LabelEncoder()
color_encoder = LabelEncoder()
clarity_encoder = LabelEncoder()
combined_column_encoder = LabelEncoder()

encoder_by_column = {
    'cut': cut_encoder,
    'color': color_encoder,
    'clarity': clarity_encoder,
    'combined_column': combined_column_encoder,
}
for column_name, column_encoder in encoder_by_column.items():
    df[column_name] = column_encoder.fit_transform(df[column_name])


def _label_to_code(fitted_encoder):
    """Return a dict mapping each original label to its integer code."""
    labels = fitted_encoder.classes_
    return dict(zip(labels, fitted_encoder.transform(labels)))


# Build and display the label -> code mappings.
cut_mapping = _label_to_code(cut_encoder)
color_mapping = _label_to_code(color_encoder)
clarity_mapping = _label_to_code(clarity_encoder)
combined_column_mapping = _label_to_code(combined_column_encoder)
print("Cut mapping:", cut_mapping)
print("Color mapping:", color_mapping)
print("Clarity mapping:", clarity_mapping)
print("combined_column mapping:",combined_column_mapping)


In [None]:
# Pairwise correlation between the columns of df.
df.corr()

In [None]:
# Draw the correlation matrix of df as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
    ax=ax,
)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()

In [None]:
# Remove the 'depth', 'table' and 'Id' columns from the training frame.
df = df.drop(columns=['depth', 'table', 'Id'])
df.shape

In [None]:
# Split off 20% of the rows as a hold-out set, then fit each candidate model
# and report its RMSE on that hold-out set.
y = df['price']
X = df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
models = [
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('XGBoost', XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)),
]
for name, model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("%s: RMSE = %f" % (name, rmse))

In [None]:
# Tune XGBoost hyper-parameters with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}
# Same objective and seed as the XGBoost baseline in the model-comparison cell.
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    # Select on RMSE, the metric used to compare the baseline models, instead
    # of the estimator's default R^2 score.
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
# best_score_ holds the negated RMSE (scikit-learn maximises scores), so flip the sign.
print("Best score (CV RMSE): ", -grid_search.best_score_)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Apply the same feature engineering to the test frame as to the training frame:
#   combined_column - the cut, color and clarity labels joined with '_'
#   volume          - the product x * y * z
columns_to_merge = ['cut', 'color', 'clarity']
first_column, *other_columns = columns_to_merge
# Vectorised string concatenation; avoids calling a Python lambda once per row.
# A missing value in any of the three columns yields NaN for that row.
test['combined_column'] = test[first_column].str.cat(test[other_columns], sep='_')
test['volume'] = test['x'] * test['y'] * test['z']
# Show a preview rather than printing the whole frame.
test.head()

In [None]:
# Remove the 'depth', 'table' and 'Id' columns from the test frame.
test = test.drop(columns=['depth', 'table', 'Id'])

In [None]:
# Convert the categorical columns of the test frame to integer codes.
#
# The encoders fitted on the training frame are reused here. Fitting fresh
# encoders on the test frame would assign codes from the test set's own sorted
# labels, which can differ from the codes the model was trained on whenever
# the two sets do not contain exactly the same labels.
fitted_encoders = {
    'cut': cut_encoder,
    'color': color_encoder,
    'clarity': clarity_encoder,
    'combined_column': combined_column_encoder,
}
for column_name, fitted_encoder in fitted_encoders.items():
    if pd.api.types.is_numeric_dtype(test[column_name]):
        # Already encoded (e.g. the cell was re-run); leave the column alone.
        continue
    # LabelEncoder codes are the positions of the labels within classes_.
    code_by_label = {label: code for code, label in enumerate(fitted_encoder.classes_)}
    encoded = test[column_name].map(code_by_label)
    unseen_count = int(encoded.isna().sum())
    if unseen_count:
        # LabelEncoder.transform would raise on labels absent from training;
        # give them a sentinel code instead and make the situation visible.
        print(f"{column_name}: {unseen_count} value(s) not seen in training, encoded as -1")
    test[column_name] = encoded.fillna(-1).astype(int)

# Display the label -> code mappings (those learned from the training frame).
cut_mapping = dict(zip(cut_encoder.classes_, cut_encoder.transform(cut_encoder.classes_)))
color_mapping = dict(zip(color_encoder.classes_, color_encoder.transform(color_encoder.classes_)))
clarity_mapping = dict(zip(clarity_encoder.classes_, clarity_encoder.transform(clarity_encoder.classes_)))
combined_column_mapping = dict(zip(combined_column_encoder.classes_, combined_column_encoder.transform(combined_column_encoder.classes_)))
print("Cut mapping:", cut_mapping)
print("Color mapping:", color_mapping)
print("Clarity mapping:", clarity_mapping)
print("combined_column mapping:",combined_column_mapping)


In [None]:
# Check the shape of the test DataFrame after preprocessing.
# No rows are removed from the test set, so only the column count has changed.
print("Shape of test DataFrame after preprocessing:", test.shape)

In [None]:
# Preview the preprocessed test frame (bare expression gives the rich table display).
test.head()

In [None]:
# Predict test prices with the best estimator found by the grid search.
# Select the columns explicitly in the training feature order so the prediction
# does not depend on the test frame happening to have the same column order.
y_pred = grid_search.predict(test[X_train.columns])

In [None]:
# Format the predictions according to the submission requirements: 'Id' then 'price'.
# The 'Id' column has been dropped from the feature frame by this point, so
# recover the real Ids from the test CSV instead of fabricating 1..n.
if 'Id' in test.columns:
    submission_ids = test['Id'].to_numpy()
else:
    submission_ids = pd.read_csv(
        '/kaggle/input/diamond-price-prediciton-2024/test.csv', usecols=['Id']
    )['Id'].to_numpy()
# No rows are removed from the test set, so Ids and predictions must line up one to one.
assert len(submission_ids) == len(y_pred), "Id / prediction length mismatch"
submission_df = pd.DataFrame({'Id': submission_ids, 'price': y_pred})
# Save the predictions to a CSV file
submission_df.to_csv('submission.csv', index=False)