<a href="https://colab.research.google.com/github/dilse-rasoty/Diamond_price_predictor/blob/main/diamond.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Read the dataset and discard its first column (treated here as a saved row index)
df = pd.read_csv('test1.csv')
df = df.drop(columns=df.columns[0])

# Label-encode every categorical column in place. The fitted encoder for each
# column is kept so that user input can be encoded the same way at prediction time.
categorical_cols = ['cut', 'color', 'clarity']
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# 'price' is the regression target; every remaining column is a feature
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for evaluation (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a random forest of 200 trees, each limited to depth 10
model = RandomForestRegressor(
    n_estimators=200, max_depth=10, random_state=42
).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R-squared score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.2f}")

def encode_category(encoder, category, categories):
    """Encode one category label, falling back to a known label when it is unseen.

    Args:
        encoder: Fitted encoder exposing ``classes_`` and ``transform``
            (callers in this file pass ``LabelEncoder`` instances).
        category: The label to encode.
        categories: Ordered fallback candidates. When ``category`` is unseen,
            the first candidate the encoder knows is encoded instead.

    Returns:
        The encoded value of ``category`` if the encoder knows it, otherwise
        the encoded value of the first known entry of ``categories``.

    Raises:
        ValueError: If neither ``category`` nor any entry of ``categories``
            is known to the encoder.
    """
    known = list(encoder.classes_)
    if category in known:
        return encoder.transform([category])[0]

    # The first candidate is not guaranteed to have appeared in the training
    # data, so look for the first one the encoder actually knows rather than
    # letting transform() fail on categories[0].
    for fallback in categories:
        if fallback in known:
            return encoder.transform([fallback])[0]

    raise ValueError(
        f"Cannot encode {category!r}: it is unseen and none of the fallback "
        f"categories {list(categories)!r} are known to the encoder"
    )

def _prompt_float(prompt):
    """Ask for a number until the user enters something ``float()`` accepts."""
    while True:
        raw = input(prompt)
        try:
            return float(raw)
        except ValueError:
            print(f"'{raw}' is not a valid number, please try again.")


def _match_label(raw, encoder):
    """Return the encoder class equal to ``raw`` ignoring case/outer whitespace, else the stripped input."""
    cleaned = raw.strip()
    for known in encoder.classes_:
        if str(known).casefold() == cleaned.casefold():
            return known
    return cleaned


def predict_diamond_price():
    """Interactively read one diamond's properties and print the predicted price.

    Relies on the module-level ``model``, ``label_encoders`` and ``X_train``
    created by the training code, and on ``encode_category`` for the
    categorical inputs. Numeric prompts are repeated until a valid number is
    entered. Categorical answers are matched against the trained labels
    ignoring case and surrounding whitespace; an answer that still does not
    match is replaced by a fallback category and the user is told so.

    Returns:
        None. The prediction is printed.
    """
    print("\nEnter diamond properties to predict its price:")

    # Get user input with no range limitations
    carat = _prompt_float("Carat: ")
    cut_raw = input("Cut (Fair, Good, Very Good, Premium, Ideal): ")
    color_raw = input("Color (D (best) to J (worst)): ")
    clarity_raw = input("Clarity (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1 (best)): ")
    depth = _prompt_float("Depth (%): ")
    table = _prompt_float("Table (%): ")
    # Named *_mm so they do not shadow the module-level target series ``y``
    x_mm = _prompt_float("x length (mm): ")
    y_mm = _prompt_float("y width (mm): ")
    z_mm = _prompt_float("z depth (mm): ")

    # Fallback candidates used when an answer is not a trained label
    fallback_categories = {
        'cut': ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
        'color': ['D', 'E', 'F', 'G', 'H', 'I', 'J'],
        'clarity': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1'],
    }
    raw_answers = {'cut': cut_raw, 'color': color_raw, 'clarity': clarity_raw}

    # Encode categorical variables with unseen category handling
    encoded = {}
    for name, raw in raw_answers.items():
        encoder = label_encoders[name]
        label = _match_label(raw, encoder)
        if label not in encoder.classes_:
            # Make the substitution visible instead of silently pricing a
            # different diamond than the one the user described.
            print(f"Warning: unrecognised {name} '{label}'; using a fallback category.")
        encoded[name] = encode_category(encoder, label, fallback_categories[name])

    # Same positional order as before, but labelled with the training columns
    # so the model does not warn about missing feature names.
    features = pd.DataFrame(
        [[carat, encoded['cut'], encoded['color'], encoded['clarity'],
          depth, table, x_mm, y_mm, z_mm]],
        columns=X_train.columns,
    )

    # Make prediction
    predicted_price = model.predict(features)[0]
    print(f"\nPredicted Diamond Price: ${predicted_price:,.2f}")

# Example usage: predict once, then repeat for as long as the user answers 'y'
another = 'y'
while another == 'y':
    predict_diamond_price()
    another = input("\nPredict another diamond? (y/n): ").lower()

R-squared score: 0.9797
Mean Squared Error: 322466.53

Enter diamond properties to predict its price:
Carat: 0.25
Cut (Fair, Good, Very Good, Premium, Ideal): Ideal
Color (D (best) to J (worst)): E
Clarity (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1 (best)): SI1
Depth (%): 61.5
Table (%): 55
x length (mm): 2.95
y width (mm): 2.98
z depth (mm): 3.43





Predicted Diamond Price: $447.26

Predict another diamond? (y/n): n
