In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'diamond-price-prediciton-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F77458%2F8457230%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240515%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240515T210655Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D99f27b77d5090ab993441be4e5958f85f540c27f7e30766653235aa1d8b2dee24e1b2ca5e8a1dfcc924ada47dbb2de87284fad9bdbd2aa0b818f1e7ffd65c22e587723f146a01cda2b0958466c256ab1946c33907f20f3044b876042b9fa78dbc4a83e508d65fbf78cab7da4f769c71650e9a2dc7a45849f2406aae15f506831a43229561ef149ad69821170fa36c4611724a5e955aebb723cf2f13e179e0e5d863f540b19be7486c0ed862710abe3610596d96e3871904664cededb82c804508505d35723f6eb0fe8ea7011ff943604767f3a59c1f9e13f9875b0f9d9bd6707550cfc6b2e44e3d48487303ac536bd9d74fad6e61c4cfc1550f8778e8058afe3'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading diamond-price-prediciton-2024, 885571 bytes compressed
Downloaded and uncompressed: diamond-price-prediciton-2024
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diamond-price-prediciton-2024/train.csv
/kaggle/input/diamond-price-prediciton-2024/test.csv


In [3]:
data=pd.read_csv('/kaggle/input/diamond-price-prediciton-2024/train.csv')
test_data=pd.read_csv('/kaggle/input/diamond-price-prediciton-2024/test.csv')


In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics

In [5]:

# Drop the 'Id' column from training data
data = data.drop(columns=['Id'])


In [6]:
#making sure that the data contain no null value
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43152 entries, 0 to 43151
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    43152 non-null  float64
 1   cut      43152 non-null  object 
 2   color    43152 non-null  object 
 3   clarity  43152 non-null  object 
 4   depth    43152 non-null  float64
 5   table    43152 non-null  float64
 6   price    43152 non-null  int64  
 7   x        43152 non-null  float64
 8   y        43152 non-null  float64
 9   z        43152 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.3+ MB


In [10]:
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0
mean,0.797855,61.747177,57.458347,3929.491912,5.731568,5.735018,3.538568
std,0.473594,1.435454,2.233904,3985.527795,1.121279,1.148809,0.708238
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,947.75,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5312.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [11]:
#Dropping dimentionless diamonds, cuz I can Have diamonds with 0 dimentionl,so I remove them
data = data.drop(data[data["x"]==0].index)
data = data.drop(data[data["y"]==0].index)
data = data.drop(data[data["z"]==0].index)
data.shape

(43135, 10)

In [12]:
# Get list of categorical variables,the resone I do this, is to trainsform the string values into numeric, that make the process easier to my model
s = (data.dtypes =="object")
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)

Categorical variables:
['cut', 'color', 'clarity']


In [13]:
# Make copy to avoid changing original data
label_data = data.copy()

# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in object_cols:
    label_data[col] = label_encoder.fit_transform(label_data[col])
label_data.head()



Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.06,2,5,3,61.8,57.0,4270,6.57,6.6,4.07
1,1.51,3,3,7,60.9,58.0,15164,7.38,7.42,4.51
2,0.32,2,2,5,61.3,56.0,828,4.43,4.41,2.71
3,0.53,2,3,5,61.2,56.0,1577,5.19,5.22,3.19
4,0.7,3,4,7,61.0,57.0,2596,5.76,5.72,3.5


In [23]:
# Combine training and test data for consistent label encoding
#The goal of this merging is to ensure that the same classes in the training and test data get the same codes.

combined_data = pd.concat([data[object_cols]])

# Fit label encoder on combined data
label_encoder = LabelEncoder()
for col in object_cols:
    label_encoder.fit(combined_data[col])
    data[col] = label_encoder.transform(data[col])
    test_data[col] = label_encoder.transform(test_data[col])



In [24]:
data.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,43135.0,43135.0,43135.0,43135.0,43135.0,43135.0,43135.0,43135.0,43135.0,43135.0
mean,0.797578,2.555164,2.593972,3.831645,61.747386,57.457925,3927.668691,5.731757,5.73507,3.539962
std,0.473325,1.027043,1.701973,1.724911,1.435091,2.233401,3983.324154,1.119336,1.147272,0.704884
min,0.2,0.0,0.0,0.0,43.0,43.0,326.0,3.73,3.68,1.07
25%,0.4,2.0,1.0,2.0,61.0,56.0,947.0,4.71,4.72,2.91
50%,0.7,2.0,3.0,4.0,61.8,57.0,2400.0,5.7,5.71,3.53
75%,1.04,3.0,4.0,5.0,62.5,59.0,5311.0,6.54,6.54,4.04
max,5.01,4.0,6.0,7.0,79.0,95.0,18823.0,10.74,58.9,31.8


In [25]:
# Assigning the featurs as X and trarget as y
X= label_data.drop(["price"],axis =1)
y= label_data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

Now!
let's train the model
Using the pipline


1.   I used the Random forest , cuz after trained many models, I found out that the best scors was given by the Rf,and it can deal with the non-linear data
2.   The XGBoots also gave a good score, but I choose in the end the Rf
the diffrence was not that big bitween them



In [26]:
pipeline_rf=Pipeline([("scalar3",StandardScaler()),("rf_classifier",RandomForestRegressor())])

pred= pipeline_rf.fit(X_train, y_train)
pred

# Now after training the model , it appears some good values on the training data , but not the same thing for the test data, it looks like overfitting
# let's use the cross validation to solve the overfitting, and calculating the mean square error

In [27]:
cv_score = cross_val_score(pipeline_rf, X_train,y_train,scoring="neg_root_mean_squared_error", cv=10)
print("RandomForest:",cv_score.mean())

RandomForest: -568.8786917047755


# **Finally,Testing the model**

In [28]:
# Model prediction on test data
pred = pipeline_rf.predict(X_test)
pred

array([ 640.3       , 5599.14      ,  557.81666667, ...,  860.02      ,
       7589.31      , 1994.88      ])

In [30]:
# Preprocessing test_data similarly to training data
# Dropping dimensionless diamonds in test data
test_data = test_data.drop(test_data[test_data["x"] == 0].index)
test_data = test_data.drop(test_data[test_data["y"] == 0].index)
test_data = test_data.drop(test_data[test_data["z"] == 0].index)


In [31]:
test_data.describe()

Unnamed: 0,Id,carat,cut,color,clarity,depth,table,x,y,z
count,10785.0,10785.0,10785.0,10785.0,10785.0,10785.0,10785.0,10785.0,10785.0,10785.0
mean,5394.847844,0.798179,2.544738,2.594251,3.850626,61.758025,57.452471,5.731108,5.734154,3.540383
std,3114.534979,0.475693,1.030356,1.698545,1.723096,1.421274,2.236813,1.119821,1.111137,0.693069
min,1.0,0.2,0.0,0.0,0.0,54.0,50.0,3.73,3.71,2.31
25%,2698.0,0.4,2.0,1.0,2.0,61.1,56.0,4.72,4.73,2.91
50%,5395.0,0.7,2.0,3.0,4.0,61.8,57.0,5.7,5.71,3.52
75%,8092.0,1.04,3.0,4.0,5.0,62.5,59.0,6.54,6.54,4.03
max,10788.0,4.5,4.0,6.0,7.0,79.0,73.0,10.23,10.16,6.72


In [None]:
# Drop the 'Id' column from test data if it exists
if 'Id' in test_data.columns:
    test_data = test_data.drop(columns=['Id'])



In [37]:

# Making predictions on test data
test_pred = pipeline_rf.predict(test_data)

# Saving predictions to a CSV file
submission = pd.DataFrame({
    "price": test_pred
})
submission.to_csv("/kaggle/working/submission.csv", index=False)

submission

Unnamed: 0,Id,price
0,0,935.62
1,1,2893.86
2,2,832.38
3,3,2821.77
4,4,1089.57
...,...,...
10780,10783,1706.97
10781,10784,6753.62
10782,10785,4520.70
10783,10786,5003.28
