In [160]:
import pandas as pd

In [161]:
# Load the gemstone dataset from the project-relative CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/gemstone.csv")

In [162]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


Checking Null and Duplicate Values

In [163]:
# Count the missing values in each column.
df.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [164]:
# Count rows that are exact repeats of an earlier row (first occurrence is not counted).
df.duplicated(keep="first").sum()

0

Dropping the unnecessary columns

In [165]:
# Remove the 'id' column, which is not used as a feature.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned in place, so
# on a second execution 'id' is already gone and a plain drop would raise
# KeyError. With errors='ignore' the repeat run is a harmless no-op.
df = df.drop(columns=['id'], errors='ignore')

Splitting the independent and dependent features. Both are kept as DataFrames, not Series.

In [166]:
# Features: every column except the last one.
X = df.iloc[:, :-1]
# Target: the last column, converted from a Series to a one-column DataFrame.
Y = df.iloc[:, -1].to_frame()

In [167]:
# Other way of splitting Independent and Dependent Features

# X=df.drop(['price'],axis=1)
# Y=df['price']   (this returns a Series; use df[['price']] to get a DataFrame)

In [168]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [169]:
# Print a summary of X: column names, non-null counts, dtypes and memory usage.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   carat    193573 non-null  float64
 1   cut      193573 non-null  object 
 2   color    193573 non-null  object 
 3   clarity  193573 non-null  object 
 4   depth    193573 non-null  float64
 5   table    193573 non-null  float64
 6   x        193573 non-null  float64
 7   y        193573 non-null  float64
 8   z        193573 non-null  float64
dtypes: float64(6), object(3)
memory usage: 13.3+ MB


In [170]:
# Preview the first five rows of the target frame.
Y.head(n=5)

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453


In [171]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

Separating the categorical and numerical columns of the independent features only (X), so that the output (dependent) feature is not included.

In [172]:
# Split the feature columns by dtype: 'object' columns are treated as
# categorical, everything else as numerical.
numerical_features = X.columns[X.dtypes.ne('object')]
categorical_features = X.columns[X.dtypes.eq('object')]

print(numerical_features)
print(categorical_features)

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
Index(['cut', 'color', 'clarity'], dtype='object')


In [173]:
# Other way of separating Categorical and Numerical features
# categorical_cols = X.select_dtypes(include='object').columns
# numerical_cols = X.select_dtypes(exclude='object').columns

In [174]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

Creating a list of categories in a sequence according to their ranks

In [175]:
# Show the distinct labels of each categorical column (in order of first appearance)
# so the ranked category lists below can be written out by hand.
print(df['cut'].unique())
print(df['color'].unique())
print(df['clarity'].unique())

['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
['F' 'J' 'G' 'E' 'D' 'H' 'I']
['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']


For example, among the cut categories, Ideal has rank 5, which is the highest rank. (OrdinalEncoder assigns codes starting at 0, so Ideal is encoded as 4.)

In [176]:
# Hand-written category orderings passed to the OrdinalEncoder in the
# categorical pipeline; a label's position in its list becomes its encoded value.
# Every label printed by the unique() calls above must appear in its list.
# cut: ordered so that 'Ideal' (the highest rank per the note above) comes last.
cut_categories=['Fair','Good','Very Good','Premium','Ideal']
# NOTE(review): color is listed alphabetically D -> J. Confirm this direction
# matches the intended ranking used for cut and clarity — TODO confirm.
color_categories=['D', 'E', 'F', 'G', 'H', 'I', 'J']
# NOTE(review): presumably ordered lowest -> highest clarity grade — TODO confirm.
clarity_categories=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [177]:
# Numerical columns: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ('Imputer', SimpleImputer(strategy='mean')),
    ('Scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent label, map
# labels to integer codes using the hand-written orderings, then standardise.
# The `categories` lists are matched to columns by position, so their order
# must follow categorical_features (cut, color, clarity).
categorical_pipeline = Pipeline([
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ('Ordinal', OrdinalEncoder(categories=[
        cut_categories,
        color_categories,
        clarity_categories,
    ])),
    ('Scaler', StandardScaler()),
])

# Route each group of columns through its own pipeline and concatenate the
# results (numerical columns first, then categorical).
processor = ColumnTransformer(transformers=[
    ('num_pipe', numerical_pipeline, numerical_features),
    ('category_pipe', categorical_pipeline, categorical_features),
])

In [178]:
from sklearn.model_selection  import train_test_split

In [179]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.3,
)

In [180]:
# Fit the preprocessing (imputation values, encodings, scaling statistics) on
# the training split only, then apply the already-fitted processor to the test
# split, so no statistics are learned from the test rows.
X_train_scaled =processor.fit_transform(X_train)
X_test_scaled =processor.transform(X_test)

In [189]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [190]:
# Create one estimator per model family, each with default hyperparameters.
#
# The variables `Lasso` and `Ridge` reuse the names of the imported classes, so
# after this cell runs once those names refer to instances. Constructing through
# the module namespace (instead of the bare class names) keeps the cell
# re-runnable: a second execution would otherwise raise
# "TypeError: 'Lasso' object is not callable". The variable names are kept
# unchanged because later cells call Linear / Lasso / Ridge directly.
from sklearn import linear_model

Linear = linear_model.LinearRegression()
Lasso = linear_model.Lasso()
Ridge = linear_model.Ridge()

In [191]:
# Train the ordinary least-squares model on the preprocessed training split.
Linear.fit(X=X_train_scaled, y=y_train)

In [192]:
# Train the Lasso model on the preprocessed training split.
Lasso.fit(X=X_train_scaled, y=y_train)

In [193]:
# Train the Ridge model on the preprocessed training split.
Ridge.fit(X=X_train_scaled, y=y_train)

In [194]:
# Predict on the preprocessed test split with each fitted model, in the order
# Linear, Lasso, Ridge; the generator yields one prediction array per model.
Linear_predictions, Lasso_predictions, Ridge_predictions = (
    fitted_model.predict(X_test_scaled)
    for fitted_model in (Linear, Lasso, Ridge)
)

In [196]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [None]:
for i in 