# DIAMONDS PROJECT

In [1]:
import pandas as pd
import math
import matplotlib.pyplot as pls 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer

In [2]:
# Read the training set, the test set and the sample submission, in that order.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        './diamonds_train.csv',
        './diamonds_test.csv',
        './sample_submission.csv',
    )
)

## 1. Delete Train Zeros

##### Several rows have 0 in the dimension columns (x, y, z), and those values have to be replaced. The first step is to fix the rows where only `z` is missing: since depth = 100 · z / mean(x, y), `z` can be recovered as z = (depth / 100) · (x + y) / 2.

In [3]:
def change_z(df):
    """Return the z value of one row, rebuilt from depth when only z is missing.

    Parameters
    ----------
    df : pandas.Series or sequence
        A single row whose first four positions are ``x``, ``y``, ``z`` and
        ``depth`` (in that order), e.g. a row produced by
        ``frame[['x', 'y', 'z', 'depth']].apply(change_z, axis=1)``.

    Returns
    -------
    float
        ``(depth / 100) * (x + y) / 2`` when ``z`` is 0 and both ``x`` and
        ``y`` are non-zero; otherwise the original ``z`` unchanged.
    """
    # Rows coming from DataFrame.apply are label-indexed Series, so integer
    # keys must go through .iloc: plain row[0] on a labelled Series is a
    # deprecated positional fallback. Lists/tuples are indexed directly.
    cells = df.iloc if hasattr(df, 'iloc') else df

    x = cells[0]
    y = cells[1]
    z = cells[2]
    depth = cells[3]

    # Only rebuild z when it is the sole missing dimension; with x or y also
    # at zero the formula would just produce another meaningless value.
    if z == 0.0 and x != 0 and y != 0:
        return ((depth / 100) * (x + y)) / 2

    return z

In [4]:
# Rebuild z row by row in both data sets; change_z reads the four selected
# columns by position, so their order here matters.
for frame in (train, test):
    frame['z'] = frame[['x', 'y', 'z', 'depth']].apply(change_z, axis=1)


##### To replace the remaining zeros, since there were only a few of them, I used a manual technique that is not the most Pythonic: I looked up similar diamonds and hard-coded their dimensions.

In [5]:
# Reference frame: diamonds whose carat, depth, price and table fall in the
# ranges below (presumably the neighbourhood of the rows patched next).
# reset_index() returns a new frame, so the later drop no longer operates on a
# slice of `train` and does not raise SettingWithCopyWarning.
change_3zeros = train.loc[
    train.carat.between(0.68, 0.73)
    & train.depth.between(63, 65)
    & train.price.between(2080, 2180)
    & train.table.between(59, 61)
].reset_index(drop=True)

# Positions removed from the reference frame (presumably the zero rows
# themselves -- TODO confirm).
to_del = [2, 10]
change_3zeros = change_3zeros.drop(change_3zeros.index[to_del]).reset_index(drop=True)

# Hard-coded replacement dimensions for the affected train rows.
train.at[6465, 'x'] = 5.61
train.at[6465, 'y'] = 5.58
train.at[6465, 'z'] = 3.56
train.at[28029, 'x'] = 5.61
train.at[28029, 'y'] = 5.58
train.at[28029, 'z'] = 3.56
train.at[26192, 'x'] = 8.45
train.at[26192, 'y'] = 8.39
train.at[26192, 'z'] = 5.17

# Row 14815: z was computed as ((61.6 / 100) * (6.60 + 4.07)) / 2 ~= 3.29.
# The x value presumably comes from diamonds with carat 1.05-1.10,
# depth 61-62, price 4900-5000 and table 54-58 (the original exploratory query).
# To list rows that still contain a zero dimension:
#   train.loc[(train.x == 0) | (train.y == 0) | (train.z == 0)]
train.at[14815, 'z'] = 3.29
train.at[14815, 'x'] = 6.60

# Reference frame computed before row 34423 is patched, so that row is
# excluded by the x != 0 filter if its x is still zero at this point.
last_df = train.loc[
    train.carat.between(1.10, 1.16)
    & train.depth.between(53, 59)
    & train.price.between(5300, 7300)
    & train.table.between(60, 72)
    & (train.x != 0)
]

train.at[34423, 'x'] = 6.79
train.at[34423, 'y'] = 6.79
train.at[34423, 'z'] = 3.97


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## 2. Delete test zeros

In [6]:
# Each reference frame selects test diamonds with similar carat, depth and
# table and a non-zero x; it is presumably what the hard-coded replacement
# dimensions for the row patched right after it were read from.
test_1df = test.loc[
    test.carat.between(1.15, 1.25)
    & test.depth.between(61, 63)
    & test.table.between(58, 60)
    & (test.x != 0)
]
test.loc[2901, ['x', 'y', 'z']] = [6.81, 6.79, 4.21]

test_2df = test.loc[
    test.carat.between(0.98, 1.02)
    & test.depth.between(62.3, 64.3)
    & test.table.between(51, 55)
    & (test.x != 0)
]
test.loc[6685, ['x', 'y', 'z']] = [6.38, 6.37, 4.02]

test_3df = test.loc[
    test.carat.between(1.54, 1.58)
    & test.depth.between(62, 62.5)
    & test.table.between(52, 56)
    & (test.x != 0)
]
test.loc[7488, ['x', 'y', 'z']] = [7.41, 7.44, 4.62]


## 3. New Column -> Shape

##### Each diamond has a shape according to its cut, depth, table and length-to-width ratio. I took the shape definitions from lumeradiamonds.com.


In [7]:
def shapes(df):
    """Classify one diamond into a shape label from its proportions.

    Parameters
    ----------
    df : pandas.Series or sequence
        A single row whose first five positions are ``cut``, ``depth``,
        ``table``, ``x`` and ``y`` (in that order), e.g. a row produced by
        ``frame[['cut', 'depth', 'table', 'x', 'y']].apply(shapes, axis=1)``.

    Returns
    -------
    str
        One of ``'Round'``, ``'Cushion'``, ``'Emerald'``, ``'Heart'`` or
        ``'Unknown'``. Within each cut grade the rules are tried in that order
        and the first match wins; a generic near-square ratio falls back to
        ``'Round'``. ``'Unknown'`` is also returned when ``y`` is 0 (the
        length/width ratio is undefined) or when ``cut`` is not one of the
        five known grades.
    """
    # Rows coming from DataFrame.apply are label-indexed Series, so integer
    # keys must go through .iloc: plain row[0] on a labelled Series is a
    # deprecated positional fallback. Lists/tuples are indexed directly.
    cells = df.iloc if hasattr(df, 'iloc') else df

    cut = cells[0]
    depth = cells[1]
    table = cells[2]
    x = cells[3]
    y = cells[4]

    # Guard the division: with plain Python floats x / 0 raises
    # ZeroDivisionError, and no shape rule can match an undefined ratio anyway.
    if y == 0:
        return 'Unknown'

    ratio = x / y

    ## IDEAL & PREMIUM
    if cut == 'Ideal' or cut == 'Premium':

        if (53 <= table <= 58) and (59 <= depth <= 62.3) and (1 <= ratio <= 1.01):
            return 'Round'

        elif (61 <= table <= 67) and (61 <= depth <= 67) and (1.00 <= ratio <= 1.03 or 1.15 <= ratio <= 1.20):
            return 'Cushion'

        elif (61 <= table <= 69) and (61 <= depth <= 67) and (1.00 <= ratio <= 1.03 or 1.4 <= ratio <= 1.50):
            return 'Emerald'

        elif (53 <= table <= 63) and (58 <= depth <= 62) and (0.95 <= ratio <= 1.02):
            return 'Heart'

        elif 0.95 <= ratio <= 1.05:
            return 'Round'

        else:
            return 'Unknown'

    ## VERY GOOD
    elif cut == 'Very Good':

        if (52 <= table <= 53 or 58 <= table <= 60) and (58 <= depth <= 58.9 or 62.4 <= depth <= 63.5) and (1 <= ratio <= 1.01):
            return 'Round'

        elif (58 <= table <= 60 or 68 <= table <= 70) and (58 <= depth <= 60.9 or 67.1 <= depth <= 70) and (1.00 <= ratio <= 1.03 or 1.10 <= ratio <= 1.14 or 1.21 <= ratio <= 1.30):
            return 'Cushion'

        elif (57 <= table <= 60 or 70 <= table <= 72) and (59 <= depth <= 60.9 or 67.1 <= depth <= 70) and (1.00 <= ratio <= 1.03 or 1.30 <= ratio <= 1.39 or 1.51 <= ratio <= 1.60):
            return 'Emerald'

        elif (table == 52 or 64 <= table <= 65) and (56 <= depth <= 57.9 or 62.1 <= depth <= 66) and (0.89 <= ratio <= 0.94 or 1.03 <= ratio <= 1.05):
            return 'Heart'

        elif 0.90 <= ratio <= 1.05:
            return 'Round'

        else:
            return 'Unknown'

    ## GOOD
    elif cut == 'Good':

        if (table == 51 or 61 <= table <= 64) and (57.5 <= depth <= 57.9 or 63.6 <= depth <= 64.1) and (1 <= ratio <= 1.01):
            return 'Round'

        elif (table == 71 or 56 <= table <= 57) and (56 <= depth <= 57.9 or 70.1 <= depth <= 71) and (1.04 <= ratio <= 1.05 or 1.10 <= ratio <= 1.14 or 1.21 <= ratio <= 1.30):
            return 'Cushion'

        elif (54 <= table <= 56 or 73 <= table <= 74) and (57 <= depth <= 58.9 or 70.1 <= depth <= 74) and (1.04 <= ratio <= 1.05 or 1.20 <= ratio <= 1.29 or 1.61 <= ratio <= 1.80):
            return 'Emerald'

        elif (table == 51 or 66 <= table <= 68) and (53 <= depth <= 55.9 or 66.1 <= depth <= 71) and (0.83 <= ratio <= 0.87 or 1.06 <= ratio <= 1.10):
            return 'Heart'

        elif 0.90 <= ratio <= 1.05:
            return 'Round'

        else:
            return 'Unknown'

    ## FAIR
    elif cut == 'Fair':

        # math.isclose instead of ==: a computed x / y almost never equals the
        # literal 1.02 bit-for-bit even when the ratio is 1.02.
        if (table == 50 or 65 <= table <= 69) and (56.5 <= depth <= 57.4 or 64.2 <= depth <= 65) and math.isclose(ratio, 1.02):
            return 'Round'

        elif (54 <= table <= 57 or 72 <= table <= 73) and (54 <= depth <= 55.9 or 71.1 <= depth <= 73) and (1.06 <= ratio <= 1.09 or 1.31 <= ratio <= 1.50):
            return 'Cushion'

        elif (51 <= table <= 53 or 75 <= table <= 79) and (54 <= depth <= 56.9 or 74.1 <= depth <= 79) and (1.06 <= ratio <= 1.08 or 1.15 <= ratio <= 1.19 or 1.81 <= ratio <= 1.90):
            return 'Emerald'

        elif (table == 50 or 69 <= table <= 70) and (50 <= depth <= 52.9 or 71.1 <= depth <= 74) and (0.80 <= ratio <= 0.83 or 1.11 <= ratio <= 1.15):
            return 'Heart'

        elif 0.90 <= ratio <= 1.05:
            return 'Round'

        else:
            return 'Unknown'

    # Unrecognised cut grade: return an explicit label instead of falling off
    # the end of the function with an implicit None.
    return 'Unknown'

In [8]:
# Derive the 'shape' label row by row; shapes() reads the selected columns by
# position, so the column order in this list matters.
train['shape']= train[['cut','depth', 'table','x', 'y']].apply(shapes, axis=1)

In [9]:
# Same derivation for the test set, using the same column order as for train.
test['shape']= test[['cut','depth', 'table','x', 'y']].apply(shapes, axis=1)


In [10]:
# Preview the first rows to check the newly added 'shape' column.
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,shape
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,Round
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,Round
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,Round
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,Round
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,Heart


In [11]:
# Column to predict.
target = 'price'

# Columns holding labels vs. columns holding measurements.
cat_features = ['cut', 'color', 'clarity', 'shape']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Cast every label column to pandas' category dtype, train first and then
# test for each column.
for cat in cat_features:
    for frame in (train, test):
        frame[cat] = frame[cat].astype('category')



In [12]:
# Ordinal encodings: each label is replaced by its integer rank.
# NOTE: this cell is not idempotent -- Series.map turns values that are not
# keys of the mapping into NaN, so running it a second time on the already
# encoded columns would wipe them out.
cut_change = {
    'Fair': 0,
    'Good': 1,
    'Very Good': 2,
    'Premium': 3,
    'Ideal': 4,
}
color_change = {
    'J': 0,
    'I': 1,
    'H': 2,
    'G': 3,
    'F': 4,
    'E': 5,
    'D': 6,
}
clarity_change = {
    'I1': 0,
    'SI2': 1,
    'SI1': 2,
    'VS2': 3,
    'VS1': 4,
    'VVS2': 5,
    'VVS1': 6,
    'IF': 7,
}
shape = {
    'Unknown': 0,
    'Heart': 1,
    'Emerald': 2,
    'Cushion': 3,
    'Round': 4,
}

# Apply each encoding to train and then test, one column at a time.
for column, encoding in (
    ('cut', cut_change),
    ('color', color_change),
    ('clarity', clarity_change),
    ('shape', shape),
):
    train[column] = train[column].map(encoding)
    test[column] = test[column].map(encoding)



In [13]:
# Feature frame = train without the target column. The axis is spelled out
# through `columns=` because passing it positionally (train.drop(to_del, 1))
# is deprecated in pandas 1.3 and rejected by pandas 2.0.
to_del = ['price']
train_df = train.drop(columns=to_del)
train_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,shape
0,1.21,3,0,3,62.4,58.0,6.83,6.79,4.25,4
1,0.32,2,2,3,63.0,57.0,4.35,4.38,2.75,4
2,0.71,0,3,4,65.5,55.0,5.62,5.53,3.65,4
3,0.41,1,6,2,63.8,56.0,4.68,4.72,3.00,4
4,1.02,4,3,2,60.5,59.0,6.55,6.51,3.95,1
...,...,...,...,...,...,...,...,...,...,...
40450,1.34,4,3,4,62.7,57.0,7.10,7.04,4.43,4
40451,2.02,1,4,1,57.1,60.0,8.31,8.25,4.73,4
40452,1.01,4,2,2,62.7,56.0,6.37,6.42,4.01,4
40453,0.33,4,0,4,61.9,54.3,4.45,4.47,2.76,1


In [14]:
# Every column left in train_df is used as a model input; 'price' is the target.
features = train_df.columns.tolist()
target = 'price'

## Predict the price - Machine Learning

In [15]:
# Preprocessing applied to every feature column: impute missing values, then
# standardise. Note that `features` also contains the integer-encoded label
# columns, so they go through this numeric branch as well.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, features),
])

# Full model: preprocessing followed by a LightGBM regressor with default
# hyper-parameters (they are overridden by the search parameters later).
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor()),
])

In [16]:
# Search space, keyed by '<pipeline step>__<parameter>'. Every list holds a
# single value, so the space contains exactly one candidate.
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean'],
    regressor__n_estimators=[256],
    regressor__max_depth=[8],
    regressor__num_leaves=[32],
    regressor__learning_rate=[0.1],
)

In [17]:
# Randomised hyper-parameter search over `param_grid`, scored by negative RMSE
# (scikit-learn maximises scores, hence the sign).
# n_iter is only an upper bound: when the space holds fewer combinations than
# n_iter, scikit-learn evaluates each of them once (the fit log below reports
# a single candidate).
# random_state pins which candidates get sampled, so the search stays
# reproducible once the space grows beyond n_iter combinations.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 verbose=5,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=500,
                                 random_state=42)

In [18]:
# Run the cross-validated search: inputs are the feature columns of train_df,
# the target is the 'price' column of the original train frame.
grid_search.fit(train_df[features], train[target])

Fitting 5 folds for each of 1 candidates, totalling 5 fits




RandomizedSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'cut',
                                                                                'color',
                                                                                'clarity',
                                   

In [19]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'regressor__num_leaves': 32,
 'regressor__n_estimators': 256,
 'regressor__max_depth': 8,
 'regressor__learning_rate': 0.1,
 'preprocessor__num__imputer__strategy': 'mean'}

In [20]:
# Mean cross-validated score of the best candidate. It is negative because the
# search is scored with 'neg_root_mean_squared_error'; its absolute value is
# the RMSE in price units.
grid_search.best_score_

-526.8778592761441

In [21]:
# Predict prices for the test set, selecting the same feature columns that
# were used for training.
y_pred= grid_search.predict(test[features])

In [22]:
# Submission file: the test ids next to their predicted prices, written
# without the index column.
submission = test[['id']].assign(price=y_pred)
submission.to_csv('./best_one.csv', index=False)