In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('diamonds.csv')

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
inputs=df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]
inputs.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [5]:
target=df[['price']]
target.head()

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335


In [6]:
# split into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(inputs, target, train_size=0.75, random_state=100)

In [7]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40455, 9) (40455, 1)
(13485, 9) (13485, 1)


In [8]:
df_num = X_train.select_dtypes(include=['int64', 'float64'])

df_num.head()

Unnamed: 0,carat,depth,table,x,y,z
33169,0.3,61.9,54.1,4.28,4.33,2.67
53170,0.71,59.5,62.0,5.85,5.74,3.45
27846,0.3,62.2,56.0,4.27,4.31,2.67
20880,1.02,62.9,59.0,6.42,6.37,4.02
28554,0.3,60.1,61.0,4.32,4.3,2.59


In [9]:
# scaling the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_num_tra = pd.DataFrame(scaler.fit_transform(df_num), columns = df_num.columns)

df_num_tra.head()

Unnamed: 0,carat,depth,table,x,y,z
0,-1.050294,0.099874,-1.495687,-1.293573,-1.220387,-1.249413
1,-0.186302,-1.57225,2.031741,0.10509,0.003601,-0.128311
2,-1.050294,0.30889,-0.647318,-1.302482,-1.237749,-1.249413
3,0.46696,0.796592,0.692211,0.612885,0.550489,0.690955
4,-1.050294,-1.154219,1.585231,-1.257938,-1.24643,-1.364397


In [12]:
df_num_tes = pd.DataFrame(scaler.transform(df_num_test), columns = df_num_test.columns)

df_num_tes.head()

Unnamed: 0,carat,depth,table,x,y,z
0,-0.481324,-0.178813,-0.200809,-0.340344,-0.360992,-0.372654
1,0.761982,-0.178813,-1.093828,0.906871,0.932443,0.906552
2,-0.607762,1.005608,0.245701,-0.607605,-0.56933,-0.487639
3,-0.797419,-0.80586,-0.647318,-0.803596,-0.751626,-0.861339
4,0.003355,0.587577,0.245701,0.140725,0.168535,0.231016


In [13]:


y_train = pd.DataFrame(scaler.fit_transform(y_train), columns = y_train.columns)

y_train.head()

Unnamed: 0,price
0,-0.781729
1,-0.328443
2,-0.823255
3,1.29108
4,-0.817251


In [15]:
df_cat_tra = X_train.select_dtypes(include=['object'])

df_cat_tra.head()

Unnamed: 0,cut,color,clarity
33169,Ideal,D,VS1
53170,Premium,E,SI1
27846,Ideal,D,VS1
20880,Premium,F,VVS2
28554,Premium,H,VS1


In [17]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

df_cat['cut'] = df_cat_tra['cut'].apply(lambda x : cut_encoder[x])

color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

df_cat['color'] = df_cat_tra['color'].apply(lambda x : color_encoder[x])

clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

df_cat['clarity'] = df_cat_tra['clarity'].apply(lambda x : clarity_encoder[x])

df_cat.head()

Unnamed: 0,cut,color,clarity
33169,4,7,5
53170,5,6,3
27846,4,7,5
20880,5,5,6
28554,5,3,5


In [18]:
X_train = pd.merge(df_num_tra, df_cat,left_index = True, right_index= True)

X_train.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-1.050294,0.099874,-1.495687,-1.293573,-1.220387,-1.249413,4,6,2
1,-0.186302,-1.57225,2.031741,0.10509,0.003601,-0.128311,5,6,3
2,-1.050294,0.30889,-0.647318,-1.302482,-1.237749,-1.249413,2,6,5
4,-1.050294,-1.154219,1.585231,-1.257938,-1.24643,-1.364397,2,1,2
5,0.909493,0.239218,-0.691969,0.969232,0.967166,1.007163,3,1,6


In [11]:
df_num_test = X_test.select_dtypes(include=['int64', 'float64'])

df_num_test.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,0.57,61.5,57.0,5.35,5.32,3.28
21073,1.16,61.5,55.0,6.75,6.81,4.17
42161,0.51,63.2,58.0,5.05,5.08,3.2
35974,0.42,60.6,56.0,4.83,4.87,2.94
7641,0.8,62.6,58.0,5.89,5.93,3.7


In [19]:
df_cat = X_test.select_dtypes(include=['object'])

cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

df_cat['cut'] = df_cat['cut'].apply(lambda x : cut_encoder[x])

color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

df_cat['color'] = df_cat['color'].apply(lambda x : color_encoder[x])

clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

df_cat['clarity'] = df_cat['clarity'].apply(lambda x : clarity_encoder[x])

df_cat.head()

Unnamed: 0,cut,color,clarity
52264,4,6,4
21073,4,4,5
42161,4,4,3
35974,4,5,5
7641,5,4,8


In [20]:
X_test = pd.merge(df_num_tes, df_cat,left_index = True, right_index= True)

X_test

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
3,-0.797419,-0.805860,-0.647318,-0.803596,-0.751626,-0.861339,5,2,4
7,-0.207375,1.005608,1.138721,-0.099810,-0.161334,-0.013326,3,3,3
8,0.719836,-1.084547,0.245701,0.871236,0.906400,0.748448,1,6,4
21,-0.755273,0.239218,-1.093828,-0.759052,-0.716903,-0.717608,3,6,4
22,-0.797419,-0.039470,-0.200809,-0.812504,-0.812391,-0.818220,3,3,5
...,...,...,...,...,...,...,...,...,...
13461,0.572325,0.030202,-0.647318,0.755424,0.776189,0.777194,4,5,3
13464,-0.460251,-0.109141,-1.093828,-0.358162,-0.308907,-0.343908,5,4,2
13469,-0.818492,0.796592,0.245701,-0.919408,-0.855795,-0.818220,5,1,6
13475,1.520609,-0.875531,1.138721,1.557205,1.444608,1.380864,5,4,2


In [21]:
X_train

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-1.050294,0.099874,-1.495687,-1.293573,-1.220387,-1.249413,4,6,2
1,-0.186302,-1.572250,2.031741,0.105090,0.003601,-0.128311,5,6,3
2,-1.050294,0.308890,-0.647318,-1.302482,-1.237749,-1.249413,2,6,5
4,-1.050294,-1.154219,1.585231,-1.257938,-1.246430,-1.364397,2,1,2
5,0.909493,0.239218,-0.691969,0.969232,0.967166,1.007163,3,1,6
...,...,...,...,...,...,...,...,...,...
40447,0.319449,0.726921,0.692211,0.425802,0.446320,0.532851,4,5,2
40448,0.593398,0.796592,-1.093828,0.764332,0.698062,0.849059,5,5,5
40452,0.235157,0.030202,0.245701,0.452528,0.368193,0.417866,4,5,7
40453,0.951639,0.448233,0.245701,1.031593,0.967166,1.079029,4,5,7


In [22]:
from math import sqrt
distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)


IndentationError: unexpected indent (<ipython-input-22-536e9bc0b1b3>, line 3)

In [27]:
def Euclidean_Dist(df1, df2, cols=['x_coord','y_coord']):
    return np.linalg.norm(df1[cols].values - df2[cols].values,
                   axis=1)

In [28]:
df1 = pd.DataFrame({'user_id':[214,214,214],
                'x_coord':[-55.2,-55.2,-55.2],
                'y_coord':[22.1,22.1,22.1]})

df2 = pd.DataFrame({'user_id':[512, 362, 989],
                    'x_coord':[-15.2, 65.1, -84.8],
                    'y_coord':[19.1, 71.4, 13.7]})

Euclidean_Dist(df1, df2)

array([ 40.11234224, 130.0099227 ,  30.76881538])

In [29]:
X_train.columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity'], dtype='object')

In [30]:
def Euclidean_Dist(df1, df2, cols=['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity']):
    return np.linalg.norm(df1[cols].values - df2[cols].values,
                   axis=1)

In [31]:
Euclidean_Dist(X_train,X_test)

ValueError: operands could not be broadcast together with shapes (30423,9) (3435,9) 