# Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Read dataset

In [2]:
diamonds_test = pd.read_csv('data/diamonds.csv')
diamonds_rick = pd.read_csv('data/rick_diamonds.csv')
diamonds_test

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
1,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
2,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
4,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
48935,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58
48936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
48937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
48938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56


# Data Cleaning 

In [3]:
# removendo as colunas de x,y,z que estiverem todas zeradas
diamonds_test = diamonds_test[~((diamonds_test['x'] == 0) & (diamonds_test['y'] == 0) & (diamonds_test['z'] == 0))]

In [4]:
# remoção das colunas com x =0, como todas nessas colunas y ou z também era =0
diamonds_test = diamonds_test[~(diamonds_test['x'] == 0)]

In [5]:
# não havia colunas y = 0, logo remoção desnecessária
diamonds_test[diamonds_test['y'] == 0]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z


In [6]:
# completando as informações perdidas quando o z = 0
diamonds_test.loc[diamonds_test['z']==0 ,'z'] = (diamonds_test['depth']*(diamonds_test['x']+diamonds_test['y']))/2

In [7]:
# buscando valores faltando em outras colunas
diamonds_test[diamonds_test['price'] == 0]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z


In [8]:
diamonds_test[diamonds_test['table'] == 0]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z


In [9]:
diamonds_test['clarity'].unique()

array(['SI1', 'VS1', 'VS2', 'SI2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [10]:
diamonds_test['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [11]:
diamonds_test['cut'].unique()

array(['Premium', 'Good', 'Very Good', 'Ideal', 'Fair'], dtype=object)

# Exploratory Analysis

In [12]:
diamonds_test.corr()

Unnamed: 0,carat,depth,table,price,x,y,z
carat,1.0,0.027063,0.181726,0.922207,0.977852,0.951945,0.127266
depth,0.027063,1.0,-0.296986,-0.012077,-0.025812,-0.029845,0.003506
table,0.181726,-0.296986,1.0,0.127854,0.196061,0.184042,0.020788
price,0.922207,-0.012077,0.127854,1.0,0.887904,0.866568,0.112812
x,0.977852,-0.025812,0.196061,0.887904,1.0,0.972708,0.12441
y,0.951945,-0.029845,0.184042,0.866568,0.972708,1.0,0.120974
z,0.127266,0.003506,0.020788,0.112812,0.12441,0.120974,1.0


# Data transformation

# Modeling

In [13]:
X = diamonds_test[['carat']]
y = diamonds_test['price']

model = LinearRegression()
model.fit(X,y)

price_pred = model.predict(diamonds_rick[['carat']])
diamonds_rick['price_predicted'] = price_pred

# To CSV

In [14]:
diamonds_rick.to_csv('testes_price/price_pred', index=False)
diamonds_rick

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price_predicted
0,0.91,Very Good,H,SI2,63.4,57.0,6.10,6.18,3.89,4804.544888
1,0.38,Good,F,VS2,60.3,61.0,4.68,4.71,2.83,692.545233
2,0.52,Ideal,H,VS2,61.8,56.0,5.19,5.16,3.20,1778.733821
3,1.10,Ideal,G,VS1,61.3,56.0,6.65,6.69,4.09,6278.657971
4,2.28,Ideal,G,SI2,61.6,57.0,8.44,8.37,5.18,15433.676069
...,...,...,...,...,...,...,...,...,...,...
4995,0.30,Very Good,D,SI2,62.2,59.0,4.25,4.30,2.66,71.866040
4996,0.51,Ideal,F,VVS2,61.9,56.0,5.09,5.14,3.16,1701.148922
4997,1.01,Very Good,F,SI1,60.6,60.0,6.37,6.43,3.88,5580.393879
4998,0.31,Ideal,F,VVS1,62.1,56.0,4.35,4.38,2.71,149.450939


# Results


In [15]:
# erro usando a média foi 3980.71
# erro usando a mediana foi 4255.53
# erro usando a primeira modelagem foi 1605.15
# erro após a limpeza dos dados foi  1605.13