In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled diamonds data (it carries the "price" target) and preview it.
df = pd.read_csv(filepath_or_buffer="diamonds_data.csv")
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Load the rows to be scored for the submission (they have an "id" column and no "price").
df2 = pd.read_csv(filepath_or_buffer="diamonds_submission.csv")
df2.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.33,Ideal,H,VS1,61.8,56.0,4.41,4.46,2.74
1,1,0.41,Ideal,G,VS1,61.2,57.0,4.77,4.8,2.93
2,2,0.35,Ideal,G,IF,61.9,54.0,4.56,4.58,2.83
3,3,0.91,Ideal,F,SI2,60.9,57.0,6.27,6.3,3.83
4,4,0.9,Very Good,F,VS2,63.2,58.0,6.11,6.16,3.88


In [4]:
# Train the model a first time on an 80/20 split, then a second time on all the data

In [5]:
# Linear regression estimator with an intercept term; it is fitted further down.
linereg = LinearRegression(
    fit_intercept=True,
)

In [6]:
# Separate the target column ("price") from the remaining feature columns.
y = df["price"]
X = df.drop("price", axis=1)

In [7]:
# Preview the feature frame.
X.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [8]:
# Preview the target series.
y.head(n=5)

0    326
1    326
2    327
3    334
4    335
Name: price, dtype: int64

In [9]:
# Integer code for each "cut" label: the position of the label in this list (0-4).
# Presumably ordered from lowest to highest cut quality -- confirm against the data dictionary.
cut_grades = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
cut = {grade: code for code, grade in enumerate(cut_grades)}

In [10]:
def labeling(s, dic):
    """Return the code that ``dic`` assigns to the label ``s``.

    Parameters
    ----------
    s : hashable
        The label to look up.
    dic : mapping
        Mapping from label to code.

    Returns
    -------
    The value stored in ``dic`` under ``s``.

    Raises
    ------
    KeyError
        If ``s`` is not a key of ``dic``. The message names the offending
        label and the labels that are accepted.
    """
    try:
        return dic[s]
    except KeyError as exc:
        # Same exception type as a plain lookup, but with enough context to debug bad data.
        raise KeyError(f"unknown label {s!r}; expected one of {list(dic)}") from exc

In [11]:
# Encode the "cut" labels as the integer codes defined in `cut`.
# The dtype guard makes the cell safe to re-run: once encoded, the column is
# numeric and is left untouched (a second lookup of the codes would fail).
if not pd.api.types.is_numeric_dtype(X["cut"]):
    cut_codes = X["cut"].map(cut)  # vectorised lookup; labels missing from `cut` become NaN
    if cut_codes.isna().any():
        # Keep the fail-fast behaviour of a plain dict lookup, and name the offending labels.
        unknown = sorted(set(X.loc[cut_codes.isna(), "cut"].astype(str)))
        raise KeyError(f"unmapped cut labels: {unknown}")
    X["cut"] = cut_codes

In [12]:
# Preview the features after encoding "cut".
X.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,4,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,3,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,1,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,3,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,1,J,SI2,63.3,58.0,4.34,4.35,2.75


In [13]:
# Keep only the numeric columns; the text columns that were not encoded
# ("color", "clarity") are dropped here.
# Uses the public `select_dtypes` API instead of the private `_get_numeric_data`.
X = X.select_dtypes(include="number")
X.head()

Unnamed: 0,carat,cut,depth,table,x,y,z
0,0.23,4,61.5,55.0,3.95,3.98,2.43
1,0.21,3,59.8,61.0,3.89,3.84,2.31
2,0.23,1,56.9,65.0,4.05,4.07,2.31
3,0.29,3,62.4,58.0,4.2,4.23,2.63
4,0.31,1,63.3,58.0,4.34,4.35,2.75


In [14]:
# Engineered feature: the product of the x, y and z columns.
# Understanding what the data represents and experimenting with derived features helps reduce the error.
X = X.assign(Vol=X["x"] * X["y"] * X["z"])
X.head(n=5)

Unnamed: 0,carat,cut,depth,table,x,y,z,Vol
0,0.23,4,61.5,55.0,3.95,3.98,2.43,38.20203
1,0.21,3,59.8,61.0,3.89,3.84,2.31,34.505856
2,0.23,1,56.9,65.0,4.05,4.07,2.31,38.076885
3,0.29,3,62.4,58.0,4.2,4.23,2.63,46.72458
4,0.31,1,63.3,58.0,4.34,4.35,2.75,51.91725


In [15]:
# Discard the "table" column from the feature frame.
X = X.drop("table", axis=1)
X.head(n=5)

Unnamed: 0,carat,cut,depth,x,y,z,Vol
0,0.23,4,61.5,3.95,3.98,2.43,38.20203
1,0.21,3,59.8,3.89,3.84,2.31,34.505856
2,0.23,1,56.9,4.05,4.07,2.31,38.076885
3,0.29,3,62.4,4.2,4.23,2.63,46.72458
4,0.31,1,63.3,4.34,4.35,2.75,51.91725


In [16]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split,
# and therefore the reported RMSE, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
# First fit: training split only.
linereg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Predict prices for the held-out rows.
y_hat = linereg.predict(X=X_test)
y_hat

array([7295.86377498, 1037.30168807, 2968.82954841, ..., 5517.06432009,
       5056.63322648,  992.38002669])

In [19]:
# Root-mean-squared error of the held-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_hat)
rmse = mse ** 0.5
rmse

1489.2519952936386

In [20]:
# Having evaluated on the split, refit the same estimator on all the data.
linereg.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
# Preview the raw training frame again for reference.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [22]:
# Encode the "cut" labels of the submission rows with the same codes used for training.
# The dtype guard makes the cell safe to re-run: once encoded, the column is
# numeric and is left untouched (a second lookup of the codes would fail).
if not pd.api.types.is_numeric_dtype(df2["cut"]):
    cut_codes = df2["cut"].map(cut)  # vectorised lookup; labels missing from `cut` become NaN
    if cut_codes.isna().any():
        # Keep the fail-fast behaviour of a plain dict lookup, and name the offending labels.
        unknown = sorted(set(df2.loc[cut_codes.isna(), "cut"].astype(str)))
        raise KeyError(f"unmapped cut labels: {unknown}")
    df2["cut"] = cut_codes
df2.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.33,4,H,VS1,61.8,56.0,4.41,4.46,2.74
1,1,0.41,4,G,VS1,61.2,57.0,4.77,4.8,2.93
2,2,0.35,4,G,IF,61.9,54.0,4.56,4.58,2.83
3,3,0.91,4,F,SI2,60.9,57.0,6.27,6.3,3.83
4,4,0.9,2,F,VS2,63.2,58.0,6.11,6.16,3.88


In [23]:
# Keep only the numeric columns; the text columns that were not encoded
# ("color", "clarity") are dropped here.
# Uses the public `select_dtypes` API instead of the private `_get_numeric_data`.
df2 = df2.select_dtypes(include="number")
df2.head()

Unnamed: 0,id,carat,cut,depth,table,x,y,z
0,0,0.33,4,61.8,56.0,4.41,4.46,2.74
1,1,0.41,4,61.2,57.0,4.77,4.8,2.93
2,2,0.35,4,61.9,54.0,4.56,4.58,2.83
3,3,0.91,4,60.9,57.0,6.27,6.3,3.83
4,4,0.9,2,63.2,58.0,6.11,6.16,3.88


In [24]:
# Same feature engineering as for the training features:
# add the x*y*z product as "Vol", then discard "table".
df2 = df2.assign(Vol=df2["x"] * df2["y"] * df2["z"]).drop("table", axis=1)

In [25]:
# Preview the engineered submission features.
df2.head(n=5)

Unnamed: 0,id,carat,cut,depth,x,y,z,Vol
0,0,0.33,4,61.8,4.41,4.46,2.74,53.891964
1,1,0.41,4,61.2,4.77,4.8,2.93,67.08528
2,2,0.35,4,61.9,4.56,4.58,2.83,59.103984
3,3,0.91,4,60.9,6.27,6.3,3.83,151.28883
4,4,0.9,2,63.2,6.11,6.16,3.88,146.033888


In [26]:
# "id" is not among the columns the model was fitted on, so remove it before predicting.
df2 = df2.drop("id", axis=1)
df2.head(n=5)

Unnamed: 0,carat,cut,depth,x,y,z,Vol
0,0.33,4,61.8,4.41,4.46,2.74,53.891964
1,0.41,4,61.2,4.77,4.8,2.93,67.08528
2,0.35,4,61.9,4.56,4.58,2.83,59.103984
3,0.91,4,60.9,6.27,6.3,3.83,151.28883
4,0.9,2,63.2,6.11,6.16,3.88,146.033888


In [28]:
# Select the features by name, in the order of the columns the model was last
# fitted on (X), so the inputs line up with the fitted coefficients regardless
# of df2's column order; a missing feature raises a KeyError.
y_pred = linereg.predict(df2[X.columns])
y_pred

array([ 818.15000322, 1283.69417903,  843.1913921 , ..., 5001.61708281,
        676.57927163,  211.82455901])

In [29]:
# Number of predictions produced.
y_pred.shape[0]

13485

In [30]:
# Take the ids from the submission file itself rather than assuming they are 0..n-1.
# The "id" column was dropped from df2 before predicting, so it is re-read here;
# row order is unchanged, so ids and predictions line up positionally.
submission_ids = pd.read_csv("diamonds_submission.csv", usecols=["id"])["id"]
assert len(submission_ids) == len(y_pred), "expected exactly one prediction per submission row"
submit = pd.DataFrame({"id": submission_ids.to_numpy(), "price": y_pred})
submit.head()

Unnamed: 0,id,price
0,0,818.150003
1,1,1283.694179
2,2,843.191392
3,3,4792.434477
4,4,4151.929429


In [31]:
# Load the example submission to compare its layout with `submit`.
eje = pd.read_csv(filepath_or_buffer="submit_example.csv")
eje.head(n=5)

Unnamed: 0,id,price
0,0,707.61594
1,1,1126.893745
2,2,903.125222
3,3,4661.192597
4,4,4200.955304


In [32]:
# Write the submission file without the DataFrame index column.
submit.to_csv(path_or_buf="submit.csv", index=False)