In [10]:
from sources.rescaling import standardizing_df
from sources.outliers import outliers_info, get_outliers_quantile
from sources.linear_model import linear_model, ridge_model, lasso_model
from sources.linear_metrics import linear_metrics, save_linear_metrics
from sources.feature_crossing import product_crossing
from sources.feature_selection import filter_categorical_feature, filter_numeric_feature
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np

In [11]:
# Location of the training CSV, relative to this notebook.
path = '../diamonds-datamad1020/train.csv'

# Read the file using its first column as the index and build the working
# frame in one chained expression: the 'x', 'y' and 'z' columns are removed
# without mutating an intermediate frame, so re-running the cell is idempotent.
df_diamond = pd.read_csv(path, index_col=[0]).drop(columns=['x', 'y', 'z'])

In [12]:
# 'price' is the regression target; every remaining column is a predictor.
y = df_diamond['price']
X = df_diamond.drop(columns=['price'])

## Separating the categorical values

In [13]:
# Partition the predictors by dtype: numeric columns on one side, everything
# else (treated as categorical below) on the other.
X_numer = X.select_dtypes(include='number')
X_categ = X.select_dtypes(exclude='number')

In [14]:
# One-hot encode the non-numeric predictors (one indicator column per level).
X_categ_dum = pd.get_dummies(data=X_categ)

## Rejoining the data, numerical and categorical

In [15]:
# Final design matrix: numeric predictors followed by the indicator columns,
# joined on the shared row index.
X = X_numer.join(X_categ_dum)

---
# Let's split the data and train the models

In [16]:
# Hold out 20% of the rows for evaluation.
# The split is seeded so that Restart & Run All reproduces the same partition
# and therefore the same metrics; without a fixed random_state every run
# shuffles differently and the numbers reported below cannot be compared.
RANDOM_STATE = 42  # arbitrary fixed seed; change it to draw a different split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.20, random_state=RANDOM_STATE
)

## linear_model

In [17]:
# Fit the project's linear model on the training split; it hands back
# predictions for the held-out rows and for the training rows.
y_pred, y_pred_train = linear_model(X_train, X_test, y_train)

# Score both sets of predictions so test and train error can be compared
# side by side (a large gap would point to over-fitting).
metrics_test = linear_metrics(y_test, y_pred, model='linear_model_1')
metrics_train = linear_metrics(y_train, y_pred_train, model='linear_model_1')

# Displayed as a (test, train) tuple.
metrics_test, metrics_train

({'model': 'linear_model_1',
  'r2_score': 0.9166761669603564,
  'mean_squared_error': 1335667.6461366513,
  'root_mean_squared_error': 1155.7108834551361,
  'mean_absolute_error': 805.234935496566},
 {'model': 'linear_model_1',
  'r2_score': 0.9163578524511223,
  'mean_squared_error': 1345686.2905530015,
  'root_mean_squared_error': 1160.0371936076021,
  'mean_absolute_error': 807.7658726406785})

## ridge_model

In [18]:
# Regularisation strengths to try for the ridge model.
alphas = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 2, 5, 10]

# For each strength, print: alpha, test RMSE, train RMSE.
for alpha in alphas:
    y_pred, y_pred_train = ridge_model(X_train, X_test, y_train, alpha=alpha)
    rmse_test = linear_metrics(y_test, y_pred, model='ridge_model_1')['root_mean_squared_error']
    # NOTE(review): the train metrics are labelled 'ridge_model_2' while the
    # test metrics use 'ridge_model_1' -- looks like a typo, but only the RMSE
    # value is read here so the label is kept as-is; confirm before changing.
    rmse_train = linear_metrics(y_train, y_pred_train, model='ridge_model_2')['root_mean_squared_error']
    print(alpha, rmse_test, rmse_train)

0.001 1155.7108383895331 1160.0371936085076
0.01 1155.7104328850874 1160.037193698128
0.1 1155.706386350706 1160.0372026577493
0.25 1155.6996764869823 1160.0372501456586
0.5 1155.688588695432 1160.0374195909242
0.75 1155.677619831287 1160.03770169057
1 1155.6667696460136 1160.0380961924654
2 1155.6245507460408 1160.0407932043122
5 1155.509106743041 1160.059491867841
10 1155.353159682823 1160.1250872810338


---
# Training on the full dataset

In [21]:
# Load the rows to predict and apply the same preprocessing as the training
# data: index from the first column, 'x'/'y'/'z' removed.
X_pred = (
    pd.read_csv('../diamonds-datamad1020/predict.csv', index_col=[0])
    .drop(columns=['x', 'y', 'z'])
)

# Use prediction-specific names for the intermediates so the training-stage
# variables (X_numer, X_categ, X_categ_dum) are not overwritten; otherwise
# re-running the earlier "rejoin" cell would silently rebuild X from the
# prediction data.
X_pred_numer = X_pred.select_dtypes(include=[np.number])
X_pred_categ = X_pred.select_dtypes(exclude=[np.number])
X_pred_categ_dum = pd.get_dummies(X_pred_categ)

X_pred = X_pred_numer.join(X_pred_categ_dum)

# The dummies above are derived from the prediction file alone, so a category
# level missing from (or extra in) this file would give a different column set
# or order than the training matrix. Align explicitly to X's columns: absent
# indicator columns are added as all-zero, unknown ones are dropped, and the
# order matches what the model is fitted on.
X_pred = X_pred.reindex(columns=X.columns, fill_value=0)

# Refit on every labelled row (X, y) and predict the unlabelled rows.
# y_pred_train holds the second value returned by linear_model (the
# training-set predictions in the earlier cell's usage).
y_pred, y_pred_train = linear_model(X, X_pred, y)

In [23]:
# Build the submission frame: one 'price' per prediction row, indexed like
# X_pred. rename_axis returns a frame with a freshly named index instead of
# assigning to solution.index.name, because the Index object handed to the
# DataFrame constructor can be shared with X_pred and renaming it in place
# would then also rename X_pred's index.
solution = (
    pd.DataFrame(y_pred, index=X_pred.index, columns=['price'])
    .rename_axis('id')
)
solution

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,845.034306
1,7176.182701
2,553.202888
3,2288.332250
4,1033.428498
...,...
13480,-1715.144562
13481,4522.510531
13482,1360.112266
13483,-87.742908


In [20]:
# solution.to_csv('../outputs/test_1.csv')