In [2]:
from sources.rescaling import standardizing_df
from sources.outliers import outliers_info, get_outliers_quantile
from sources.linear_model import linear_model, ridge_model, lasso_model
from sources.linear_metrics import linear_metrics, save_linear_metrics
from sources.feature_crossing import product_crossing
from sources.feature_selection import filter_categorical_feature, filter_numeric_feature
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np

In [3]:
path = '../diamonds-datamad1020/train.csv'
df_diamond = pd.read_csv(path, index_col=[0])

df_diamond.drop(axis=1, columns=['x','y','z'], inplace=True)

In [4]:
X = df_diamond.drop(axis=1, columns=['price'])
y = df_diamond['price']

## Separating the data in numerical and categorical values

In [5]:
X_numer = X.select_dtypes(include=[np.number])
X_categ = X.select_dtypes(exclude=[np.number])

## On numric data
- ### Standardizing!

In [6]:
X_numer_std = standardizing_df(X_numer)

## On categorical data
- ### get dummies

In [7]:
X_categ_dum = pd.get_dummies(X_categ)

## Rejoining the data, numerical and categorical

In [8]:
X = X_numer_std
X = X.join(X_categ_dum)

## Performing the feature cross!

In [9]:
X_synth = product_crossing(X)

---
# Let's split the data and train the models

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_synth, y, test_size=.20)

## linear_model

In [11]:
y_pred, y_pred_train = linear_model(X_train, X_test, y_train)
linear_metrics(y_test, y_pred, model='linear_model_2'), linear_metrics(y_train, y_pred_train, model='linear_model_2')

({'model': 'linear_model_2',
  'r2_score': 0.9657497253210807,
  'mean_squared_error': 545124.9205498548,
  'root_mean_squared_error': 738.3257550362541,
  'mean_absolute_error': 469.25739288534226},
 {'model': 'linear_model_2',
  'r2_score': 0.9623568814779923,
  'mean_squared_error': 606667.8152730138,
  'root_mean_squared_error': 778.8888337067195,
  'mean_absolute_error': 486.4582102687131})

## ridge_model

In [12]:
alphas = [0.001,0.01,0.1,0.25,0.5,0.75,1,2,5,10]
for a in alphas:
    y_pred, y_pred_train = ridge_model(X_train, X_test, y_train, alpha=a)
    print(a, linear_metrics(y_test, y_pred, model='ridge_model_2')['root_mean_squared_error'], linear_metrics(y_train, y_pred_train, model='ridge_model_2')['root_mean_squared_error'])

0.001 738.3260131976393 778.8888337258262
0.01 738.3283379141947 778.8888356158161
0.1 738.3517089444427 778.8890230619735
0.25 738.3911486018841 778.8900014205976
0.5 738.4581719328935 778.8934040445705
0.75 738.5267091048532 778.8989024861991
1 738.5966555216495 778.9063713439917
2 738.8887564253936 778.9538205284182
5 739.8490589834702 779.2236493208626
10 741.5921146367676 779.9289707482434


---
# Training for all dataset

In [13]:
X_pred = pd.read_csv('../diamonds-datamad1020/predict.csv', index_col=[0])
X_raw = pd.read_csv('../diamonds-datamad1020/predict.csv', index_col=[0])

X_pred.drop(axis=1, columns=['x', 'y', 'z'], inplace=True)

X_numer = X_pred.select_dtypes(include=[np.number])
X_categ = X_pred.select_dtypes(exclude=[np.number])

X_numer_std = standardizing_df(X_numer)
X_categ_dum = pd.get_dummies(X_categ)

X_pred = X_numer_std
X_pred = X_pred.join(X_categ_dum)

X_pred = product_crossing(X_pred)

y_pred, y_pred_train = linear_model(X_synth, X_pred, y)

In [19]:
solution = pd.DataFrame(y_pred, index=X_pred.index, columns=['price'])
solution.index.name = 'id'
# solution

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,1039.396470
1,6791.766082
2,362.630527
3,1833.805053
4,1116.203174
...,...
13480,603.128394
13481,3793.572613
13482,972.238406
13483,224.060792


In [20]:
# solution.to_csv('../outputs/test_2.csv')