# Import Modules and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools
import os

%matplotlib inline

import SISSO_analysis.evaluation as evl
import SISSO_analysis.plot as plot
import SISSO_analysis.utils as utils
# Lift pandas' display limits so that tables are rendered in full
# (None disables the column / row truncation).
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [None]:
# TODO: the two assignments below are unfilled placeholders; this cell is not
# valid Python until a value is supplied for each of them.
# `path` is handed to evl.Regression and `cv_path` to evl.RegressionCV
# (presumably the locations of the whole-data-set run and of the
# cross-validation runs respectively -- confirm against SISSO_analysis).
path=
cv_path=
# ST: results of the training over the whole data set (used by every "ST." cell below).
ST=evl.Regression(path)
# ST_cv: cross-validation results (used by every "ST_cv." cell below).
ST_cv=evl.RegressionCV(cv_path)

# Contents of This Report

This report presents the following results:
1. Baseline
1. Training over whole data set:
    1. Descriptors
    1. Training errors
1. Cross validation results
    1. Training errors
    1. Prediction errors

In the supplementary information:
1. Whole data set
1. Features
1. Operation set
1. Feature importance

# Baseline

The baseline is the model that predicts the mean value of the property for every sample, i.e., prediction = mean(property).

For this model, the RMSE equals the standard deviation of the property.

In [None]:
# Display the baseline of the whole-data-set run. Later cells draw
# ST.baseline[1] as the dashed 'Baseline' line in the RMSE plots.
ST.baseline

In [None]:
# Open a wide canvas, then let the plotting helper draw the baseline figure for ST.
plt.figure(figsize=(14, 7))
plot.baselineplot(
    ST,
    bins=50,
    marker_y=3,
    marker_shape=500,
)

# Training over whole data set

## Descriptors

In [None]:
# Display the descriptors stored on the whole-data-set Regression object
# (presumably the ones SISSO selected for each dimension -- confirm).
ST.descriptors

Models:



## Training Errors

In [None]:
# RMSE of the whole-data-set training, drawn by the abs_errors_vs_dimension helper.
plt.figure(figsize=(10, 7))
plot.abs_errors_vs_dimension(ST, selected_errors=('RMSE',))

# Dashed horizontal reference line at ST.baseline[1], spanning x = 1 to x = 5.
baseline_level = ST.baseline[1]
plt.plot([1, 5], [baseline_level, baseline_level], '--', label='Baseline')
plt.legend()

In [None]:
# Error table of the whole-data-set training; display_baseline=True
# presumably adds the baseline for comparison -- confirm.
ST.total_errors(display_baseline=True)

# Cross Validation Results

When SISSO_analysis.cross_validation is used to generate CV files, the whole data set is divided into two files: 'train.dat' for training and 'validation.dat' for cross validation.

You should use SISSO to train on each CV file; this gives you one model for each CV file.

Training errors are the training errors averaged over all CV files; the training error of each CV file is the error of its model on the corresponding 'train.dat'.

Prediction errors are the prediction errors averaged over all CV files; the prediction error of each CV file is the error made when the model that SISSO found on the basis of 'train.dat' is used to predict the property of the samples in 'validation.dat'.

## Training Errors

In [None]:
# Cross-validation training RMSE, drawn by the abs_errors_vs_dimension helper.
plt.figure(figsize=(10, 7))
plot.abs_errors_vs_dimension(
    ST_cv,
    selected_errors=('RMSE',),
    label='rung = 2',
)

# Dashed reference line at the whole-data-set baseline, ST.baseline[1], from x = 1 to x = 5.
reference_y = ST.baseline[1]
plt.plot([1, 5], [reference_y, reference_y], '--', label='Baseline')
plt.legend()

In [None]:
# Error table of the cross-validation runs with the default `training` setting
# (this section reports training errors); display_baseline=True presumably
# adds the baseline for comparison -- confirm.
ST_cv.total_errors(display_baseline=True)

## Prediction Errors

In [None]:
# Same table with training=False, which in this section is used to report
# the prediction errors instead of the training errors.
ST_cv.total_errors(training=False,display_baseline=True)

In [None]:
# Cross-validation RMSE with training=False (the prediction errors of this section).
plt.figure(figsize=(10, 7))
plot.abs_errors_vs_dimension(
    ST_cv,
    training=False,
    selected_errors=('RMSE',),
    label='rung = 2',
)

# Dashed reference line at the whole-data-set baseline, ST.baseline[1], from x = 1 to x = 5.
baseline_y = ST.baseline[1]
plt.plot([1, 5], [baseline_y, baseline_y], '--', label='Baseline')
plt.legend()

In [None]:
# One row of three panels for each index 1..5 passed to the plotting helpers:
# error histogram | property vs. prediction | histogram with box plot.
# All panels use training=False.
plt.figure(figsize=(20, 30))
n_rows, n_cols = 5, 3
for dim in range(1, n_rows + 1):
    # Subplot positions are 1-based; this is the left-most panel of the row.
    row_start = (dim - 1) * n_cols + 1

    plt.subplot(n_rows, n_cols, row_start)
    plot.error_hist(dim, ST_cv, abs=False, training=False, rwidth=0.8)

    plt.subplot(n_rows, n_cols, row_start + 1)
    plot.property_vs_prediction(dim, ST_cv, training=False)

    plt.subplot(n_rows, n_cols, row_start + 2)
    plot.hist_and_box_plot(
        dim,
        ST_cv,
        training=False,
        bins=20,
        alpha=0.5,
        rwidth=0.8,
        marker_x=10,
    )

In [None]:
# Box plot of the cross-validation results with training=False, on a wide canvas.
plt.figure(figsize=(20, 10))
plot.boxplot(
    ST_cv,
    training=False,
)

# Supplementary Information

## Whole Data Set

In [None]:
# Display the data held by the whole-data-set Regression object. The pandas
# display limits were set to None in the import cell; if this is a DataFrame
# it is therefore shown without truncation.
ST.data

## Features

In [None]:
# Display the feature names stored on the whole-data-set Regression object.
ST.features_name

## Operation Set

In [None]:
# Display the operation set stored on the whole-data-set Regression object.
ST.operation_set

## Feature Importance

In [None]:
# Transpose the features_percent() table, then sort it by its 'percent'
# column, largest values first.
feature_share = ST.features_percent().T
feature_share.sort_values('percent', ascending=False)