# Import Module and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools
import os

%matplotlib inline

import SISSOkit.evaluation as evl
import SISSOkit.plot as plot
import SISSOkit.utils as utils
# Lift pandas' display limits so tables in this report are never truncated
# (None removes the cap on shown columns / rows).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
path=
cv_path=
# NOTE(review): the two assignments above are blank placeholders in this
# template and must be given values before this cell can run (a bare `path=`
# is a SyntaxError). Presumably `path` is the SISSO result directory for the
# whole data set and `cv_path` the cross-validation directory -- confirm.
# `regression` and `regressionCV` are the objects used by every later cell.
regression=evl.Regression(path)
regressionCV=evl.RegressionCV(cv_path)

# Contents of This Report

In the following, there will be the results of:
1. Baseline
1. Training over whole data set:
    1. Descriptors
    1. Training errors
1. Cross validation results
    1. Training errors
    1. Prediction errors

In the supplementary information:
1. Whole data set
1. Features
1. Operation set
1. Feature importance
1. Models

# Baseline

This section contains a table and a histogram of the baseline.

The baseline is the model that uses the mean value of the property as the prediction for every sample, i.e., prediction = mean(property).

For this model, the RMSE equals the standard deviation of the property.

In [None]:
# Bare trailing expression: the notebook renders the `baseline` attribute of
# the whole-data-set regression object as this cell's output.
regression.baseline

In [None]:
# Open a wide 14 x 7 figure, then call SISSOkit's baseline plot for the
# whole-data-set regression (50 histogram bins).
plt.figure(figsize=(14, 7))
plot.baselineplot(
    regression,
    bins=50,
    marker_y=3,
    marker_shape=500,
)

# Training over whole data set

This section contains the descriptors and the training errors obtained over the whole data set.

## Descriptors

In [None]:
# Bare trailing expression: display the `descriptors` attribute of the
# whole-data-set regression object.
regression.descriptors

Descriptors:



## Training Errors

In [None]:
# Open a 10 x 7 figure, then plot absolute errors against descriptor
# dimension for the whole-data-set fit: RMSE only, baseline displayed.
plt.figure(figsize=(10, 7))
plot.abs_errors_vs_dimension(
    regression,
    selected_errors=('RMSE',),
    display_baseline=True,
)

In [None]:
# Display the result of total_errors() for the whole-data-set fit;
# display_baseline=True requests that the baseline be included.
regression.total_errors(display_baseline=True)

# Cross Validation Results

This section contains the training errors and prediction errors of the cross validation.

When using SISSO_analysis.cross_validation to generate CV files, the whole data set is divided into two files: 'train.dat' for training and 'validation.dat' for validation.

You should run SISSO to train on each CV file, which gives you one model per CV file.

The training errors are the average training error over all CV files; the training errors of an individual CV file are the errors of its model on the corresponding 'train.dat'.

The prediction errors are the average prediction error over all CV files; the prediction errors of an individual CV file are the errors obtained when the model that SISSO found from 'train.dat' is used to predict the property of the samples in 'validation.dat'.

## Training Errors

In [None]:
# Open a 10 x 7 figure, then plot cross-validation training errors against
# descriptor dimension: RMSE only, baseline displayed.
plt.figure(figsize=(10, 7))
plot.abs_errors_vs_dimension(
    regressionCV,
    selected_errors=('RMSE',),
    display_baseline=True,
)

In [None]:
# Display the result of total_errors() for the cross-validation object
# (training errors, since `training` is left at its default);
# display_baseline=True requests that the baseline be included.
regressionCV.total_errors(display_baseline=True)

In [None]:
# Detailed error plots for the cross-validation training errors.
# NOTE(review): no plt.figure() call precedes this one, unlike the other plot
# cells -- presumably errors_details creates its own figure; confirm.
plot.errors_details(regressionCV)

In [None]:
# Open a large 20 x 10 figure, then call SISSOkit's box plot for the
# cross-validation training errors.
plt.figure(
    figsize=(20, 10),
)
plot.boxplot(
    regressionCV,
)

## Prediction Errors

In [None]:
# Display the result of total_errors() for the cross-validation object with
# training=False, i.e. the prediction errors covered by this section;
# display_baseline=True requests that the baseline be included.
regressionCV.total_errors(training=False,display_baseline=True)

In [None]:
# Open a 10 x 7 figure, then plot cross-validation prediction errors
# (training=False) against descriptor dimension: RMSE only, baseline displayed.
plt.figure(figsize=(10, 7))
plot.abs_errors_vs_dimension(
    regressionCV,
    training=False,
    selected_errors=('RMSE',),
    display_baseline=True,
)

In [None]:
# Detailed error plots for the cross-validation prediction errors
# (training=False selects prediction rather than training errors).
plot.errors_details(regressionCV,training=False)

In [None]:
# Open a large 20 x 10 figure, then call SISSOkit's box plot for the
# cross-validation prediction errors (training=False).
plt.figure(
    figsize=(20, 10),
)
plot.boxplot(
    regressionCV,
    training=False,
)

# Supplementary Information

## Whole Data Set

In [None]:
# Bare trailing expression: display the `data` attribute of the
# whole-data-set regression object.
regression.data

## Features

In [None]:
# Bare trailing expression: display the `features_name` attribute of the
# whole-data-set regression object.
regression.features_name

## Operation Set

In [None]:
# Bare trailing expression: display the `operation_set` attribute of the
# whole-data-set regression object.
regression.operation_set

## Feature Importance

Feature percent is the percentage of each feature in the top `sub_sis` 1D descriptors.

For example, if you set `sub_sis=100` in `SISSO.in` and the SISSO result shows that feature `A` appears in 30 of the top 100 (`sub_sis`) descriptors in `./feature_space/Uspace.name`, then the percentage of feature `A` is 30%.

For cross validation results, the feature percent is the average of the percentages over all CV files.

In [None]:
# Display the result of features_percent() for the whole-data-set fit.
regression.features_percent()

In [None]:
# Display the result of features_percent() for the cross-validation object.
regressionCV.features_percent()

## Models

Models:
