<a href="https://colab.research.google.com/github/chemoinformatics-lecture/lecture-beginner/blob/main/lesson07_automl/lecture_note_regression_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 行うこと

これまでの授業の復習として以下のことを行ってください。
1. データをダウンロードする
2. Pubchem,RDKit,MOPACなどを用いて説明変数を作成する。
3. 機械学習を用いて、テストデータの予測を行う。
4. テストデータの予測値を課題提出場所に提出してください｡

# pycaretの関数

1. データの前処理：　setup()
2. モデルの比較： compare_models()
3. 分析モデルの生成： create_model()
4. チューニング： tune_model()
5. 可視化： plot_model()
6. 評価： evaluate_model()
7. 予測： finalize_model(), predict_model()

# pycaretのインストール

In [1]:
!pip install scikit-learn==0.23.2 --force-reinstall
!pip install pycaret==2.2.0
!pip install pandas-profiling==3.1.0
# pandas-profilingのversionを落とさないと次のエラーが出る。
# PyCaret ImportError: Missing optional dependency 'Jinja2'
# https://qiita.com/hasegatk/items/c835ba6243efbb4e53d2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting scipy>=0.19.1
  Using cached scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
Collecting numpy>=1.13.3
  Using cached numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
Installing collected packages: numpy, threadpoolctl, scipy, joblib, scikit-learn
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.6
    Uninstalling numpy-1.21.6:
      Successfully uninstalled numpy-1.21.6
  Attempting uninstall: threadpoolctl
    Found existing installation: threadpoolctl 3.1.0
    Uninstalling threadpoolctl-3.1.0:
      Successfully u

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn>=0.23.2
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[K     |████████████████████████████████| 24.8 MB 84.9 MB/s 
Collecting joblib
  Using cached joblib-1.0.1-py3-none-any.whl (303 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.2.0
    Uninstalling joblib-1.2.0:
      Successfully uninstalled joblib-1.2.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
Successfully installed joblib-1.0.1 scikit-learn-1.0.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 0. ライブラリーのインポート

In [2]:
import warnings
# 不要な警告文非表示
warnings.filterwarnings("ignore")


In [3]:
import pandas as pd
pd.__version__ # 1.3.5が表示

'1.3.5'

In [4]:
import pycaret
print(pycaret.__version__)  # 2.2.0が表示

2.2.0


In [5]:
import sklearn
print(sklearn.__version__) # 1.0.2が表示

1.0.2


# 1. データのセットアップ

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
dataset = pd.read_csv("/content/drive/MyDrive/data/soac/soac_example.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,SMILES,_Name,MaxEStateIndex,MinEStateIndex,qed,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,FpDensityMorgan3,...,VSA_EState9,FractionCSP3,MolLogP,MolMR,formation_HEAT,HOMO,LUMO,LUMO-HOMO,DIPOLE,LN(SOAC)
0,0,O=C1C=CC(=O)C=C1,1_4_benzoquinone,10.282778,-0.120741,0.416681,0.178413,-0.289961,0.75,1.25,...,0.0,0.0,0.2506,28.294,-28.37194,-10.556,-2.002,8.554,0.00018,-1.07881
1,1,Cc1c(C)c2c(c(C)c1O)CC(C)(C)O2,2_3_dihydro_22467_pentamethylbenzofuran_5_ol,9.96595,-0.140787,0.706652,0.126677,-0.507381,1.066667,2.266667,...,0.0,0.538462,3.03096,60.7288,-101.68499,-8.047,0.352,8.399,2.57703,0.538997
2,2,COc1c(O)cc(C)c(O)c1OC,2_3_dimethoxy_5_methyl_1_4_benzenediol,9.517824,-0.04662,0.682324,0.206777,-0.504146,1.076923,2.076923,...,2.794444,0.333333,1.42342,47.6126,-147.22279,-8.498,-0.142,8.356,2.5089,-0.814508
3,3,Cc1c(O)ccc(O)c1C,2_3_dimethyl_1_4_benzenediol,9.113426,0.231759,0.535657,0.118517,-0.507672,0.9,1.5,...,0.0,0.25,1.71464,39.2456,-81.91756,-8.523,0.027,8.55,1.27077,-0.90446
4,4,CC(C)(C)c1cc(C(C)(C)C)c(O)c(C(C)(C)C)c1,2_4_6_tritertbutylphenol,10.650222,-0.050255,0.6737,0.122542,-0.507341,0.631579,1.157895,...,0.0,0.666667,5.2847,84.2068,-91.82528,-8.48,0.472,8.952,1.63362,-5.46383


In [8]:
from pycaret.regression import *
exp1 = setup(dataset, target = 'LN(SOAC)', silent=True, fold_shuffle=True, session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,LN(SOAC)
2,Original Data,"(74, 65)"
3,Missing Values,False
4,Numeric Features,62
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(51, 112)"


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 0


AttributeError: ignored

# pandas_profiling　重いです。

In [None]:
import pandas_profiling
profile = dataset.profile_report()

In [None]:
profile

In [None]:
profile.to_file("myoutputfile.html")
# このファイルのままだと、一時保存場所なので、時間が経つと消えます。
# ファイルの保存場所をdriveの中にすれば、google driveに保存される。

# 2. モデルの比較： compare_models()

In [None]:
compare_models()

In [None]:
plot_model('xgboost')

# 3. 分析モデルの生成： create_model()

In [None]:
lr = create_model('rf')

# 4. チューニング： tune_model()

In [None]:
tuned_lr = tune_model(lr)

# 5. 可視化： plot_model()

In [None]:
plot_model(tuned_lr)

# 6. 評価： evaluate_model()

In [None]:
evaluate_model(tuned_lr)

# 7. 予測： finalize_model(), predict_model()

In [None]:
final_lr = finalize_model(tuned_lr)
unseen_predictions = predict_model(final_lr)
unseen_predictions.head()

In [None]:
type(unseen_predictions)

In [None]:
unseen_predictions[['medv', 'Label']]

# 以上です。