In [1]:
from helper import export_projects, dataset_metadata, _export, download_datasets, upload_dataset, _export, _convert_dictkeys_to_snake
import datarobot as dr
import shutil
import pandas as pd
import os
import json

In [2]:
# Working directory for downloaded datasets and migration artifacts.
# The codespace does not create an empty directory during init, so create it
# here (no-op if it already exists) instead of relying on a manual step.
DIR = "migrations"
os.makedirs(DIR, exist_ok=True)

# DataRobot REST client; connection settings are resolved by dr.Client() itself.
client = dr.Client()
# Account info of the user running the migration.
me = client.get("account/info/").json()



## 1. 既存環境のプロジェクト・データセットをエクスポート

In [4]:
# Export the project list from the existing environment, then look up the
# catalog metadata for those projects.
# If a MissingSchema error is raised here, the environment variables are most
# likely not set.
original_projects = export_projects()
original_catalog_metadata = dataset_metadata(original_projects)

In [5]:
# Summary counts: all exported projects, and those that carry a catalogId.
with_catalog = sum(1 for proj in original_projects if proj["catalogId"] is not None)
print("合計プロジェクト：", len(original_projects))
print("移行者管理プロジェクト：", with_catalog)
print("-----------------------")

# One fixed-width row per project: name, id, target, creation date (YYYY-MM-DD).
for p in original_projects:
    name_col = p["projectName"].ljust(100)
    target_col = str(p["target"]).ljust(50)
    created_day = p["created"][:10]
    print(name_col, " - ", p["id"], "- ", target_col, "-", created_day)

合計プロジェクト： 80
移行者管理プロジェクト： 49
skin_lesion.zip                                                                                       -  66ab2bbb9b4a61619f66e5d2 -  None                                               - 2024-08-01
与信リスクのデータ_train.xlsx                                                                                 -  66a331cd6796f9292f7859aa -  貸し倒れ                                               - 2024-07-26
fc-reactic-gb Project [4ed8432]                                                                       -  6699f8d6dfd7e0945fe91216 -  Sales (actual)                                     - 2024-07-19
fctestgb_jul19 Project [fa4bb88]                                                                      -  6699f773824e1d611ce917e9 -  Sales (actual)                                     - 2024-07-19
skin_lesion.zip                                                                                       -  6699c1882f700a81438cae3f -  dx                                                

## 2. 移行に含めたいプロジェクトIDを以下のリストで定義する

In [31]:

# IDs of the projects to migrate; edit this list to choose the migration scope.
list_of_project_ids = [
    "66ab2bbb9b4a61619f66e5d2",
    "6699c1882f700a81438cae3f",
    "66a331cd6796f9292f7859aa",
    "6699c1152f700a81438cadf2",
]
proj_df = pd.DataFrame(original_projects)
# Keep only the rows whose id is in the list above.
filtered_df = proj_df[proj_df["id"].isin(list_of_project_ids)]

In [32]:
## Optional: uncomment the lines below to save the migration targets to files.
## NOTE(review): `projects` and `catalog_metadata_filtered` are not defined in the
## visible notebook - presumably `original_projects` and a filtered
## `original_catalog_metadata` are meant; confirm before uncommenting.

# with open(f"{DIR}/original-projects.json", "w") as f:
#     json.dump(projects, f, ensure_ascii=False, indent=4)
# catalog_metadata_filtered.to_csv(f"{DIR}/original-datasets.csv", index=False)

## 3. 次のバッチにフィルターして、Codespaceの空き容量を確認

In [88]:
# Distinct dataset file names referenced by the selected projects.
target_datasets = filtered_df["fileName"].unique()
# Show the catalog entries whose name matches one of those files.
original_catalog_metadata[original_catalog_metadata["name"].isin(target_datasets)]

Unnamed: 0_level_0,catalogId,name,datasetSize(MB),rowCount,columnCount,creationDate,createdBy,isLatestVersion,processingState,categories
_dr_df_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,6699c0d42e7ae81174e529fd,skin_lesion.zip.csv,72,10015,7,2024-07-19T01:26:44.528000Z,Greig Bloom,True,COMPLETED,"[PREDICTION, TRAINING]"
1,6699c0afa59aaeecc4adbc92,与信リスクのデータ_train.xlsx.csv,11,50000,27,2024-07-19T01:26:07.626000Z,Greig Bloom,True,COMPLETED,"[TRAINING, PREDICTION]"
5,6699c085c79a0c6975d5bce9,cal_housing.geojson.csv,1,20640,9,2024-07-19T01:25:25.805000Z,Greig Bloom,True,COMPLETED,"[TRAINING, PREDICTION]"


In [90]:
# Storage is limited to 20GB, so only part of the matching datasets is taken
# per batch. The positional slice below picks the rows for this batch; adjust
# it (or switch to .loc[[0,1,5]]-style label selection) for the next batch.
next_batch_df = (
    original_catalog_metadata
    .query("name in @target_datasets")
    .iloc[2:]
)
print(next_batch_df["name"].values)
print(next_batch_df["datasetSize(MB)"].sum())

['cal_housing.geojson.csv']
1


#### ストレージが次回ダウンロード分に対して十分な空き容量かの確認

In [91]:
# Free space (in units of 1,000,000 bytes) on the filesystem holding the
# current directory, compared against the total size of the next batch.
raw = shutil.disk_usage(".")
free_mb = int(raw.free / 1_000_000)
required_mb = next_batch_df["datasetSize(MB)"].sum()
print("空き容量: ", free_mb, "MB")
print("次回移行分に必要な空き容量確保できる：：", required_mb < free_mb)


空き容量:  20896 MB
次回移行分に必要な空き容量確保できる：： True


## 4. データセットをCodespaceのストレージにダウンロード

In [92]:
# Download the datasets of the current batch into DIR.
# NOTE(review): the third argument (4) is presumably the number of parallel
# download workers - confirm against helper.download_datasets.
downloaded_datasets = download_datasets(next_batch_df, DIR, 4)
#downloaded_datasets

All downloads completed


## 5. Codespaceのストレージにダウンロードしたデータセットを新規環境にアップロード

In [93]:
# Upload every fully downloaded dataset to the new environment and collect one
# record per upload attempt. A record always has "name"; when the upload
# response contains "catalogId", the response fields are merged into it.
# Skipped downloads and responses without a catalogId are reported instead of
# being dropped silently, so a partial batch is visible before the next step.
new_datasets = []
for cat in downloaded_datasets:
    record = {"name": cat["name"]}
    if not cat["complete"]:
        print("ダウンロード未完了のためアップロードをスキップ：", cat["name"])
        continue
    # assumes upload_dataset returns a dict-like response - TODO confirm in helper
    resp = upload_dataset(cat, DIR=DIR)
    if "catalogId" in resp:
        record = record | resp
    else:
        print("アップロード失敗（catalogIdなし）：", cat["name"], resp)
    new_datasets.append(record)

In [94]:
# Display the upload records; successfully uploaded entries carry a catalogId.
new_datasets

[{'name': 'cal_housing.geojson.csv',
  'statusId': 'd59076cd-4922-4982-9466-66b07b85eb09',
  'catalogId': '66ab3c4c2468f89d7dcc11c0',
  'catalogVersionId': '66ab3c4c2468f89d7dcc11c1'}]

## 6. 新しくアップロードしたデータセットのCatalogIDをプロジェクトデータに反映

In [95]:
# Build the list of projects to recreate in the new environment: copies of the
# original project records whose dataset belongs to this batch, with catalogId
# replaced by the id of the newly uploaded dataset.
next_batch_projects = []
next_batch_datasets = list(next_batch_df.name)
target_projects = [p for p in original_projects if p["fileName"] in next_batch_datasets]

for p in target_projects:
    # Only projects whose stage is "modeling" are migrated.
    if p["stage"] != "modeling":
        continue
    # Upload records without a catalogId (failed uploads) must not match;
    # otherwise the catalogId lookup below raises KeyError.
    matched_file = [
        ds for ds in new_datasets
        if ds["name"] == p["fileName"] and "catalogId" in ds
    ]
    if len(matched_file) == 1:
        # Copy so the original export stays untouched.
        new_record = p.copy()
        new_record["catalogId"] = matched_file[0]["catalogId"]
        next_batch_projects.append(new_record)
    else:
        # Zero or ambiguous matches: report rather than skip silently.
        print("新規catalogIdが特定できないためスキップ：", p["projectName"])
print(len(next_batch_projects))

1


#### Autopilot実行前にデータセットの登録完了（processingState）を確認

In [99]:
# Fetch the metadata of the datasets registered in the new (target) environment.
new_catalog = dataset_metadata(next_batch_projects,env="TARGET")
# Check whether dataset registration has completed (processingState) before
# starting Autopilot.
new_catalog[["name","processingState"]]

Unnamed: 0,name,processingState
0,cal_housing.geojson.csv,COMPLETED


## 7. Project / Autopilot 実行開始

In [100]:
# Recreate each project of the batch in the new environment and start modeling
# with the settings recorded in the original project export.
#
# AutopilotMode has no index 1.
# https://docs.datarobot.com/en/docs/api/reference/public-api/projects.html
#mode_map = {0:"autopilot",2:"manual",3:"quick", 4:"comprehensive"}

# Advanced-option keys that are dropped before building dr.AdvancedOptions.
adv_option_filter = ["downsampledMinorityRows", "downsampledMajorityRows","responseCap"]
TARGET_WORKERS = 8

print("こちらにてプロジェクトの進捗をご確認ください。")
# Derive the UI base URL from the API endpoint by dropping a trailing "/api/v2".
# Tolerates a trailing slash, and skips the link when the variable is unset
# instead of failing with a TypeError before any project is started.
endpoint = (os.environ.get("DATAROBOT_ENDPOINT") or "").rstrip("/")
if endpoint:
    print(f'{endpoint.removesuffix("/api/v2")}/manage-projects')

for proj in next_batch_projects:
    print("プロジェクト開始： ", proj["projectName"])
    dr.Context.use_case = None

    # Build the advanced options BEFORE creating the project, so invalid options
    # do not leave an orphan project behind in the new environment.
    # If AdvancedOptions.__init__() raises TypeError, add the offending
    # attribute to adv_option_filter above.
    adv_options = {k: v for (k, v) in proj["advancedOptions"].items() if k not in adv_option_filter}
    adv_options = _convert_dictkeys_to_snake(adv_options)
    advanced_options = dr.AdvancedOptions(**adv_options)

    # Partitioning is not migrated yet; kept for reference.
    #partition_filter = ["datetimePartitionColumn", "useTimeSeries", "datetimeCol"]
    #partition = _convert_to_snake({k:v for (k,v) in proj["partition"].items() if k not in partition_filter})
    #partition_specs = None
    #if partition["cv_method"] == "datetime":
    #    print("This is a dateteime; instantiate object!")
    #    partition_specs = dr.DatetimePartitioningSpecification(**proj["partition"])
    #new_proj = new_proj.set_partitioning_method(**partition)

    try:
        # Both API calls are guarded so one failing project does not abort the
        # rest of the batch.
        new_proj = dr.Project.create_from_dataset(
            dataset_id=proj["catalogId"],
            project_name=proj["projectName"],
        )
        new_proj = new_proj.analyze_and_model(
            target=proj["target"],
            mode=proj["autopilotMode"],
            target_type=proj["targetType"],
            unsupervised_mode=proj["unsupervisedMode"],
            unsupervised_type=proj["unsupervisedType"],
            metric=proj["metric"],
            positive_class=proj["positiveClass"],
            advanced_options=advanced_options,
            worker_count=TARGET_WORKERS,
            #partitioning_method=part_specs
            # Not yet verified:
            #holdout_unlocked = proj["holdoutUnlocked"],
        )
    except dr.errors.ClientError as e:
        print("エラー発生：, ", e)

こちらにてプロジェクトの進捗をご確認ください。
https://app.jp.datarobot.com/manage-projects
プロジェクト開始：  cal_housing.geojson


### 次回のバッチがある場合は移行済ファイルをStorageから削除した上で、

### ステップ３に戻り、次のバッチ範囲を決めて実行してください

In [87]:
# Remove the migrated dataset files of this batch from DIR to free storage.
# A missing file (cell re-run, or a dataset that was never downloaded) is
# reported and skipped, so the cell is safe to run more than once.
for file in next_batch_datasets:
    try:
        os.remove(os.path.join(DIR, file))
    except FileNotFoundError:
        print("削除対象なし（スキップ）：", file)
