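## Summary

Check which per-task output files (`{prefix}-{i}-of-{TASK_COUNT}.parquet`) of a model run are present on disk, and build comma-separated lists of the missing task ids so they can be resubmitted as `sbatch --array` jobs.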

---

## Imports

In [None]:
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
from tqdm.auto import tqdm

## Parameters

In [None]:
# Absolute path for "31_run_protbert", resolved against the current working
# directory. The cells below only use NOTEBOOK_DIR.parent, to reach sibling
# notebook directories.
NOTEBOOK_DIR = Path("31_run_protbert").resolve()
# Directory creation is currently disabled.
# NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Dataset identifier; also used as the sub-directory name under each model
# notebook's output directory.
DATASET_NAME = "cagi6-sherloc"

# Parquet file with the input data, located in a sibling notebook directory.
DATASET_PATH = str(
    NOTEBOOK_DIR.parent / "30_cagi6_sherloc" / "input-data-gby-protein.parquet"
)

# Model run whose outputs are inspected: keep exactly one line uncommented.
# MODEL_NOTEBOOK_NAME = "31_run_proteinsolver"
# MODEL_NOTEBOOK_NAME = "31_run_protbert"
# MODEL_NOTEBOOK_NAME = "31_run_alphafold"
MODEL_NOTEBOOK_NAME = "31_run_rosetta_ddg"
# MODEL_NOTEBOOK_NAME = "31_run_msa_analysis"

(DATASET_NAME, DATASET_PATH, MODEL_NOTEBOOK_NAME)

In [None]:
# Total number of array tasks. A later cell asserts that this equals the
# number of row groups in the dataset parquet file.
TASK_COUNT = 4182

TASK_COUNT

## Workspace

In [None]:
# Output files are named "result-*" when the model notebook name contains
# "msa", and "shard-*" otherwise.
prefix = "result" if "msa" in MODEL_NOTEBOOK_NAME else "shard"

In [None]:
# Split the expected per-task output files of the selected model run into
# those that exist on disk and those that do not. Every entry is a
# (1-based task id, path) tuple.
output_dir = NOTEBOOK_DIR.parent.joinpath(MODEL_NOTEBOOK_NAME, DATASET_NAME)

present_files = []
missing_files = []
for i in tqdm(range(1, TASK_COUNT + 1)):
    path = output_dir.joinpath(f"{prefix}-{i}-of-{TASK_COUNT}.parquet")
    if path.is_file():
        present_files.append((i, path))
    else:
        missing_files.append((i, path))

In [None]:
# (number of task outputs found on disk, number still missing)
len(present_files), len(missing_files)

| model | finished | running |
| - | - | - |
| proteinsolver | 4182 | 0 |
| protbert | 4121 | 61 |
| alphafold | 2609 | 1573 (775) |
| rosetta | 2507 | 1675 |
| msa | 0 | 0 |

In [None]:
",".join([str(m[0]) for m in missing_files])

In [None]:
# Open the dataset parquet file. Task ids map onto its row groups
# (task id N reads row group N - 1), so the counts must agree.
pfile = pq.ParquetFile(DATASET_PATH)

assert TASK_COUNT == pfile.num_row_groups, (
    f"TASK_COUNT ({TASK_COUNT}) does not match the number of row groups "
    f"({pfile.num_row_groups}) in {DATASET_PATH}"
)

In [None]:
# For every task whose model output is missing, check whether the wild-type
# AlphaFold output for that task exists. Only the first missing task seen for
# each protein is considered; later tasks with the same protein are skipped.
#
# NOTE: despite its name, `missing_uniprots` holds every protein id already
# examined in this loop, whether or not its wild-type output was found.
missing_uniprots = set()
# (task id, expected wild-type output path) for tasks whose wild-type output
# does not exist yet.
missing_files_for_wt = []

wt_output_dir = NOTEBOOK_DIR.parent.joinpath("31_run_alphafold_wt", DATASET_NAME)

for task_id, _ in tqdm(missing_files):
    # Task ids are 1-based, row groups are 0-based. `.item()` requires the
    # row group to contain exactly one protein_id value.
    uniprot_id = (
        pfile.read_row_group(task_id - 1, columns=["protein_id"])
        .to_pandas()["protein_id"]
        .item()
    )
    if uniprot_id in missing_uniprots:
        continue
    missing_uniprots.add(uniprot_id)

    # Build the path from the current task id. The previous version used the
    # stale index `i` left over from the earlier scan loop, so every task
    # checked the same file.
    wt_path = wt_output_dir.joinpath(f"{prefix}-{task_id}-of-{TASK_COUNT}.parquet")
    if not wt_path.is_file():
        missing_files_for_wt.append((task_id, wt_path))

In [None]:
# Number of tasks whose wild-type output still has to be generated.
len(missing_files_for_wt)

In [None]:
",".join([f"{t[0]}" for t in missing_files_for_wt])

```bash
export NOTEBOOK_PATH="$(realpath 31_run_protbert.ipynb)"
export DATASET_NAME="cagi6-sherloc"
export DATASET_PATH="30_cagi6_sherloc/input-data-gby-protein.parquet"
export ORIGINAL_ARRAY_TASK_COUNT=4182

sbatch --export DATASET_NAME,DATASET_PATH,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --time 24:00:00 --ntasks-per-node=24 --mem=60G --array=5,20,30,68,93,94,97,202,263,266,281,284,294,306,309,350,359,391,409,435,465,473,486,535,582,616,658,693,704,705,706,739,756,861,951,975,992,1036,1038,1053,1098,1101,1106,1141,1191,1206,1223,1288,1322,1361,1364,1382,1399,1404,1427,1473,1502,1520,1555,1646,1667,1668,1691,1692,1718,1727,1731,1751,1758,1764,1842,1857,1917,1935,1964,1981,1985,2024,2036,2053,2054,2058,2103,2140,2143,2165,2175,2229,2318,2340,2353,2417,2433,2526,2643,2644,2684,2701,2742,2754,2767,2812,2823,2848,2858,2889,2907,2918,2949,2976,2981,2982,3021,3023,3027,3038,3070,3091,3108,3123,3140,3245,3264,3336,3352,3365,3388,3413,3510 ../scripts/run_notebook_cpu.sh
```



```bash
export NOTEBOOK_PATH="$(realpath 31_run_alphafold_wt.ipynb)"
export DATASET_NAME="cagi6-sherloc"
export DATASET_PATH="30_cagi6_sherloc/input-data-gby-protein.parquet"
export ORIGINAL_ARRAY_TASK_COUNT=4182

sbatch --export DATASET_NAME,DATASET_PATH,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --time 3:00:00 --gres=gpu:p100:1 --array=493,940,1098,1900,1902,1903,1905,1908,1909,1910,1911,1913,1914,1915,1919,1920,1922,1923,1924,1926,1927,1928,1933,1934,1935,1937,1939,1941,1942,1945,1947,1949,1950,1951,1952,1956,1959,1961,1964,1966,1970,1972,1982,1991,1999,2004,2012,2013,2014,2017,2021,2026,2027,2034,2042,2046,2054,2058,2064,2068,2071,2072,2073,2074,2078,2079,2081,2084,2085,2087,2089,2090,2095,2100,2101,2102,2103,2105,2114,2118,2119,2123,2127,2129,2132,2133,2134,2135,2137,2139,2143,2148,2150,2155,2160,2161,2162,2166,2181,2186,2188,2189,2191,2195,2205,2207,2215,2281,2283,2286,2297,2352,2355,2360,2362,2368,2373,2376,2378,2382,2385,2386,2397,2405,2406,2408,2411,2412,2413,2416,2417,2421,2422,2423,2425,2427,2428,2431,2435,2438,2439,2440,2442,2444,2445,2448,2450,2453,2454,2455,2456,2459,2460,2464,2465,2468,2470,2471,2472,2476,2477,2482,2483,2485,2487,2488,2490,2493,2495,2496,2498,2499,2501,2502,2504,2506,2507,2510,2511,2512,2515,2519,2521,2532,2533,2536,2537,2538,2542,2543,2545,2548,2550,2552,2554,2555,2557,2560,2562,2564,2566,2570,2572,2573,2574,2587,2588,2590,2593,2594,2596,2597,2599,2602,2604,2607,2608,2611,2616,2617,2618,2619,2621,2622,2627,2638,2645,2646,2648,2650,2651,2652,2653,2655,2656,2657,2659,2660,2661,2662,2665,2666,2669,2670,2671,2673,2674,2676,2678,2679,2680,2681,2682,2683,2684,2685,2686,2689,2690,2691,2692,2694,2701,2703,2704,2706,2708,2710,2711,2712,2714,2715,2716,2717,2718,2719,2720,2721,2724,2725,2726,2728,2729,2730,2731,2732,2733,2734,2736,2737,2738,2739,2740,2741,2742,2743,2744,2745,2746,2747,2753,2754,2755,2758,2759,2760,2761,2777,2778,2779,2780,2781,2782,2783,2784,2786,2787,2788,2789,2790,2791,2792,2798,2799 ../scripts/run_notebook_gpu.sh


export NOTEBOOK_PATH="$(realpath 31_run_alphafold_wt.ipynb)"
export DATASET_NAME="cagi6-sherloc"
export DATASET_PATH="30_cagi6_sherloc/input-data-gby-protein.parquet"
export ORIGINAL_ARRAY_TASK_COUNT=4182

sbatch --export DATASET_NAME,DATASET_PATH,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --time 3:00:00 --gres=gpu:v100l:1 --array=2801,2802,2803,2804,2805,2806,2807,2808,2810,2811,2812,2813,2814,2815,2818,2820,2821,2822,2823,2825,2827,2835,2836,2838,2839,2840,2841,2844,2845,2846,2847,2848,2849,2851,2854,2855,2856,2858,2862,2863,2864,2866,2867,2869,2872,2873,2876,2877,2878,2879,2880,2882,2884,2886,2891,2893,2894,2897,2899,2900,2901,2902,2904,2905,2908,2909,2912,2914,2915,2916,2917,2919,2920,2922,2923,2924,2925,2926,2928,2931,2933,2934,2936,2938,2940,2942,2943,2944,2947,2948,2949,2950,2952,2954,2955,2958,2959,2960,2962,2963,2965,2966,2967,2968,2970,2971,2972,2973,2974,2975,2976,2983,2984,2985,2986,2987,2996,3000,3003,3004,3012,3014,3017,3020,3023,3036,3039,3047,3051,3052,3054,3058,3064,3071,3072,3075,3078,3081,3082,3085,3096,3099,3105,3112,3118,3120,3123,3126,3143,3147,3152,3160,3178,3183,3189,3195,3202,3205,3208,3211,3212,3216,3220,3221,3223,3227,3230,3231,3233,3234,3237,3242,3244,3245,3249,3250,3251,3252,3253,3254,3256,3261,3262,3263,3265,3266,3267,3268,3271,3285,3289,3291,3294,3295,3296,3297,3299,3300,3301,3302,3303,3304,3307,3308,3311,3318,3321,3323,3326,3327,3328,3331,3333,3336,3337,3339,3340,3341,3345,3346,3349,3350,3354,3355,3356,3358,3359,3360,3361,3374,3377,3379,3383,3384,3387,3388,3392,3395,3399,3402,3404,3406,3407,3408,3409,3410,3412,3414,3416,3417,3441,3442,3443,3446,3447,3449,3451,3452,3454,3455,3456,3457,3466,3467,3468,3469,3472,3473,3474,3485,3486,3490,3491,3492,3503,3504,3505,3506,3507,3508,3521,3522,3525,3529,3532,3539,3542,3547,3551,3552,3556,3559,3562,3566,3567,3571,3572,3573,3578,3581,3584,3586,3587,3588,3592,3593,3596,3599 ../scripts/run_notebook_gpu.sh


export NOTEBOOK_PATH="$(realpath 31_run_alphafold_wt.ipynb)"
export DATASET_NAME="cagi6-sherloc"
export DATASET_PATH="30_cagi6_sherloc/input-data-gby-protein.parquet"
export ORIGINAL_ARRAY_TASK_COUNT=4182

sbatch --export DATASET_NAME,DATASET_PATH,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --time 72:00:00 --ntasks-per-node=8 --mem=40G --array=3903,3907,3911,3912,3913,3917,3920,3921,3925,3926,3927,3935,3936,3937,3938,3942,3943,3945,3949,3960,3961,3962,3971,3972,3973,3974,3981,3982,3988,3990,3991,3997,4002,4004,4005,4009,4010,4014,4015,4017,4018,4019,4020,4025,4030,4032,4033,4034,4035,4042,4043,4068,4069,4071,4072,4073,4075,4084,4085,4086,4088,4090,4091,4092,4093,4094,4095,4096,4097,4101,4108,4109,4113,4115,4119,4122,4128,4134,4135,4136,4137,4139,4144,4148,4155,4159,4161,4165,4166,4168,4169,4172,4177,4178,4181 ../scripts/run_notebook_cpu.sh

export NOTEBOOK_PATH="$(realpath 31_run_alphafold_wt.ipynb)"
export DATASET_NAME="cagi6-sherloc"
export DATASET_PATH="30_cagi6_sherloc/input-data-gby-protein.parquet"
export ORIGINAL_ARRAY_TASK_COUNT=4182

sbatch --export DATASET_NAME,DATASET_PATH,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --time 72:00:00 --ntasks-per-node=8 --mem=28G --array=3601,3602,3603,3604,3605,3608,3610,3611,3612,3614,3619,3621,3622,3624,3627,3628,3629,3630,3631,3635,3637,3638,3639,3640,3641,3642,3643,3644,3646,3647,3648,3651,3652,3653,3657,3658,3661,3662,3665,3678,3679,3680,3682,3693,3694,3695,3696,3697,3699,3700,3701,3703,3704,3708,3711,3715,3717,3718,3721,3722,3723,3724,3725,3726,3728,3729,3733,3734,3735,3736,3737,3738,3739,3740,3744,3747,3748,3750,3775,3777,3781,3782,3783,3786,3788,3792,3795,3801,3806,3811,3813,3817,3818,3819,3842,3843,3844,3848,3849,3850,3852,3854,3856,3857,3858,3862,3878,3879,3880,3890,3895,3896,3899 ../scripts/run_notebook_cpu.sh
```

In [None]:
",".join([str(m[0]) for m in missing_files_for_wt])

In [None]:
# Show the first two rows of `input_df` and its total row count.
# NOTE(review): `input_df` is not defined in any cell visible in this
# notebook, so this cell would raise NameError on a fresh kernel.
# TODO: define it (or remove this cell) so Restart & Run All succeeds.
display(input_df.head(2))
print(len(input_df))