## Summary

***Start port forwarding to the ELASPIC webserver machine:***

```bash
ssh -NL 9009:localhost:9009 kimadmin@192.168.6.153
```

## Imports

In [None]:
import datetime
import json
import os
from pathlib import Path
from pprint import pprint

import kmtools
import kmtools.df_tools
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as sa
from dotenv import load_dotenv
from IPython.display import SVG, display

# Short alias for `kmtools.df_tools.print2`; the notebook calls it with a label
# and a value when printing the job-count summaries further down.
print2 = kmtools.df_tools.print2

In [None]:
# Load environment variables from the production env file.
# NOTE(review): presumably this is where the DB_* variables read in the
# Parameters section come from — confirm against the env file.
load_dotenv("../.env.prod")

In [None]:
# Render up to 300 columns when a DataFrame is displayed, and switch off the
# pandas chained-assignment warning (the option's default is 'warn').
pd.options.display.max_columns = 300
pd.set_option("mode.chained_assignment", None)

## Parameters

In [None]:
# Output directory for the JSON files written by this notebook. The relative
# name is made absolute with `resolve()`, and the directory is created if it
# does not already exist.
NOTEBOOK_DIR = Path("01_rerun_failed").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Current UTC date as YYYY-MM-DD; used as a suffix in the output file names.
# `datetime.utcnow()` is deprecated (Python 3.12+), so take an aware "now" in
# UTC instead — the formatted string is the same.
TIMESTAMP = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")

TIMESTAMP

In [None]:
# Endpoint that the (currently commented-out) submission loops POST job
# payloads to.
# NOTE(review): the Summary section forwards port 9009, while this URL targets
# port 8001 — confirm which one is current before re-enabling submission.
JOBSUBMITTER_URL = "http://localhost:8001/elaspic/api/1.0/"

In [None]:
from urllib.parse import quote_plus

# Database connection settings are read from the environment; a missing
# variable raises KeyError right here rather than failing later on connect.
DB_USER = os.environ["DB_USER"]
DB_PASSWORD = os.environ["DB_PASSWORD"]
DB_HOST = os.environ["DB_HOST"]
DB_PORT = os.environ["DB_PORT"]
DB_NAME = os.environ["DB_NAME"]

# Percent-encode the credentials so that characters such as "@", ":", "/" or
# "#" cannot be misread as URL delimiters when the connection string is parsed.
# NOTE(review): assumes the env file stores the raw (not already
# percent-encoded) user name and password — confirm.
engine = sa.create_engine(
    f"mysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

## Load data

### local_jobs

In [None]:
# Fetch the distinct job/mutation rows that have a localID and either a status
# other than 'done' or an empty/NULL path_to_data, limited to jobs with a
# non-empty email. The result is loaded into `local_df`.
# NOTE(review): `status` and `path_to_data` are unqualified in the query; which
# joined table they belong to is not visible from this notebook.
sql_query = """
select distinct jobId, j.email job_email, j.dateRun date_run, localId, protein, m.mut, m.chain
from jobs j
join job_to_mut j2m on (j2m.job_id = j.jobID)
join muts m on (m.id = j2m.mut_id)
left join elaspic_core_model_local e ON (e.protein_id = localId)
WHERE localID is not NULL AND (status != 'done' or path_to_data = '' or path_to_data is null)
AND j.email is not NULL AND j.email != ''
"""
local_df = pd.read_sql_query(sql_query, engine)

In [None]:
# Preview the first rows of the query result, then print the total row count.
display(local_df.head())
print(len(local_df))

In [None]:
# Histogram of `date_run` for the failed local mutations: 240 bins spanning
# 2017 to 2021.
date_window = ("2017", "2021")

fg, ax = plt.subplots(figsize=(12, 3))
ax.hist(local_df["date_run"], bins=240, range=date_window)
ax.set_ylabel("Number of mutations")

# Tick labels rotated by 90 degrees, one major tick per month.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.xaxis.set_major_locator(mdates.MonthLocator())
# ax.xaxis.set_major_formatter(mdates.DateFormatter('%d.%m.%y'))
ax.set_xlim(*date_window)

# A trailing None keeps the cell from echoing the last call's return value.
None

In [None]:
# Build one job-submitter payload per (jobId, job_email) group of failed local
# mutations.
local_jobs = []
for (job_id, job_email), df_gp in local_df.groupby(["jobId", "job_email"]):
    # `assign` returns a new frame, so the groupby slice is not mutated in place.
    # The mutation string is "<chain + 1>_<mut>", with a missing chain treated
    # as 0. Columns are read by label: positional `x[0]` / `x[1]` on a
    # label-indexed Series is deprecated in recent pandas releases.
    df_gp = df_gp.assign(
        structure_file="input.pdb",
        mutations=[
            "{}_{}".format(int(chain if pd.notnull(chain) else 0) + 1, mut)
            for chain, mut in zip(df_gp["chain"], df_gp["mut"])
        ],
    )
    local_jobs.append(
        {
            # NOTE(review): credential is hard-coded in the notebook; consider
            # loading it from the environment file instead.
            "secret_key": "J6;u.950z5750Q#344vy7*idT1FBs0",
            "job_id": job_id,
            "job_email": job_email,
            "job_type": "local",
            "mutations": (
                df_gp.rename(columns={"protein": "protein_id"})[
                    ["protein_id", "mutations", "structure_file"]
                ].to_dict(orient="records")
            ),
        }
    )

# ---
pprint(local_jobs[:3])
print()
# Each list entry is one job (which may carry several mutations), so the count
# is labelled as jobs rather than mutations.
print2("Number of errored local jobs:", len(local_jobs))  # 37

In [None]:
# Serialise the local job payloads to a date-stamped JSON file in NOTEBOOK_DIR.
local_output_file = NOTEBOOK_DIR / f"local-jobs-{TIMESTAMP}.json"
local_output_file.write_text(json.dumps(local_jobs))

local_output_file

In [None]:
# Submission step, currently disabled. When uncommented, it POSTs each payload
# in `local_jobs` to JOBSUBMITTER_URL as JSON, reports non-OK responses, and
# prints the "status" field of each successful response.
#
# for data_in in local_jobs:
#     r = requests.post(JOBSUBMITTER_URL, json=data_in)
#     if not r.ok:
#         print(f"Bad response from jobsubmitter server: {r}")
#         continue
#     status = r.json().get("status", None)
#     print(f"status: {status}")
#     print()

### database_jobs

In [None]:
# Fetch the distinct job/mutation rows that have no localID and a status other
# than 'done', where `elaspic.mutation_in_domain(m.mut, udm.model_domain_def)`
# holds, limited to jobs with a non-empty email. The result is loaded into
# `database_df`.
# NOTE(review): `status` is unqualified in the query; which joined table it
# belongs to is not visible from this notebook.
sql_query = """
select distinct jobId, j.email job_email, j.dateRun date_run, protein, m.mut
from jobs j
join job_to_mut j2m on (j2m.job_id = j.jobID)
join muts m on (m.id = j2m.mut_id)
join elaspic.uniprot_domain ud ON (ud.uniprot_id = protein)
join elaspic.uniprot_domain_model udm USING (uniprot_domain_id)
WHERE localID is NULL AND status != 'done' AND elaspic.mutation_in_domain(m.mut, udm.model_domain_def)
AND j.email is not NULL AND j.email != ''
"""
database_df = pd.read_sql_query(sql_query, engine)

In [None]:
# Preview the first rows of the query result, then print the total row count.
display(database_df.head())
print(len(database_df))  # 3419

In [None]:
# Histogram of `date_run` for the failed database mutations: 240 bins spanning
# 2017 to 2021.
date_window = ("2017", "2021")

fg, ax = plt.subplots(figsize=(12, 3))
ax.hist(database_df["date_run"], bins=240, range=date_window)
ax.set_ylabel("Number of mutations")

# Tick labels rotated by 90 degrees, one major tick per month.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.xaxis.set_major_locator(mdates.MonthLocator())
# ax.xaxis.set_major_formatter(mdates.DateFormatter('%d.%m.%y'))
ax.set_xlim(*date_window)

# Explicit figure margins, passed positionally as left, bottom, right, top.
fg.subplots_adjust(0.07, 0.25, 0.98, 0.97)
# A trailing None keeps the cell from echoing the last call's return value.
None

In [None]:
# Build one job-submitter payload per (jobId, job_email) group of failed
# database mutations.
database_jobs = [
    {
        # NOTE(review): credential is hard-coded in the notebook; consider
        # loading it from the environment file instead.
        "secret_key": "J6;u.950z5750Q#344vy7*idT1FBs0",
        "job_id": job_id,
        "job_email": job_email,
        "job_type": "database",
        "mutations": (
            df_gp.rename(columns={"protein": "protein_id", "mut": "mutations"})[
                ["protein_id", "mutations"]
            ].to_dict(orient="records")
        ),
    }
    for (job_id, job_email), df_gp in database_df.groupby(["jobId", "job_email"])
]

pprint(database_jobs[:3])
print()
# Each list entry is one job (which may carry several mutations), so the count
# is labelled as jobs rather than mutations.
print2("Number of errored database jobs:", len(database_jobs))  # 286

In [None]:
# Serialise the database job payloads to a date-stamped JSON file in NOTEBOOK_DIR.
database_output_file = NOTEBOOK_DIR / f"database-jobs-{TIMESTAMP}.json"
database_output_file.write_text(json.dumps(database_jobs))

database_output_file

In [None]:
# Show the payloads of two specific jobs for manual inspection.
job_ids_to_show = ("70b268", "7715a9")
[job for job in database_jobs if job["job_id"] in job_ids_to_show]

In [None]:
# Submission step, currently disabled. When uncommented, it POSTs each payload
# in `database_jobs` to JOBSUBMITTER_URL as JSON, reports non-OK responses, and
# prints the "status" field of each successful response.
#
# for data_in in database_jobs:
#     r = requests.post(JOBSUBMITTER_URL, json=data_in)
#     if not r.ok:
#         print(f"Bad response from jobsubmitter server: {r}")
#         continue
#     status = r.json().get("status", None)
#     print(f"status: {status}")
#     print()