In [21]:
import os
import pandas as pd

In [22]:
import wandb
from kaggle_secrets import UserSecretsClient

In [23]:
# Directory layout on the Kaggle VM. Every path below is derived from the
# working directory, so only `wkdir_prefix` needs to change to relocate the
# whole tree.
wkdir_prefix = "/kaggle/working"
aiml_dir = f"{wkdir_prefix}/aiml-thesis"
bertsum_dir = f"{aiml_dir}/submods/bertsum"
bertsum_src_dir = f"{bertsum_dir}/src"
raw_dials_dir = f"{aiml_dir}/data/raw_dialogues"

In [24]:
# Work from the Kaggle working directory so the relative paths used by the
# following cells (results/, models/, logs/, src/) resolve there.
os.chdir(wkdir_prefix)

In [25]:
# Export the BertSum checkout path so the `!` shell cells can refer to it as
# $BERTSUM_DIR. This cell has to run before any cell that uses that variable.
%env BERTSUM_DIR={bertsum_dir}

env: BERTSUM_DIR=/kaggle/working/aiml-thesis/submods/bertsum


In [20]:
!mkdir results models logs
!cp -r $BERTSUM_DIR/src .
!cp -r $BERTSUM_DIR/models/bert_transformer/* ./models

mkdir: cannot create directory 'results': File exists
mkdir: cannot create directory 'models': File exists
mkdir: cannot create directory 'logs': File exists
cp: cannot stat '/src': No such file or directory
cp: cannot stat '/models/bert_transformer/*': No such file or directory


In [None]:
def calc_metrics_df(df, rouge_only=True):
    """Score every generated summary in ``df`` against its reference.

    Relies on ``rouge_scorer`` (and, when ``rouge_only`` is false, on
    ``bertscore`` and ``meteor``) already being defined in the notebook
    namespace.
    NOTE(review): no visible cell imports these names; presumably
    ``rouge_score.rouge_scorer`` and two loaded metric objects - confirm and
    add the import cell.

    Args:
        df: DataFrame with a ``summary`` column (reference text) and a
            ``response`` column (generated text). A ``conv_id`` column is
            read as well when ``rouge_only`` is false.
        rouge_only: When true (default) only ROUGE-1/2/L/Lsum are computed.
            When false, BERTScore and METEOR are added and each record
            starts with the row's ``conv_id``.

    Returns:
        A list with one dict per row of ``df`` (a plain list, not a
        DataFrame). ROUGE entries are keyed ``<metric>_pr``, ``<metric>_re``
        and ``<metric>_f1``; all values are rounded to 4 decimals.
    """
    # The scorer configuration is the same for every row, so build it once
    # rather than once per row.
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True
    )

    records = []
    for _, row in df.iterrows():
        reference, prediction = row['summary'], row['response']

        rouges = {}
        for name, score in scorer.score(reference, prediction).items():
            rouges[f"{name}_pr"] = round(score.precision, 4)
            rouges[f"{name}_re"] = round(score.recall, 4)
            rouges[f"{name}_f1"] = round(score.fmeasure, 4)

        if rouge_only:
            records.append(rouges)
            continue

        bert_scores = bertscore.compute(
            predictions=[prediction], references=[reference], lang="en"
        )
        # 'hashcode' is not a per-sample score list, so drop it before the
        # remaining entries are indexed with [0] below.
        bert_scores.pop('hashcode')
        meteor_score = meteor.compute(
            predictions=[prediction], references=[reference]
        )['meteor']
        records.append({
            'conv_id': row['conv_id'],
            **rouges,
            # k[:2] shortens the metric name to a two-letter suffix.
            **{f"bertscore_{k[:2]}": round(v[0], 4) for k, v in bert_scores.items()},
            'meteor': round(meteor_score, 4),
        })
    return records

In [13]:
def evaluate_rouge(split_name, src_dir=None, results_dir="./results", run_tag="2408_1327"):
    """Score every ``results_step<N>.gold`` / ``.candidate`` pair and save CSVs.

    Only the top level of ``src_dir`` is inspected. For each
    ``results_step<N>.gold`` file, the sibling ``results_step<N>.candidate``
    is read line by line, the pairs are scored with ``calc_metrics_df`` and
    the scores are written to
    ``<results_dir>/<split_name>_res_bertsum_s<N>_<run_tag>.csv``.

    Args:
        split_name: Prefix for the output file names (e.g. ``"train"``).
        src_dir: Directory holding the gold/candidate files. Defaults to the
            notebook-level ``bertsum_src_dir``.
        results_dir: Output directory; created if it does not exist.
        run_tag: Suffix appended to every output file name.

    Raises:
        FileNotFoundError: If a ``.gold`` file has no ``.candidate`` sibling.
        ValueError: If a gold/candidate pair has different line counts.
    """
    if src_dir is None:
        src_dir = bertsum_src_dir

    # Take only the first (top-level) entry of the walk. A missing directory
    # yields nothing, in which case the function is a silent no-op.
    top_level = next(os.walk(src_dir, topdown=True), None)
    if top_level is None:
        return
    root, _, files = top_level

    # DataFrame.to_csv does not create missing parent folders.
    os.makedirs(results_dir, exist_ok=True)

    for results_file in files:
        if not (results_file.startswith("results_step") and results_file.endswith(".gold")):
            continue
        stem = results_file[:-len(".gold")]
        step_num = stem[len("results_step"):]

        gold_path = os.path.join(root, results_file)
        cand_path = os.path.join(root, stem + ".candidate")
        with open(gold_path, 'r') as gold_file, open(cand_path, 'r') as cand_file:
            references = gold_file.readlines()
            candidates = cand_file.readlines()
        if len(references) != len(candidates):
            raise ValueError(
                f"{gold_path} has {len(references)} lines but "
                f"{cand_path} has {len(candidates)}"
            )

        df = pd.DataFrame({'summary': references, 'response': candidates})
        results_df = pd.DataFrame(calc_metrics_df(df))
        out_path = os.path.join(
            results_dir, f"{split_name}_res_bertsum_s{step_num}_{run_tag}.csv"
        )
        results_df.to_csv(out_path, index=False, header=True)
    print("Finished saving evaluation results")

In [14]:
def test_model_checkpoints(split_name, models_dir='../models/bert_transformer'):
    %env SPLIT_NAME={split_name}
    for (root, _, files) in os.walk(models_dir):
        for filename in files:
            if filename.endswith(".pt"):
                step_num = filename[11:-3]
                %env TEST_FROM={os.path.join(root, filename)}
                !python train.py -mode test -test_from $TEST_FROM -bert_data_path ./bert_pt/bertsumdata_ind -dataset $SPLIT_NAME -model_path ../models/bert_transformer -visible_gpus 0 -gpu_ranks 0 -batch_size 30000 -log_file ../logs/bert_transformer_2408_1333_res -result_path ./results -test_all true -report_rouge false -block_trigram true
    evaluate_rouge("test")

In [None]:
# Enter the copied BertSum sources. An absolute path keeps this cell re-runnable:
# a relative "./src" fails on a second execution because the kernel is
# already inside src.
os.chdir(os.path.join(wkdir_prefix, "src"))

In [None]:
# Copy the BERT config JSON from the BertSum checkout into the parent of the
# current directory. Requires $BERTSUM_DIR to have been exported beforehand.
!cp $BERTSUM_DIR/bert_config_uncased_base.json ../

In [16]:
# List the current directory as a quick check of what is in place before the
# checkpoints are evaluated.
!ls -la

total 24
drwxr-xr-x 6 root root 4096 Aug 28 16:34 .
drwxr-xr-x 5 root root 4096 Aug 28 16:31 ..
drwxr-xr-x 2 root root 4096 Aug 28 16:31 .virtual_documents
drwxr-xr-x 2 root root 4096 Aug 28 16:34 logs
drwxr-xr-x 2 root root 4096 Aug 28 16:34 models
drwxr-xr-x 2 root root 4096 Aug 28 16:34 results


In [None]:
# Evaluate all checkpoints on the training split first, then on validation.
for split in ("train", "valid"):
    test_model_checkpoints(split)

In [None]:
# Return to the Kaggle working directory once the work inside src is done.
os.chdir(wkdir_prefix)

In [2]:
# Attached Kaggle input dataset; the per-checkpoint metric CSVs live in its
# results/ sub-folder.
input_dataset_dir = "/kaggle/input/bert-input-cont-train-2708-1442"
results_dir = f"{input_dataset_dir}/results"

In [7]:
# The gold/candidate summary files sit in the dataset root, i.e. one level
# above the results folder.
summaries_dir = os.path.dirname(results_dir)

In [8]:
# Expose both dataset paths to the `!` shell cells as $RESULTS_DIR and
# $SUMMARIES_DIR.
%env RESULTS_DIR={results_dir}
%env SUMMARIES_DIR={summaries_dir}

env: RESULTS_DIR=/kaggle/input/bert-input-cont-train-2708-1442/results
env: SUMMARIES_DIR=/kaggle/input/bert-input-cont-train-2708-1442


In [9]:
# Inspect the dataset root to see which gold/candidate files are available.
!ls -la $SUMMARIES_DIR

total 1496
drwxr-xr-x  7 nobody nogroup      0 Aug 28 09:41 .
drwxr-xr-x  3 root   root      4096 Aug 28 09:43 ..
-rw-r--r--  1 nobody nogroup 130414 Aug 28 09:41 __notebook__.ipynb
-rw-r--r--  1 nobody nogroup 159038 Aug 28 09:41 __output__.json
-rw-r--r--  1 nobody nogroup 407397 Aug 28 09:41 __results__.html
-rw-r--r--  1 nobody nogroup    313 Aug 28 09:41 bert_config_uncased_base.json
-rw-r--r--  1 nobody nogroup      0 Aug 28 09:41 custom.css
drwxr-xr-x  2 nobody nogroup      0 Aug 28 09:41 logs
drwxr-xr-x  2 nobody nogroup      0 Aug 28 09:41 models
drwxr-xr-x  2 nobody nogroup      0 Aug 28 09:41 results
-rw-r--r--  1 nobody nogroup  32142 Aug 28 09:41 results_step10000.candidate
-rw-r--r--  1 nobody nogroup  46019 Aug 28 09:41 results_step10000.gold
-rw-r--r--  1 nobody nogroup  32757 Aug 28 09:41 results_step15000.candidate
-rw-r--r--  1 nobody nogroup  46019 Aug 28 09:41 results_step15000.gold
-rw-r--r--  1 nobody nogroup  32240 Aug 28 09:41 results_step20000.candidate
-rw-r-

In [11]:
# Bundle every .gold and .candidate file from the dataset root into a single
# archive under /kaggle/working. Changing directory first means the entries
# are stored under their bare file names.
!cd $SUMMARIES_DIR && zip /kaggle/working/bert_summaries_2808_1042.zip *.gold *.candidate

  adding: results_step10000.gold (deflated 62%)
  adding: results_step15000.gold (deflated 62%)
  adding: results_step20000.gold (deflated 62%)
  adding: results_step25000.gold (deflated 62%)
  adding: results_step30000.gold (deflated 62%)
  adding: results_step35000.gold (deflated 62%)
  adding: results_step40000.gold (deflated 62%)
  adding: results_step45000.gold (deflated 62%)
  adding: results_step5000.gold (deflated 62%)
  adding: results_step50000.gold (deflated 62%)
  adding: results_step10000.candidate (deflated 60%)
  adding: results_step15000.candidate (deflated 60%)
  adding: results_step20000.candidate (deflated 60%)
  adding: results_step25000.candidate (deflated 60%)
  adding: results_step30000.candidate (deflated 60%)
  adding: results_step35000.candidate (deflated 60%)
  adding: results_step40000.candidate (deflated 60%)
  adding: results_step45000.candidate (deflated 60%)
  adding: results_step5000.candidate (deflated 59%)
  adding: results_step50000.candidate (deflat

In [5]:
# Archive the results folder recursively into the current directory.
# NOTE(review): $RESULTS_DIR is absolute and zip runs from the current
# directory, so entries are stored with the whole kaggle/input/... path
# rather than bare file names - confirm this layout is intended.
!zip -r bertsum_results_2808_1042.zip $RESULTS_DIR

  adding: kaggle/input/bert-input-cont-train-2708-1442/results/ (stored 0%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s20000_2408_1930.csv (deflated 69%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s45000_2408_1930.csv (deflated 70%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s50000_2408_1930.csv (deflated 70%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s5000_2408_1930.csv (deflated 70%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s25000_2408_1930.csv (deflated 70%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s10000_2408_1930.csv (deflated 69%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s15000_2408_1930.csv (deflated 69%)
  adding: kaggle/input/bert-input-cont-train-2708-1442/results/test_res_bertsum_s35000_2408_1930.csv (deflated 69

In [24]:
# Client for the Kaggle secrets store; used to fetch the W&B API key so the
# key never appears in the notebook.
user_secrets = UserSecretsClient()

In [25]:
# Select the W&B project through the environment, authenticate with the key
# held in Kaggle secrets, then open a run using thread-based start-up.
os.environ.update(WANDB_PROJECT="aiml-thesis-test")
wandb.login(key=user_secrets.get_secret("WANDB_API_KEY"))
thread_settings = wandb.Settings(start_method="thread")
run = wandb.init(settings=thread_settings)



VBox(children=(Label(value='0.330 MB of 0.337 MB uploaded\r'), FloatProgress(value=0.977638448181268, max=1.0)…

In [27]:
# Log every per-checkpoint metrics CSV found under results_dir as a W&B table.
for root, _, files in os.walk(results_dir):
    # Sorted so the tables are logged in a stable order on every run.
    for res_file in sorted(files):
        stem, ext = os.path.splitext(res_file)
        if ext != ".csv":
            # Anything that is not a metrics CSV would either fail to parse
            # or be logged under a mangled key.
            continue
        file_path = os.path.join(root, res_file)
        df = pd.read_csv(file_path)
        tbl = wandb.Table(dataframe=df)
        # Key is the file name from the first "s" that follows an underscore
        # onwards, e.g. "..._s20000_2408_1930.csv" -> "s20000_2408_1930".
        run.log({f'bertsum_test/{stem[stem.find("_s")+1:]}-table': tbl})

In [28]:
# Finish the active W&B run now that all tables have been logged.
wandb.finish()

VBox(children=(Label(value='0.237 MB of 0.237 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))