In [2]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pandas import read_csv, concat, DataFrame
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, RidgeCV, MultiTaskLasso
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

from sklearn.tree import DecisionTreeRegressor

from src.config import CLEANED_PATH, DataType, UNCLEANED_PATH, RAW_FILE, INFERENCE_FILE
from src.logger import log


In [9]:
def add_missing_entries(source_path, target_path):
    """Rewrite ``target_path`` so it holds one scored row per row of ``source_path``.

    Both files are read as comma-separated CSVs.  Only the ``pair_id`` column
    is read from the source; ``pair_id`` and ``Overall`` are read from the
    target.  For every source row, in source order, the output contains that
    ``pair_id`` together with the ``Overall`` value of the first target row
    carrying the same id.  Ids that do not occur in the target receive a
    placeholder score drawn with ``random.choice([2, 3])``; ``random`` is not
    seeded here, so the placeholders vary between runs unless the caller
    seeds it.  Every score is passed through ``clip_score`` before writing.

    The target file is overwritten in place with exactly two columns,
    ``pair_id`` and ``Overall``, and no index column.

    Args:
        source_path: Path of the CSV listing every ``pair_id`` that must
            appear in the output.
        target_path: Path of the CSV holding the known scores; it is read
            first and then overwritten.
    """
    s_df = read_csv(source_path, sep=',', usecols=['pair_id'])
    t_df = read_csv(target_path, sep=',', usecols=['pair_id', 'Overall'])
    # Number of distinct ids on each side, printed as a quick sanity check.
    print(s_df['pair_id'].unique().shape)
    print(t_df['pair_id'].unique().shape)

    # Build the pair_id -> score lookup once instead of filtering the whole
    # target frame for every source row.  Columns are addressed by name:
    # read_csv keeps the file's column order regardless of the order given in
    # `usecols`, so a positional lookup could pick up the wrong column.
    known_scores = (
        t_df.drop_duplicates(subset='pair_id', keep='first')
        .set_index('pair_id')['Overall']
        .to_dict()
    )

    final_rows = []
    for p_id in s_df['pair_id']:
        if p_id in known_scores:
            score = known_scores[p_id]
        else:
            # No score available for this pair: fall back to a mid-range guess.
            score = random.choice([2, 3])
        final_rows.append([p_id, score])

    df = DataFrame(final_rows, columns=['pair_id', 'Overall'])
    df['Overall'] = df['Overall'].apply(clip_score)
    df.to_csv(target_path, mode='w', index=False)

    
def clip_score(val, low=1.0, high=4.0):
    """Clamp ``val`` to the closed interval ``[low, high]``.

    Args:
        val: The score to clamp.
        low: Lower bound; returned when ``val`` is below it.  Defaults to 1.0.
        high: Upper bound; returned when ``val`` is above it.  Defaults to 4.0.

    Returns:
        ``high`` if ``val > high``, ``low`` if ``val < low``, otherwise ``val``
        itself, unchanged.  A NaN fails both comparisons and is therefore
        returned as-is.
    """
    if val > high:
        return high
    if val < low:
        return low
    return val

In [4]:
# Feature files for the train and test splits, built from the project's
# CLEANED_PATH template and the INFERENCE_FILE name.
train_path = CLEANED_PATH.format(data_type=DataType.train.name) + INFERENCE_FILE
test_path = CLEANED_PATH.format(data_type=DataType.test.name) + INFERENCE_FILE

# Columns read from each file.  'overall' is split off below as the regression
# target; every other column is used as an input feature.
cols = [
    'sentences_mean', 'sentences_min', 'sentences_max', 'sentences_med',
    'title',
    'n1_title_n2_text', 'n2_title_n1_text',
    'n1_title_n1_text', 'n2_title_n2_text',
    'start_para', 'end_para',
    'ner', 'tf_idf', 'wmd_dist',
    'overall',
]

# Alternative column subsets kept for reference:
# cols = ['tf_idf', 'wmd_dist', 'overall']
# cols = ['sentences_mean', 'sentences_min', 'sentences_max',
#         'sentences_med', 'title', 'n1_title_n2_text',
#         'n2_title_n1_text', 'n1_title_n1_text',
#         'n2_title_n2_text', 'start_para', 'end_para',
#         'wmd_dist', 'overall']

# Training split: drop exact duplicate rows, then separate target from features.
train_df = read_csv(train_path, sep=',', usecols=cols).drop_duplicates()
train_y = y = train_df.pop('overall')
train_x = train_df

# Test split, used as the validation set; same treatment.
test_df = read_csv(test_path, sep=',', usecols=cols).drop_duplicates()
val_y = test_df.pop('overall')
val_x = test_df



In [24]:
import wandb

# Bayesian hyper-parameter search that maximises the logged "pearson" metric:
# the kernel is fixed to 'poly', the degree is chosen from {2, 3, 4}, and alpha
# is sampled from the range [0.05, 1.0].
sweep_config = {
    "name": "kr-poly-paraall",
    "metric": {"name": "pearson", "goal": "maximize"},
    "method": "bayes",
    "parameters": {
        "kernel": {"values": ['poly']},
        "degree": {"values": [2, 3, 4]},
        "alpha": {"min": 0.05, "max": 1.0},
    },
}

# Register the sweep; the returned id is what wandb.agent needs to run it.
sweep_id = wandb.sweep(sweep_config, project="SemEval-Task-8", entity="notsomonk")

Create sweep with ID: 7mjehf2z
Sweep URL: https://wandb.ai/notsomonk/SemEval-Task-8/sweeps/7mjehf2z


In [25]:
def train():
    """Run one sweep trial: fit a KernelRidge model and log its Pearson score.

    Intended to be passed to ``wandb.agent``, which calls it without
    arguments.  Each call opens a wandb run, loads the train and test feature
    files, fits ``KernelRidge`` with the hyper-parameters held in the run's
    config, predicts on the test split, clips the predictions to
    ``[1.0, 4.0]`` (the same bounds ``clip_score`` applies) and logs the
    Pearson correlation with the true scores under the key ``"pearson"``.
    """
    with wandb.init() as run:
        train_path = CLEANED_PATH.format(data_type=DataType.train.name) + INFERENCE_FILE
        test_path = CLEANED_PATH.format(data_type=DataType.test.name) + INFERENCE_FILE

        # This sweep uses only the two paragraph features; 'overall' is the
        # target.  A wider feature set tried earlier, kept for reference:
        # cols = ['sentences_mean', 'sentences_min', 'sentences_max',
        #         'sentences_med', 'title', 'n1_title_n2_text',
        #         'n2_title_n1_text', 'n1_title_n1_text',
        #         'n2_title_n2_text', 'start_para', 'end_para', 'wmd_dist',
        #         'overall']
        cols = ['start_para', 'end_para', 'overall']

        # Drop exact duplicate rows, then split target from features.
        train_df = read_csv(train_path, sep=',', usecols=cols).drop_duplicates()
        train_y = train_df.pop('overall')
        train_x = train_df

        test_df = read_csv(test_path, sep=',', usecols=cols).drop_duplicates()
        val_y = test_df.pop('overall')
        val_x = test_df

        # The run's config carries the hyper-parameters picked for this trial;
        # they are forwarded to the estimator as keyword arguments.
        model = KernelRidge(**run.config)
        model.fit(train_x, train_y)

        # np.clip bounds the whole array at once, replacing the per-element
        # Python loop that np.vectorize(clip_score) would perform.
        y_pred = np.clip(model.predict(val_x), 1.0, 4.0)

        # pearsonr returns (statistic, p-value); only the statistic is logged.
        p_s = pearsonr(y_pred, val_y)
        run.log({"pearson": p_s[0]})

# Start an agent for the sweep registered above; it calls train() once per
# trial and stops after `count` trials.
count = 20
wandb.agent(sweep_id, count=count, function=train)

[34m[1mwandb[0m: Agent Starting Run: kswwx4rk with config:
[34m[1mwandb[0m: 	alpha: 0.1722602345233244
[34m[1mwandb[0m: 	degree: 3
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63636


[34m[1mwandb[0m: Agent Starting Run: 1nv2wn1c with config:
[34m[1mwandb[0m: 	alpha: 0.3345315447543734
[34m[1mwandb[0m: 	degree: 2
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63196


[34m[1mwandb[0m: Agent Starting Run: jeoxo8np with config:
[34m[1mwandb[0m: 	alpha: 0.6599309865666392
[34m[1mwandb[0m: 	degree: 2
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63116


[34m[1mwandb[0m: Agent Starting Run: n50qh4hq with config:
[34m[1mwandb[0m: 	alpha: 0.2231879338420691
[34m[1mwandb[0m: 	degree: 3
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63621


[34m[1mwandb[0m: Agent Starting Run: xx3pjt3c with config:
[34m[1mwandb[0m: 	alpha: 0.08171657266033877
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63722


[34m[1mwandb[0m: Agent Starting Run: rmhcju3k with config:
[34m[1mwandb[0m: 	alpha: 0.07488219699657869
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63723


[34m[1mwandb[0m: Agent Starting Run: 7q8nn040 with config:
[34m[1mwandb[0m: 	alpha: 0.08821160043489931
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63721


[34m[1mwandb[0m: Agent Starting Run: oa8q2lof with config:
[34m[1mwandb[0m: 	alpha: 0.1419197709575063
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63712


[34m[1mwandb[0m: Agent Starting Run: dterhg5w with config:
[34m[1mwandb[0m: 	alpha: 0.050208873259071245
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63727


[34m[1mwandb[0m: Agent Starting Run: t12eljew with config:
[34m[1mwandb[0m: 	alpha: 0.08216220297540605
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63722


[34m[1mwandb[0m: Agent Starting Run: xf7qkx2q with config:
[34m[1mwandb[0m: 	alpha: 0.06714877764300227
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63724


[34m[1mwandb[0m: Agent Starting Run: t5g4n2x0 with config:
[34m[1mwandb[0m: 	alpha: 0.185368633009365
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63708


[34m[1mwandb[0m: Agent Starting Run: u4lt6hgj with config:
[34m[1mwandb[0m: 	alpha: 0.10300098680606908
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63718


[34m[1mwandb[0m: Agent Starting Run: tpkqv0qk with config:
[34m[1mwandb[0m: 	alpha: 0.0571562391358595
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63726


[34m[1mwandb[0m: Agent Starting Run: zgeyi7c2 with config:
[34m[1mwandb[0m: 	alpha: 0.0949210248220363
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63719


[34m[1mwandb[0m: Agent Starting Run: gpth7a2n with config:
[34m[1mwandb[0m: 	alpha: 0.0870819433648389
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63721


[34m[1mwandb[0m: Agent Starting Run: tnoy1sso with config:
[34m[1mwandb[0m: 	alpha: 0.08507786318130203
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63721


[34m[1mwandb[0m: Agent Starting Run: 8l6z0oe3 with config:
[34m[1mwandb[0m: 	alpha: 0.07538490009176158
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63723


[34m[1mwandb[0m: Agent Starting Run: rvmertic with config:
[34m[1mwandb[0m: 	alpha: 0.17552812876606155
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63709


[34m[1mwandb[0m: Agent Starting Run: ihvdz0tz with config:
[34m[1mwandb[0m: 	alpha: 0.05561780682005138
[34m[1mwandb[0m: 	degree: 4
[34m[1mwandb[0m: 	kernel: poly
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
pearson,▁

0,1
pearson,0.63726
