# Sentence Embeddings using Siamese BERT-Networks/Sentence Transformers
* Generic model, get embeddings, finetune, evaluate over card split

This version also combines white/black pairs and trains over a single text column
* https://datascience.stackexchange.com/questions/39345/how-to-replace-a-part-string-value-of-a-column-using-another-column
* NOTE: Some cards lack a "____" - need to handle them 


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

## Install Sentence Transformer Library

In [2]:
# # Install the library using pip
# !pip3 install sentence-transformers scikit-learn -U
# !pip3 install nltk -U

# import nltk
# nltk.download('punkt')

In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_auc_score  

# https://stackoverflow.com/questions/53784971/how-to-disable-convergencewarning-using-sklearn
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
from sklearn.model_selection import train_test_split


In [4]:
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses, SentencesDataset ## https://www.sbert.net/docs/package_reference/losses.html + SentencesDataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, losses ## MultipleNegativesRankingLoss
## https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss
# from sentence_transformers.readers import InputExample
import logging
import os
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
import sys

In [5]:
# --- Run configuration ---------------------------------------------------
# Sentence-transformer checkpoint to load. A hub name such as
# "all-MiniLM-L6-v2" can be used here instead of the local checkpoint.
# (Colab location: "/content/drive/MyDrive/research/cah/cah_tsdae-model")
model_name = "output/cah_tsdae-model"

# Keep only sentences whose pair occurred at least this many times.
# Sizes noted earlier: min 5 -> 200K rows, min 1 -> 1.9M rows.
min_cooccurences = 3

# True -> the train/eval split is built from the single merged `text` column.
ONE_COL_DATA_FORMAT = True

# Columns encoded by the two-column similarity cells
# (alternative: ["black_card_text", "white_card_text"]).
USE_TEXT_COLS = ["text", "white_card_text"]

## Load the sBERT Model

* Default , later try pretrained+ 

In [6]:
# Load the sentence-embedding model selected by `model_name`.
# Reference lists of pretrained checkpoints:
#   NLI models: https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
#   STS models: https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md
#   Overview:   https://www.sbert.net/docs/pretrained_models.html
# Alternative checkpoints (commented out):
# model = SentenceTransformer('bert-base-nli-mean-tokens')
# model = SentenceTransformer("nli-distilroberta-base-v2")
# model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
# model = SentenceTransformer("paraphrase-MiniLM-L12-v2")
model = SentenceTransformer(model_name)
# model = SentenceTransformer("all-MiniLM-L12-v2")

## Setup a Corpus

* `/content/drive/MyDrive/Research/CAH/cah_train_min7_v1.csv` - 37K
    * `/content/drive/MyDrive/Research/CAH/cah_min7_v2.csv.gz`
* `/content/drive/MyDrive/Research/CAH/cah_train_min6_v1.csv`

In [7]:
# Earlier inputs, kept for reference:
#   "/content/drive/MyDrive/Research/CAH/cah_min6_v2.csv.gz"
#   "/content/drive/MyDrive/Research/CAH/cah_min2_v2.csv.gz"
#   "/content/drive/MyDrive/Research/CAH/cah_train_min4_v1.csv"
#   (read with pd.read_csv(..., usecols=["black_card_text","white_card_text","picks"]))
# Colab location of the parquet file: "/content/drive/MyDrive/research/cah/cah_train_games.parquet"
TRAIN_PATH = "cah_train_cardsplit_games.parquet"

# Read the training rounds, drop the optional `prior_white` column, cast the
# label to int and order the rows so that picked (won == 1) pairs come first.
df = (
    pd.read_parquet(TRAIN_PATH)
    .drop(columns=["prior_white"], errors="ignore")
    .assign(won=lambda frame: frame["won"].astype(int))
    .sort_values(by="won", ascending=False)
)
print(len(df), "rows")

# Number of rows sharing the same merged text; used later to filter out
# sentences that occur fewer than k times.
df["pair_count"] = df.groupby("text")["won"].transform("count")
print(df[["won", "pair_count"]].describe().round(2))

# De-duplication is deferred to a later cell, so the row count is unchanged here.
print(len(df))

1008030 rows
              won  pair_count
count  1008030.00  1008030.00
mean         0.11        3.51
std          0.32        1.39
min          0.00        1.00
25%          0.00        3.00
50%          0.00        3.00
75%          0.00        4.00
max          1.00       13.00
1008030


Test set
* Keeps round level grouping

In [8]:
# Held-out test rounds; round-level grouping (`fake_round_id`) is kept.
# Colab location: "/content/drive/MyDrive/research/cah/cah_test_games.parquet"
#
# The rows are shuffled with a fixed seed. An unseeded shuffle gives a
# different row order on every run, which changes how ties are broken when the
# per-round top-1 row is selected after sorting by a score.
df_test = (
    pd.read_parquet("cah_test_cardsplit_games.parquet")
    .drop(columns=["prior_white"], errors="ignore")
    .sample(frac=1, random_state=42)
)

df_test["won"] = df_test["won"].astype(int)
print(df_test.nunique())
print(df_test.shape[0],"rows")
df_test

fake_round_id      129753
black_card_text       396
white_card_text      1960
won                     2
text               107390
sum_won                 8
dtype: int64
312663 rows


Unnamed: 0,fake_round_id,black_card_text,white_card_text,won,text,sum_won
2473311,247332,Get ready for the movie of the summer! One cop...,Not being a part of my son's life,0,Get ready for the movie of the summer! One cop...,2
653395,65340,It's beginning to look a lot like __.,Going to prom with a 108-year-old vampire,0,It's beginning to look a lot like Going to pro...,0
1933200,193321,"Hey baby, come back to my place and I'll show ...",A positive male role model,0,"Hey baby, come back to my place and I'll show ...",0
907963,90797,"I'm just gonna stay in tonight. You know, Netf...",Filling a man's anus with concrete,0,"I'm just gonna stay in tonight. You know, Netf...",0
306475,30648,"Well what do you have to say for yourself, Cas...",A horse with no legs,0,"Well what do you have to say for yourself, Cas...",0
...,...,...,...,...,...,...
2231342,223135,I have solved politics. My solution is __.,Falling into a pit of waffles,0,I have solved politics. My solution is Falling...,1
2076273,207628,This is the way the world ends This is the way...,Chunky highlights,0,This is the way the world ends This is the way...,1
406780,40679,A study published in Nature this week found th...,Mooing,0,A study published in Nature this week found th...,0
1169372,116938,"Up next on Nickelodeon: ""Clarissa Explains __.""",Poopy diapers,0,"Up next on Nickelodeon: ""Clarissa Explains Poo...",1


### check mean baseline prior
* By minimum co-occurrences of sentence pairs in the raw data
1 min occ Prior Acc: 0.2044
2 min occ Prior Acc: 0.2032
3 min occ Prior Acc: 0.2027
4 min occ Prior Acc: 0.2011
5 min occ Prior Acc: 0.1922
6 min occ Prior Acc: 0.1762
7 min occ Prior Acc: 0.1503

In [9]:
# Prior-only baselines on the test rounds, for several minimum pair counts.
# (An earlier variant of this loop scanned thresholds 1-8 with the white-card prior only.)

def _top1_win_rate(frame, rank_cols):
    """Mean of `won` over the first row of each round after sorting by `rank_cols` descending."""
    best_per_round = frame.sort_values(rank_cols, ascending=False).groupby("fake_round_id").head(1)
    return best_per_round["won"].mean()


for min_count in range(1, 6):
    frequent = df.loc[df["pair_count"] >= min_count]
    print(f"{min_count} min occ, {frequent.shape[0]} rows")

    # Win rate of each white card in the (filtered) training data.
    white_prior = (
        frequent.groupby("white_card_text")["won"].mean().rename("white_prior").to_frame()
    )
    scored = df_test.join(white_prior, on="white_card_text", how="left")
    # Cards unseen in training get the mean prior of the matched test rows.
    scored["white_prior"] = scored["white_prior"].fillna(scored["white_prior"].mean())
    print("White Prior Acc: %.3f" % _top1_win_rate(scored, "white_prior"))

    # Prior for a black/white combination: number of times the pair won.
    # (Alternatives considered: mean win frequency, or a thresholded max.)
    pair_prior = (
        frequent.groupby(["white_card_text", "black_card_text"])["won"]
        .sum()
        .rename("pair_prior")
        .to_frame()
    )
    scored = scored.join(pair_prior, on=["white_card_text", "black_card_text"], how="left")
    scored["pair_prior"] = scored["pair_prior"].fillna(scored["pair_prior"].mean())

    print("Pair Prior (Only)  Acc: %.3f" % _top1_win_rate(scored, "pair_prior"))
    print("White then Pair Prior Acc: %.3f" % _top1_win_rate(scored, ["white_prior", "pair_prior"]))
    print("Pair Prior then White Acc: %.3f" % _top1_win_rate(scored, ["pair_prior", "white_prior"]))
    # `df_test` itself is left without the temporary prior columns.

1 min occ, 1008030 rows
White Prior Acc: 0.122
Pair Prior (Only)  Acc: 0.116
White then Pair Prior Acc: 0.121
Pair Prior then White Acc: 0.121
2 min occ, 966844 rows
White Prior Acc: 0.122
Pair Prior (Only)  Acc: 0.116
White then Pair Prior Acc: 0.121
Pair Prior then White Acc: 0.121
3 min occ, 771250 rows
White Prior Acc: 0.121
Pair Prior (Only)  Acc: 0.116
White then Pair Prior Acc: 0.121
Pair Prior then White Acc: 0.121
4 min occ, 445801 rows
White Prior Acc: 0.122
Pair Prior (Only)  Acc: 0.116
White then Pair Prior Acc: 0.121
Pair Prior then White Acc: 0.121
5 min occ, 212557 rows
White Prior Acc: 0.121
Pair Prior (Only)  Acc: 0.116
White then Pair Prior Acc: 0.121
Pair Prior then White Acc: 0.121


Drop duplicate instances with the same output (i.e. ignore round-level ranking)
* Keep positives preferably
* Could do: weight or filter by # occurrences

* 22% mean win rate after this (instead of 10%)

In [10]:
## filter for sentence pairs occurring at least X times, regardless of label
df = df.loc[df["pair_count"] >=min_cooccurences]
print(df.nunique())
print(df.shape[0],f"rows after {min_cooccurences} filter of pairs")
df = df.drop(columns=["fake_round_id","prior_white","pair_count"],errors="ignore") # drop round id if not doing group level ranking
df = df.sort_values(by="won",ascending=False).drop_duplicates(subset=["black_card_text","white_card_text"],keep="first").sample(frac=1)
print("mean won:",df["won"].mean())
print(df.nunique())
print(df.shape[0],"rows")
df

fake_round_id      142659
black_card_text       377
white_card_text      1571
won                     2
text               205520
sum_won                 9
pair_count             11
dtype: int64
771250 rows after 3 filter of pairs
mean won: 0.331758466329311
black_card_text       377
white_card_text      1571
won                     2
text               205520
sum_won                 9
dtype: int64
205520 rows


Unnamed: 0,black_card_text,white_card_text,won,text,sum_won
2535065,What never fails to liven up the party?,Whispering all sexy,0,What never fails to liven up the party? Whispe...,0
2712581,What's making things awkward in the sauna?,Gay conversion therapy,0,What's making things awkward in the sauna? Gay...,0
1593710,You love Black Friday. You love Cyber Monday. ...,Blackface,1,You love Black Friday. You love Cyber Monday. ...,3
2896899,"I've had a horrible vision, father. I saw moun...",Sucking the caviar straight out of a fish's pussy,0,"I've had a horrible vision, father. I saw moun...",0
301071,What's fun until it gets weird?,Barely legal boys,1,What's fun until it gets weird? Barely legal boys,1
...,...,...,...,...,...
1045990,Computer! Display __ on screen. Enhance.,Smashing my balls at the moment of climax,1,Computer! Display Smashing my balls at the mom...,1
2738538,"Sure, sex is great, but have you tried __?",Homework,0,"Sure, sex is great, but have you tried Homework?",0
2886179,Say it loud! I'm __ and I'm proud!,The human body,0,Say it loud! I'm The human body and I'm proud!,0
2600947,And what did you bring for show and tell?,"A hit new fantasy show called ""Penis Man.""",0,And what did you bring for show and tell? A hi...,0


In [11]:
# import csv
# df[[#'black_card_text',
#     # 'white_card_text',
#   'text','won']].rename(columns={'won':"target"}).to_csv(f"cah_min{min_cooccurences}.csv.gz",compression="gzip",index=False,quoting=csv.QUOTE_ALL)

#### train - eval split (if doing supervised pretraining... )

##### Data formats: 
1. 2 text cols
  * Can be white, black
  * could be merged (`text`) and white/black,
2. single joint text col
  * Merged (`text`) col

In [12]:
# df = df.sample(200)

In [13]:
# Build the train / held-in test split used by the embedding classifiers.
if ONE_COL_DATA_FORMAT:
    # Single merged-text column: each example is one sentence with a 0/1 label.
    X_train, X_test, y_train, y_test = train_test_split(
        list(df["text"].values),
        list(df["won"].astype(int).values),
        test_size=0.1,
        random_state=42,
    )
    # train_samples = [InputExample(texts=[X_train[i]], label=float(y_train[i])) for i in range(len(X_train))]
else:
    # The two-column variant exists only as the commented sketch below. Fail
    # here with a clear message rather than with a NameError on X_train in a
    # later cell.
    raise NotImplementedError(
        "ONE_COL_DATA_FORMAT=False (two text columns) is not implemented; "
        "see the commented sketch in this cell."
    )

# Two-column sketch, kept for reference (can try different columns via USE_TEXT_COLS):
# X_train, X_test, y_train, y_test = train_test_split(
#     list(df[USE_TEXT_COLS].values), list(df["won"].values), test_size=0.1, random_state=42)
# train_samples = [InputExample(texts=[X_train[i][0], X_train[i][1]], label=float(y_train[i])) for i in range(len(X_train))]
# Binary evaluator expects two sentence lists: https://www.sbert.net/docs/package_reference/evaluation.html
# dev_eval = evaluation.BinaryClassificationEvaluator(sentences1=[x[0] for x in X_test], sentences2=[x[1] for x in X_test], labels=y_test,
#                                                     batch_size=128, show_progress_bar=True, write_csv=True)

* See how well unsupervised model does? 
* Check if funny combs are close or far

In [14]:
# Report how many examples ended up in each split.
for split_label, split_items in (("tr", X_train), ("test", X_test)):
    print(len(split_items), split_label)

184968 tr
20552 test


In [15]:
%%time
#X_test_emb
X_test = model.encode(X_test,
                      normalize_embeddings=False,
                      show_progress_bar=True,
                      batch_size=64
                    #   convert_to_tensor=True
                    )


X_train = model.encode(X_train,
                      normalize_embeddings=False,
                      show_progress_bar=True,
                      batch_size=64
                    #   convert_to_tensor=True
                    )

Batches:   0%|          | 0/322 [00:00<?, ?it/s]

Batches:   0%|          | 0/2891 [00:00<?, ?it/s]

Wall time: 37min 37s


In [16]:
%%time
# StandardScaler
clf = make_pipeline(StandardScaler(),  LogisticRegression(solver="sag")) ## PCA(n_components=2), # error with array? 
# clf = LogisticRegression()
# cv_preds = cross_val_predict(clf,X,y,n_jobs=-2)

cv_preds = cross_val_predict(clf,X_train,y_train,method="predict_proba",cv=3,n_jobs=-2)[:,1]
print(classification_report(y_true=y_train,y_pred=cv_preds>0.5))
print("roc_auc %.4f" %roc_auc_score(y_true=y_train,y_score=cv_preds))


# #### min 4 (joint text), v1, 371K records
#               precision    recall  f1-score   support

#            0       0.65      0.90      0.75    227115
#            1       0.58      0.22      0.32    144299

#     accuracy                           0.64    371414
#    macro avg       0.61      0.56      0.54    371414
# weighted avg       0.62      0.64      0.58    371414

# roc_auc 0.620


#               precision    recall  f1-score   support

#            0       0.63      0.78      0.70     50158
#            1       0.57      0.38      0.46     37673

#     accuracy                           0.61     87831
#    macro avg       0.60      0.58      0.58     87831
# weighted avg       0.60      0.61      0.59     87831

# roc_auc 0.62310
# Wall time: 6min 21s

### min 6, with delta instead of white card features: same results as above

# ### min 6 + with some delta, diff, and B+W + cosine sim[0]

## min 6 (Above) + nli-distilroberta-base-v2:
## roc_auc 0.634

# ### min 7
# # ## with some delta, diff, and B+W + cosine sim[0]
#               precision    recall  f1-score   support

#            0       0.60      0.68      0.64     20290
#            1       0.57      0.48      0.52     17670

#     accuracy                           0.59     37960
#    macro avg       0.59      0.58      0.58     37960
# weighted avg       0.59      0.59      0.59     37960

# roc_auc 0.6182

              precision    recall  f1-score   support

           0       0.67      0.97      0.80    123485
           1       0.52      0.06      0.11     61483

    accuracy                           0.67    184968
   macro avg       0.60      0.52      0.45    184968
weighted avg       0.62      0.67      0.57    184968

roc_auc 0.5954
Wall time: 50.7 s


In [17]:
%%time
## slow... ? 
clf_rf = RandomForestClassifier(n_jobs=-2,n_estimators=200,
    max_depth=14,    min_samples_split=3,    min_samples_leaf=2,ccp_alpha=0.05,class_weight="balanced")

cv_preds_2 = cross_val_predict(clf_rf,X_train,y_train,method="predict_proba",cv=3)[:,1]
print(classification_report(y_true=y_train,y_pred=cv_preds_2>0.5))
print("roc_auc %.4f" %roc_auc_score(y_true=y_train,y_score=cv_preds_2))

# # #### min 4 (joint text), v1, 371K records
#               precision    recall  f1-score   support

#            0       0.63      0.94      0.76    227115
#            1       0.61      0.14      0.23    144299

#     accuracy                           0.63    371414
#    macro avg       0.62      0.54      0.49    371414
# weighted avg       0.62      0.63      0.55    371414

# roc_auc 0.609
######
# ## 10 ,min on min6/80k
#               precision    recall  f1-score   support

#            0       0.64      0.78      0.70     50158
#            1       0.59      0.42      0.49     37673

#     accuracy                           0.63     87831
#    macro avg       0.62      0.60      0.60     87831
# weighted avg       0.62      0.63      0.61     87831

# roc_auc 0.65095
# Wall time: 9min 52s

### min 6, with delta instead of white card features: 
#               precision    recall  f1-score   support

#            0       0.64      0.76      0.70     50158
#            1       0.58      0.43      0.50     37673

#     accuracy                           0.62     87831
#    macro avg       0.61      0.60      0.60     87831
# weighted avg       0.62      0.62      0.61     87831

# roc_auc 0.64848
# Wall time: 14min 5s

# # ### min 6 + with some delta, diff, and B+W + cosine sim[0]
#               precision    recall  f1-score   support

#            0       0.64      0.77      0.70     50001
#            1       0.59      0.43      0.49     37830

#     accuracy                           0.62     87831
#    macro avg       0.61      0.60      0.60     87831
# weighted avg       0.62      0.62      0.61     87831

# roc_auc 0.6518

## min 6 (Above) + nli-distilroberta-base-v2:
#               precision    recall  f1-score   support

#            0       0.64      0.78      0.70     50001
#            1       0.59      0.43      0.49     37830

#     accuracy                           0.62     87831
#    macro avg       0.62      0.60      0.60     87831
# weighted avg       0.62      0.62      0.61     87831

# roc_auc 0.6520

# ### min 7
# ## with some delta, diff, and B+W + cosine sim[0]
#               precision    recall  f1-score   support

#            0       0.61      0.71      0.66     20290
#            1       0.59      0.48      0.53     17670

#     accuracy                           0.60     37960
#    macro avg       0.60      0.60      0.59     37960
# weighted avg       0.60      0.60      0.60     37960
# roc_auc 0.634



##TODO - feature importance (do the deltas improve model?)

# min 2 , L12 miniLM
#               precision    recall  f1-score   support

#            0       0.73      0.97      0.83    469455
#            1       0.53      0.10      0.17    183561

#     accuracy                           0.72    653016
#    macro avg       0.63      0.53      0.50    653016
# weighted avg       0.68      0.72      0.65    653016

# roc_auc 0.64871
# CPU times: user 2h 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.67      1.00      0.80    123485
           1       0.00      0.00      0.00     61483

    accuracy                           0.67    184968
   macro avg       0.33      0.50      0.40    184968
weighted avg       0.45      0.67      0.53    184968

roc_auc 0.5000
Wall time: 5min 10s


## Evaluate on true held out set
* Disjoint by cards

In [18]:
# Labels of the held-out (card-disjoint) rounds, in df_test row order.
won_eval = df_test["won"].astype(int)
y_eval = list(won_eval.values)
print("mean eval won: %.3f" % np.mean(y_eval))

mean eval won: 0.116


In [19]:
%%time
## 48 min
X_eval = model.encode(list(df_test["text"].values), 
                      normalize_embeddings=False,
                      show_progress_bar=True,
                      batch_size=64
                    #   convert_to_tensor=True
                    )

Batches:   0%|          | 0/4886 [00:00<?, ?it/s]

Wall time: 48min 29s


In [26]:
%%time
# clf = clf_rf
clf = make_pipeline(StandardScaler(),  LogisticRegression(solver="sag",class_weight="balanced"))
clf.fit(X_train,y_train)
eval_preds = clf.predict_proba(X_eval)[:,1]
print(classification_report(y_true=y_train,y_pred=cv_preds_2>0.5))
print("roc_auc %.4f" % roc_auc_score(y_true=y_train,y_score=cv_preds_2))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.67      1.00      0.80    123485
           1       0.00      0.00      0.00     61483

    accuracy                           0.67    184968
   macro avg       0.33      0.50      0.40    184968
weighted avg       0.45      0.67      0.53    184968

roc_auc 0.5000
Wall time: 1min 4s


In [27]:
%%time
# clf = clf_rf
clf = LogisticRegressionCV(solver="sag",class_weight="balanced",n_jobs=-2)
clf.fit(X_train,y_train)
eval_preds = clf.predict_proba(X_eval)[:,1]
print(classification_report(y_true=y_train,y_pred=cv_preds_2>0.5))
print("roc_auc %.4f" % roc_auc_score(y_true=y_train,y_score=cv_preds_2))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80    123485
           1       0.00      0.00      0.00     61483

    accuracy                           0.67    184968
   macro avg       0.33      0.50      0.40    184968
weighted avg       0.45      0.67      0.53    184968

roc_auc 0.5000
Wall time: 5min 50s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Round-level accuracy: within each round take the row with the highest
# predicted probability and check how often it is the winning card.
df_test["preds"] = eval_preds
top_pick_per_round = (
    df_test.sort_values(["preds"], ascending=False).groupby("fake_round_id").head(1)
)
print(top_pick_per_round["won"].mean())  # 0.134 in the recorded run

0.1345864835495133


In [30]:
# clf = clf_rf
# clf.fit(X_train,y_train)
# eval_preds = clf.predict_proba(X_eval)[:,1]
# df_test["preds"] = eval_preds
# print(df_test.sort_values(["preds"],ascending=False).groupby("fake_round_id").head(1)["won"].mean()) ## 0.115

0.1345864835495133


0.11578923030681372


In [21]:
# %%time
# ### could speed this up n^2 by getting embeddings only for unique combniations, then rejoining/merging...  (Won't work if using "Text" col)
# s1_emb = model.encode(list(df[USE_TEXT_COLS[0]].values),
#                       normalize_embeddings=True,
#                       show_progress_bar=True,
#                       batch_size=128
#                     #   convert_to_tensor=True
#                     )

# ## black cards = quesiton/prompt
# s2_emb = model.encode(list(df[USE_TEXT_COLS[1]].values),
#                       normalize_embeddings=True,
#                       show_progress_bar=True,
#                       batch_size=256
#                     #   convert_to_tensor=True
#                     )#.astype(np.float32) ## 1.5 min for 4K , with L12 miniLM, on cpu

# dot_sim = []
# for i in range (len(s1_emb)):
#     dot_sim.append(float(util.dot_score(s1_emb[i], s2_emb[i])[0]))
# print(len(dot_sim))

# df["dot_sim_score"] = dot_sim
# # df_all["dot_sim_score"] = dot_sim
# print(df[["dot_sim_score","won"]].corr())
# print("df:\n",df.groupby("won")["dot_sim_score"].mean())

# # print("df_All:")
# # print(df_all[["dot_sim_score","won"]].corr())


In [22]:
# dot_sim = []
# for i in range (len(s1_emb)):
#     dot_sim.append(float(util.dot_score(s1_emb[i], s2_emb[i])[0]))
# print(len(dot_sim))

# df["dot_sim_score"] = dot_sim
# # df_all["dot_sim_score"] = dot_sim
# print(df[["dot_sim_score","won"]].corr())
# print("df:\n",df.groupby("won")["dot_sim_score"].mean())


In [23]:
# print("df_all:\n",df_all.groupby("won")["dot_sim_score"].mean())


### try supervised model

* Contrastive loss?
* https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/quora_duplicate_questions/training_OnlineContrastiveLoss.py
* https://www.sbert.net/examples/training/quora_duplicate_questions/README.html#training


Can combine multiple losses:
* https://www.sbert.net/examples/training/quora_duplicate_questions/README.html

In [24]:
# Settings for a contrastive loss; the only visible use of these two names is
# the commented-out OnlineContrastiveLoss line in the fine-tuning cell.
# As distance metric, we use cosine distance (cosine_distance = 1 - cosine_similarity).
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

# Negative pairs should be at least `margin` apart.
# NOTE(review): the earlier comment here said "at least 0.3 (was 0.5 orig)",
# but the value actually assigned is 0.5 - confirm which one is intended.
margin = 0.5

# ####  Configure the training ####
# warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size * 0.1) # 10% of train data for warm-up
# print("Warmup-steps: {}".format(warmup_steps))


In [None]:
## Evaluator results before fine-tuning.
## NOTE(review): in the visible part of this notebook `dev_eval` is assigned only
## inside commented-out code (train/eval split cell), so it must be defined
## before this cell can run.
dev_eval(model) ## % with 7e-4 and min 4


In [None]:
# Fine-tune the sentence encoder with a cosine-similarity regression loss.
# NOTE(review): `train_samples` and `dev_eval` are assigned only inside
# commented-out code in the visible part of this notebook; both must be
# defined before this cell can run.
# NOTE(review): the commented `train_samples` sketches are lists of InputExample
# with one or two texts each - confirm the chosen loss accepts that shape.
train_dataset = SentencesDataset(train_samples, model=model)
# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
## OnlineContrastiveLoss. - expects pairs
# train_loss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)
train_loss = losses.CosineSimilarityLoss(model=model)


# Call the fit method: one epoch, constant learning rate, mixed precision,
# evaluating with `dev_eval` and saving to ./cah_sbert_cos.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    # weight_decay=0,
    scheduler='constantlr', 
    optimizer_params={'lr': 5e-4 ## alternative tried: 3e-5
                      },
    show_progress_bar=True,
    use_amp=True,
    evaluator=dev_eval,
    output_path='./cah_sbert_cos',
)


In [None]:
### Evaluator results after fine-tuning.
## Example: https://jovian.ai/vumichien/sbert
## NOTE(review): in the visible part of this notebook `dev_eval` is assigned only
## inside commented-out code, so it must be defined before this cell can run.
dev_eval(model) ## 0.555 accuracy (with min 6), 57% with 1e-3 and min 3


In [None]:
%%time
### could speed this up n^2 by getting embeddings only for unique combniations, then rejoining/merging...  (Won't work if using "Text" col)
s1_emb = model.encode(list(df[USE_TEXT_COLS[0]].values),
                      normalize_embeddings=False,
                      show_progress_bar=True,
                      batch_size=256,
                    #   convert_to_tensor=True
                    )

## black cards = quesiton/prompt
s2_emb = model.encode(list(df[USE_TEXT_COLS[1]].values),
                      normalize_embeddings=False,
                      show_progress_bar=True,
                      batch_size=256,
                    #   convert_to_tensor=True
                    )#.astype(np.float32) ## 1.5 min for 4K , with L12 miniLM, on cpu


In [None]:

# Row-wise dot-product and cosine similarity between the two embeddings.
# util.dot_score / util.cos_sim are called on one vector pair at a time and
# the first element of the result is converted to a float.
# TODO(review): original note asked whether `[0]` selects the intended cell.
dot_sim, cos_score = [], []
for emb_a, emb_b in zip(s1_emb, s2_emb):
    dot_sim.append(float(util.dot_score(emb_a, emb_b)[0]))
    cos_score.append(float(util.cos_sim(emb_a, emb_b)[0]))
print(len(dot_sim))

df["dot_sim_score"] = dot_sim
df["cos_sim_score"] = cos_score

# Correlation of each score with the label, then mean score per label.
sim_cols = ["dot_sim_score", "cos_sim_score"]
print(df[sim_cols + ["won"]].corr().round(3))

print("df:\n", df.groupby("won")[sim_cols].mean().round(3))

# print("df_All:")
# df_all["dot_sim_score"] = dot_sim
# print(df_all[["dot_sim_score","won"]].corr())


## Eval on df_Test
* Can speed up by getting unique sentence pairs and their embeddings
* Won't work the same for single text 

In [None]:
# df_test.drop_duplicates(["black_card_text","white_card_text"]).shape[0] # 367K rows, vs 489 K for all 

In [None]:
%%time
### could speed this up n^2 by getting embeddings only for unique combniations, then rejoining/merging...  (Won't work if using "Text" col)
s1_emb = model.encode(list(df_test[USE_TEXT_COLS[0]].values),
                      normalize_embeddings=False,
                      show_progress_bar=True,
                      batch_size=256,
                    #   convert_to_tensor=True
                    )

## black cards = quesiton/prompt
s2_emb = model.encode(list(df_test[USE_TEXT_COLS[1]].values),
                      normalize_embeddings=False,
                      show_progress_bar=True,
                      batch_size=256,
                    #   convert_to_tensor=True
                    )#.astype(np.float32) ## 1.5 min for 4K , with L12 miniLM, on cpu


In [None]:
%%time
dot_sim = []
cos_score = []
for i in range (len(s1_emb)):
    # dot_sim.append(util.dot_score(s1_emb[i], s2_emb[i])[0])
    dot_sim.append(float(util.dot_score(s1_emb[i], s2_emb[i])[0])) ## am I getting the right cell with these two?? (cos and dot, [i][j] ? )
    cos_score.append(float(util.cos_sim(s1_emb[i], s2_emb[i])[0]))
print(len(dot_sim))

df_test["dot_sim_score"] = dot_sim
df_test["cos_sim_score"] = cos_score
print(df_test[["dot_sim_score","cos_sim_score","won"]].corr().round(3))

print("df_test:\n",df_test.groupby("won")[["dot_sim_score","cos_sim_score"]].mean().round(3))


print("Acc:",df_test.sort_values("dot_sim_score",ascending=False).groupby("fake_round_id").head(1)["won"].mean()) ## 17.5% acc
print("Acc: (dot)",df_test.sort_values("dot_sim_score",ascending=False).groupby("fake_round_id").head(1)["won"].mean())
print("Acc:",df_test.sort_values("cos_sim_score",ascending=True).groupby("fake_round_id").head(1)["won"].mean()) ## 0.05
print("Acc:",df_test.sort_values("cos_sim_score",ascending=False).groupby("fake_round_id").head(1)["won"].mean())

### Pretrain - unsupervised
* TSDAE or other method ? 
* https://www.sbert.net/examples/unsupervised_learning/TSDAE/README.html#tsdae-as-pre-training-task


ST recommends MultipleNegativesRankingLoss
* https://www.sbert.net/examples/training/nli/README.html#multiplenegativesrankingloss
*  MultipleNegativesRankingLoss only requires positive pairs, i.e., we only need examples of positive/funny pairs. (BUT It also supports hard negatives in a triplet)

Also: OnlineContrastiveLoss
* https://www.sbert.net/examples/training/quora_duplicate_questions/README.html#training
* Contrastive Loss / `losses.OnlineContrastiveLoss`
* Choosing the distance function and especially choosing a sensible margin are quite important for the success of contrastive loss. In the given example, we use cosine_distance (which is 1-cosine_similarity) with a margin of 0.5. I.e., non-duplicate questions should have a cosine_distance of at least 0.5 (which is equivalent to a 0.5 cosine similarity difference).
* An improved version of contrastive loss is OnlineContrastiveLoss, which looks at which negative pairs have a lower distance than the largest positive pair and which positive pairs have a higher distance than the lowest distance of negative pairs. I.e., this loss automatically detects the hard cases in a batch and computes the loss only for these cases.

Can also do BOTH losses, as in:

* Multi-Task-Learning
    Contrastive Loss works well for pair classification, i.e., given two pairs, are these duplicates or not. It pushes negative pairs far away in vector space, so that distinguishing between duplicate and non-duplicate pairs works well.

    MultipleNegativesRankingLoss, on the other hand, mainly reduces the distance between positive pairs out of a large set of possible candidates. However, the distance between non-duplicate questions is not so large, so this loss does not work that well for pair classification.

* https://www.sbert.net/examples/training/quora_duplicate_questions/README.html#multi-task-learning

More losses (e.g. triplet): https://www.sbert.net/docs/package_reference/losses.html

In [None]:
# # Define your sentence transformer model using CLS pooling
# model_name = 'bert-base-uncased'
# word_embedding_model = models.Transformer(model_name)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# # Define a list with sentences (1k - 100k sentences)
# train_sentences = ["Your set of sentences",
#                    "Model will automatically add the noise", 
#                    "And re-construct it",
#                    "You should provide at least 1k sentences"]

# # Create the special denoising dataset that adds noise on-the-fly
# train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)

# # DataLoader to batch your data
# train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# # Use the denoising auto-encoder loss
# train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

# # Call the fit method
# model.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     epochs=1,
#     weight_decay=0,
#     scheduler='constantlr',
#     optimizer_params={'lr': 3e-5},
#     show_progress_bar=True
# )

# model.save('output/tsdae-model')

#### get embeddings & dot product/cosine distance over all

* `util.semantic_search` - could do this in one step for us (but we'll want to train a model on embeddings anyway)

* https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
# %%time
# ## white cards = most predictive information

# # ### encode_multi_process ? (needs pool enabled)
# ##     pool = model.start_multi_process_pool(encode_batch_size=2000)
# ##     embeddings = model.encode_multi_process(texts, pool)

# s1_emb = model.encode(list(df["white_card_text"].values),
#                       normalize_embeddings=True,
#                       show_progress_bar=True,
#                       batch_size=128,
#                     #   convert_to_tensor=True 
#                       )#.astype(np.float32) ## 40s for 4K , with L12 miniLM, on cpu
# ### 7 min for all min7 (37k) with L12
# print(s1_emb.shape,"s1_emb.shape")

In [None]:
# %%time
# ### 8.7 min on colab for L12, min 4 (371K)

# s1_emb = model.encode(list(df["white_card_text"].values),
#                       normalize_embeddings=True,
#                       show_progress_bar=True,
#                       batch_size=128,
#                       normalize_embeddings=True
#                     #   convert_to_tensor=True
#                     )

# ## black cards = quesiton/prompt
# s2_emb = model.encode(list(df["black_card_text"].values),
#                       normalize_embeddings=True,
#                       show_progress_bar=True,
#                       batch_size=256,
#                       normalize_embeddings=True
#                     #   convert_to_tensor=True
#                     )#.astype(np.float32) ## 1.5 min for 4K , with L12 miniLM, on cpu
# print(s2_emb.shape)


# dot_sim = []
# for i in range (len(s1_emb)):
#     dot_sim.append(util.dot_score(s1_emb[i], s2_emb[i])[0])
# print(len(dot_sim))
# dot_sim

In [None]:
# len(s2_emb[0]) # 384 = dim of the "first" sample

In [None]:
# ### Hopefully I am doing this correctly ??? 
# ### get cosine similarity 
# output = []
# for i in range (len(s1_emb)):
#     output.append(util.cos_sim(s1_emb[i], s2_emb[i])[0])
# print(len(output))

In [None]:
%%time
#Compute cosine similarity between all pairs - outputs matrix of shape S1 X S2 (i.e # samples = ineffecient in memory!)
# output = util.pytorch_cos_sim(s1_emb, s2_emb)
# output = util.cos_sim(s1_emb, s2_emb) # ORIG, nXn matrix
# print(output.shape)


In [None]:
# %%time
# ### Hopefully I am doing this correctly ??? 
# # # ## dot product:
# # dot_sim = util.dot_score(s1_emb, s2_emb) # memory crash? 
# # print(dot_sim.shape)

# dot_sim = []
# for i in range (len(s1_emb)):
#     dot_sim.append(util.dot_score(s1_emb[i], s2_emb[i])[0])
# print(len(dot_sim))
# dot_sim

In [None]:
# print("cos done")

In [None]:
%%time
# from torch.nn import CosineSimilarity, PairwiseDistance
# ## https://pytorch.org/docs/stable/generated/torch.nn.CosineSimilarity.html
# nn_cos = CosineSimilarity(dim=1, eps=1e-6)
# output = nn_cos(s1_emb, s2_emb) 
# print("output",output.shape)
# nn_pairwiseDist = PairwiseDistance()
# output_2 = nn_pairwiseDist(s1_emb, s2_emb)
# print("output_2",output_2.shape)

# # ## dot product:
# output_3 = util.dot_score(s1_emb, s2_emb)

In [None]:
# df["cos_sim"] = output
# df["pairwiseDist_sim"] = output_2
# df["dot_score"] = output_3

In [None]:
# df.corr()

In [None]:
# df.groupby("picks")[["cos_sim","pairwiseDist_sim"]].mean()

#### Model on embeddings 
* linear model on embeddings per sentence and cossim, pairwise dist score;
* + difference, +- multiplication of vectors

In [None]:
# %%time
# ### mean diff, math/ mult per row: 

# ## https://stackoverflow.com/questions/50430585/mean-difference-of-two-numpy-arrays
# # np.mean(np.abs(s1_emb[:, None] - s2_emb))
# vector_diffs = s1_emb - s2_emb
# mean_diff = np.mean(vector_diffs,axis=1) # 1 col
# max_diff = np.min(vector_diffs,axis=1) 
# min_diff = np.max(vector_diffs,axis=1) 
# # np_dot = np.dot(s1_emb,s2_emb)
# print(mean_diff)
# print(max_diff)

In [None]:
# Target vector: the `picks` column of df.
y = df["picks"].values
# X = np.concatenate([s1_emb,s2_emb],axis=1) # BOTH 
# Feature matrix: s2_emb only; the concatenation of both embeddings above is disabled.
# NOTE(review): the most recent visible s2_emb assignment encodes df_test, not df —
# confirm that X and y have the same number of rows before fitting anything on them.
X = s2_emb  # s2_emb only
# X = np.concatenate([s1_emb,vector_diffs],axis=1)## ALT - white cards + diffs

print(X.shape)

In [None]:
# X = np.column_stack((X,output)) ### memory crash ? 
# X = np.column_stack((X,dot_sim)) ## dot product 

# # X = np.column_stack((X,output_2))
# # X = np.column_stack((X,output_3))
# X = np.column_stack((X,mean_diff))
# X = np.column_stack((X,max_diff))
# X = np.column_stack((X,min_diff))
# print(X.shape)

In [None]:
# Largest number of whitespace-separated tokens found in any `text` value.
df["text"].str.split().str.len().max()

In [None]:
# Hold out 25% of the (text, picks) rows; the fixed random_state keeps the split reproducible.
# `train_test_split` is already imported in the notebook's import cell, so it is not
# re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    list(df["text"].values),
    list(df["picks"].values),
    test_size=0.25,
    random_state=42,
)

In [None]:
# Distinct label values present in the training split.
set(y_train)

In [None]:
# Try finetuning? 

In [None]:
### https://www.sbert.net/docs/training/overview.html
from torch import nn

# Alternative (disabled): build the encoder from scratch instead of reusing `model`.
# word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Projection head stacked on top of the already-loaded `model`: sentence embedding -> 128 dims, Tanh.
# The input width is read from `model` itself. The former `pooling_model` reference is only
# assigned in commented-out code in the visible notebook, so it raised NameError on a fresh
# kernel, and `model` is the module the head is actually attached to.
dense_model = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=128,
    activation_function=nn.Tanh(),
)

# model2 = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])
model2 = SentenceTransformer(modules=[model, dense_model])

In [None]:
# Number of training labels.
len(y_train)

In [None]:
# One single-text InputExample per training row, labelled with that row's `picks` value as a float.
# zip() walks X_train and y_train together so every row is used; the previous
# range(len(X_train)-1) indexing silently dropped the last training example.
train_examples = [
    InputExample(texts=[text], label=float(label))
    for text, label in zip(X_train, y_train)
]

In [None]:
# Debug check: this range covers indices 0 .. len(X_train)-2, i.e. it stops one short of X_train's last index.
range(len(X_train)-1)

In [None]:
# Peek at the first training text.
X_train[0]

In [None]:
# Number of training labels (compare with len(X_train) and len(train_examples)).
len(y_train)

In [None]:
# Number of training texts.
len(X_train)

In [None]:
# Number of InputExample objects built from the training split.
len(train_examples)

In [None]:
### IndexError: list index out of range. ? 
# NOTE(review): likely cause — the disabled CosineSimilarityLoss below works on sentence
# pairs, while each entry of train_examples carries a single text. Confirm before re-enabling.


#Define your train examples. You need more than just two examples...
# train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
#     InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]

# train_examples =  [X_train[0:5],y_train[0:5]]

#Define your train dataset, the dataloader and the train loss
# This dataloader wraps the raw train_examples list in shuffled batches of 64.
# The next cell assigns train_dataloader again before any training runs.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
# train_loss = losses.CosineSimilarityLoss(model)

# #Tune the model
# model2.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)

In [None]:
### https://www.sbert.net/docs/package_reference/losses.html#batchsemihardtripletloss
## This loss works on single (sentence, label) examples rather than sentence pairs.
# NOTE(review): the loss documentation describes integer class labels, while
# train_examples are built with float labels — confirm this is intended.
train_loss = losses.BatchSemiHardTripletLoss(model=model2)

# Wrap the examples for model2 and serve them in shuffled batches of 256.
train_dataset = SentencesDataset(train_examples, model2)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)


# Fine-tune model2 for a single epoch with 50 warm-up steps.
model2.fit(
    train_objectives=[(train_dataloader, train_loss)],
    warmup_steps=50,
    epochs=1,
)

## Perform Semantic Search