In [None]:
import pandas as pd
# Load the table of user/candidate pairs. Later cells read its id_user, id_matched
# and isMatched columns, the *_description_* text columns and the encoded feature columns.
matchData=pd.read_csv("matchedVector_Oct.csv")
# Bare last expression so the notebook renders the frame.
matchData

In [None]:
print(matchData['id_user'].nunique()) # number of distinct users (unique id_user values)
print(matchData['id_matched'].nunique()) # number of distinct candidates (unique id_matched values)

In [None]:
# Initialize the language model: a multilingual sentence encoder loaded from TF Hub.
import tensorflow_hub as hub
import numpy as np
# Not referenced by name in the cells shown here, so it is imported for its side
# effects only; presumably it registers the text ops the encoder needs.
# TODO(review): confirm against the model page.
import tensorflow_text

# Smoke test on three Japanese sentences of increasing length
# ("dog", "puppies are nice", "I like to walk on the beach with my dog").
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]
# `embed` is reused by the feature-building cell to vectorize the free-text columns.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
ja_result = embed(japanese_sentences)
# Bare last expression so the notebook renders the embedding tensor.
ja_result

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

#select features (include everything but text)
# Label-based column slices: every column between the two named endpoints, inclusive.
user_X=matchData.loc[:,"gender_公開したくない_user":"coping_strategy_飲酒_user"]
target_X=matchData.loc[:,"gender_公開したくない_matched":"coping_strategy_飲酒_matched"]
user_X_Id=matchData.loc[:,["id_user"]]
isMatched=matchData.loc[:,["isMatched"]]


def _embed_text_column(text_column, column_prefix):
    """Embed one free-text column and return ``(column_names, vector_frame)``.

    Parameters
    ----------
    text_column : pandas.Series
        Text to pass to the module-level ``embed`` model.
    column_prefix : str
        Prefix of the generated column names; the vector position is appended.

    Returns
    -------
    tuple(list[str], pandas.DataFrame)
        One name and one column per embedding dimension. The width is read from
        the model output instead of being hard-coded (it used to be fixed at 512).
    """
    vectors = embed(text_column).numpy()
    column_names = [column_prefix+str(num) for num in range(vectors.shape[1])]
    # Carry the source index: the axis=1 concat below aligns rows on the index,
    # so a fresh RangeIndex would only line up if matchData had never been
    # filtered or re-indexed.
    return column_names, pd.DataFrame(vectors,columns=column_names,index=text_column.index)


#vectorize the languages
loss_description_user=matchData.loc[:,"loss_description_user"]
loss_vector_col_name, loss_vector = _embed_text_column(loss_description_user,"loss_user_")

loss_description_matched=matchData.loc[:,"loss_description_matched"]
loss_vector_match_col_name, loss_vector_matched = _embed_text_column(loss_description_matched,"loss_user_matched_")

hobbies_description=matchData.loc[:,"hobbies_description_user"]
hobbies_col_name, hobbies_vector = _embed_text_column(hobbies_description,"hobbies_description_")

hobbies_description_matched=matchData.loc[:,"hobbies_description_matched"]
hobbies_col_name_matched, hobbies_vector_matched = _embed_text_column(hobbies_description_matched,"hobbies_description_matched_")


# Column order matters: all user-side columns first, then the matched-side columns in
# the same order. testDLSiamese cuts the feature matrix in half by position, so both
# sides must contribute the same number of columns. id_user and isMatched go last and
# are dropped by testDLSiamese before training.
matchEncodedData=pd.concat([
    user_X,loss_vector,hobbies_vector,
    target_X,loss_vector_matched,hobbies_vector_matched,
    user_X_Id,
    isMatched
]
    ,axis=1)

matchEncodedData

In [None]:
import random

def getExistingTrainingToTest(train_index,test_index,ratio,numMatches,rng=None):
    """Pick a random share of every held-out group's rows to move out of the test fold.

    ``test_index`` is read as consecutive groups of ``numMatches`` entries. From each
    group ``round(group_size * ratio)`` entries are drawn without replacement.

    A trailing group shorter than ``numMatches`` is sampled in proportion to its own
    size. The previous ``round(len(test_index) / numMatches)`` group count could
    either skip such a group or index past the end of ``test_index``.

    Parameters
    ----------
    train_index : sequence
        Training positions; returned unchanged as a list.
    test_index : sequence
        Held-out positions, grouped in blocks of ``numMatches``.
    ratio : float
        Share of each group to move (0 moves nothing, 1 moves everything).
    numMatches : int
        Number of rows per group.
    rng : object with a ``sample(population, k)`` method, optional
        Source of randomness, e.g. ``random.Random(seed)`` for a reproducible draw.
        Defaults to the global ``random`` module.

    Returns
    -------
    tuple(list, list, list)
        ``(new_train_index, move_index, new_test_index)``. ``move_index`` holds the
        drawn entries in draw order; ``new_test_index`` holds the remaining test
        entries in their original order.
    """
    sampler = random if rng is None else rng

    moveList=[]
    for start in range(0,len(test_index),numMatches):
        groupSize=min(numMatches,len(test_index)-start)
        # For a full group this equals round(numMatches*ratio), as before.
        pickNum=round(groupSize*ratio)
        chosenOffsets=sampler.sample(range(groupSize),pickNum)
        moveList.extend(test_index[start+offset] for offset in chosenOffsets)

    # Set membership keeps the filter linear instead of scanning the list per element.
    moved=set(moveList)
    new_test_index=[x for x in test_index if x not in moved]
    new_train_index=list(train_index)
    return new_train_index,moveList,new_test_index

In [None]:
import random
import pickle
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Flatten,Input,Lambda
from keras.metrics import AUC

from keras import backend as K

def testDLSiamese(X_train, X_test, Y_train, groupString):
    """Fit a two-branch dense network on the training pairs and score the test pairs.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that still carry the ``groupString`` column and an
        ``isMatched`` column; both are dropped here. The remaining columns are
        split by position: the first half feeds branch 1, the second half branch 2.
    Y_train : array-like
        Training labels, converted with ``np.array``.
    groupString : str
        Name of the group-id column to drop.

    Returns
    -------
    The output of ``model.predict`` for the test rows (sigmoid scores).

    Notes
    -----
    * Each branch builds its own Dense layers, so the two branches do not share weights.
    * Per the Keras ``fit`` documentation, ``validation_split`` holds out the *last*
      rows of the arrays it is given (before shuffling), so the final 20% of
      ``X_train`` is used for validation metrics only.
    """
    # Keep feature columns only, then switch to plain arrays.
    train_features = np.array(X_train.drop(groupString, axis=1).drop("isMatched", axis=1))
    test_features = np.array(X_test.drop(groupString, axis=1).drop("isMatched", axis=1))
    labels = np.array(Y_train)

    # Width of one side of the pair, taken from the first training row.
    half_size = round(len(train_features[0]) / 2)

    def dense_branch():
        """Input -> three Dense(relu)+Dropout(0.4) stages of 512, 32 and 8 units."""
        branch_input = Input(shape=(half_size,))
        hidden = branch_input
        for units in (512, 32, 8):
            hidden = Dense(units, activation='relu')(hidden)
            hidden = Dropout(0.4)(hidden)
        return branch_input, hidden

    input_1, layer_1 = dense_branch()
    input_2, layer_2 = dense_branch()

    def l1_norm(branch_outputs):
        # One minus the element-wise absolute difference of the two branch outputs.
        return 1 - K.abs(branch_outputs[0] - branch_outputs[1])

    merged = Lambda(
        function=l1_norm,
        output_shape=lambda shapes: shapes[0],
        name='L1_distance',
    )([layer_1, layer_2])
    predictions = Dense(1, activation='sigmoid', name='classification_layer')(merged)

    model = Model([input_1, input_2], predictions)
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=[AUC(name='auc')])
    model.fit(
        [train_features[:, :half_size], train_features[:, half_size:]],
        labels,
        epochs=20,
        batch_size=8,
        validation_split=0.2,
    )

    return model.predict([test_features[:, :half_size], test_features[:, half_size:]])

In [None]:
from collections import defaultdict

from surprise import Dataset
from surprise import SVD
from surprise.model_selection import KFold
from surprise import Reader
from surprise import accuracy

def NormalizeData(data):
    """Min-max scale ``data`` to the range [0, 1].

    Parameters
    ----------
    data : array-like of numbers

    Returns
    -------
    ``(data - min) / (max - min)`` element-wise.

    Edge cases
    ----------
    * All values equal: every element maps to 0. The zero range is replaced by 1,
      the same convention scikit-learn's MinMaxScaler uses for constant features,
      instead of producing NaN from 0/0.
    * Empty input: an empty float array is returned instead of ``np.min`` raising.
    """
    if np.size(data) == 0:
        return np.array([], dtype=float)

    lowest = np.min(data)
    spread = np.max(data) - lowest
    if spread == 0:
        # Constant input: data - lowest is already all zeros, so any non-zero
        # divisor yields the zero vector without a 0/0.
        spread = 1
    return (data - lowest) / spread

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k per user.

    Parameters
    ----------
    predictions : mapping of uid -> list of ``(estimate, true_value)`` pairs
    k : int
        Number of top-ranked items inspected per user.
    threshold : float
        Cut-off applied to both the rescaled estimates and the true values.
        Estimates are rescaled per user with ``NormalizeData`` before the
        comparison; true values are compared as given.

    Returns
    -------
    tuple(dict, dict)
        ``(precisions, recalls)`` keyed by uid. Precision is 0 when nothing in the
        top k reaches the threshold. Recall is -1 (a sentinel, not 0) when the
        user has no item whose true value reaches the threshold.

    Each user's ranked ``(rescaled_estimate, true_value)`` list is printed.
    """
    precisions = {}
    recalls = {}

    for uid, user_ratings in predictions.items():
        # Rescale this user's estimates and pair each one with its unchanged true value.
        rescaled = NormalizeData([pair[0] for pair in user_ratings])
        ranked = [(rescaled[position], pair[1]) for position, pair in enumerate(user_ratings)]

        # Highest estimate first.
        ranked.sort(key=lambda pair: pair[0], reverse=True)
        print(ranked)

        top_k = ranked[:k]

        # Relevant items overall, recommended items in the top k, and their overlap.
        n_rel = sum((true_r >= threshold) for (_, true_r) in ranked)
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Precision@K: share of recommended items that are relevant (0 if none recommended).
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@K: share of relevant items that are recommended (-1 if none relevant).
        if n_rel == 0:
            recalls[uid] = -1
        else:
            recalls[uid] = n_rel_and_rec_k / n_rel

    return precisions, recalls

In [None]:
#for parameter optimization
from sklearn.model_selection import GridSearchCV

#for evaluation 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import LeaveOneGroupOut

from sklearn import preprocessing
from collections import defaultdict

import numpy as np

# Ignore all warnings.
# NOTE: this silences every warning category for the rest of the kernel session,
# including numerical RuntimeWarnings (e.g. numpy divide-by-zero / invalid value)
# and deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings("ignore")

# Only ERROR-level and above messages from the "tensorflow" logger are emitted.
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

from statistics import mean

def testEverythingGroupSIAM(X,Y,groupString,k_value,ratio,numMatches=51):
    """Leave-one-group-out evaluation of testDLSiamese, reported as precision@k / recall@k.

    For every group in ``X[groupString]`` the group's rows are held out. A share
    ``ratio`` of them is moved into the training data (getExistingTrainingToTest),
    a model is trained (testDLSiamese), the remaining held-out rows are scored, and
    precision/recall at ``k_value`` are computed with a 0.5 threshold.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, including the ``groupString`` column.
    Y : pandas.DataFrame
        Labels with an ``isMatched`` column, row-aligned with ``X``.
    groupString : str
        Column holding the group (user) id.
    k_value : int
        Cut-off rank for precision@k / recall@k.
    ratio : float
        Share of each held-out group's rows moved into the training data.
    numMatches : int, default 51
        Rows per group, passed on to getExistingTrainingToTest. This was
        previously hard-coded to 51.

    Prints per-fold precision and recall, then the per-fold lists and their means.
    Folds whose value is -1 are excluded from the means; precision_recall_at_k uses
    -1 as the recall of a user with no relevant held-out rows. NaN is printed when
    no fold qualifies.
    """
    group=X[groupString].values
    skf = LeaveOneGroupOut()
    print(skf)

    siam_p_k=[]
    siam_r_k=[]

    for train_index, test_index in skf.split(X,Y,group):
        train_index,move_index,test_index=getExistingTrainingToTest(train_index,test_index,ratio,numMatches)

        if len(test_index) == 0:
            # Every held-out row was moved into training; nothing is left to score.
            continue

        # The splitter yields row positions, so select by position (iloc) rather
        # than matching them against index labels. Sorting keeps the moved rows in
        # frame order.
        move_index=sorted(move_index)
        X_train_moved=X.iloc[move_index]
        Y_train_moved=Y.iloc[move_index]
        X_test = X.iloc[test_index]
        Y_test = Y.iloc[test_index]

        # Moved rows go FIRST. testDLSiamese fits with validation_split=0.2 and
        # Keras takes the validation rows from the end of the arrays, so rows
        # appended last would be held out for validation and never trained on,
        # which would make `ratio` a no-op.
        X_train=pd.concat([X_train_moved,X.iloc[train_index]])
        Y_train=pd.concat([Y_train_moved,Y.iloc[train_index]])

        y_pred= testDLSiamese(X_train,X_test,Y_train,groupString)

        # precision_recall_at_k expects {uid: [(predicted, true), ...]}.
        # Column arrays are read directly instead of iterrows(), which builds one
        # wide Series per row and can upcast the id to float.
        prec_cal=defaultdict(list)
        for uid,pred_row,true_r in zip(X_test[groupString].values,y_pred,Y_test['isMatched'].values):
            prec_cal[uid].append((pred_row[0],true_r))

        precisions, recalls = precision_recall_at_k(prec_cal, k=k_value, threshold=0.5)
        fold_precision=sum(prec for prec in precisions.values()) / len(precisions)
        fold_recall=sum(rec for rec in recalls.values()) / len(recalls)
        print(fold_precision)
        print(fold_recall)
        siam_p_k.append(fold_precision)
        siam_r_k.append(fold_recall)


    valid_p=[x for x in siam_p_k  if x != -1]
    valid_r=[x for x in siam_r_k  if x != -1]

    print("Siam")
    print(siam_p_k)
    print(mean(valid_p) if valid_p else float("nan"))
    print(siam_r_k)
    print(mean(valid_r) if valid_r else float("nan"))

In [None]:
# Feature table (still carrying the id_user and isMatched columns, which
# testDLSiamese drops before training) and the label frame.
X=matchEncodedData
Y=isMatched

# Baseline: k=5, ratio=0 (no held-out rows are moved into the training data).
testEverythingGroupSIAM(X,Y,"id_user",5,0)

In [None]:
# k=5, ratio=0.1 (share of each held-out group's rows moved into the training data).
testEverythingGroupSIAM(X,Y,"id_user",5,0.1)

In [None]:
# k=5, ratio=0.25.
testEverythingGroupSIAM(X,Y,"id_user",5,0.25)

In [None]:
# k=5, ratio=0.5.
testEverythingGroupSIAM(X,Y,"id_user",5,0.5)

In [None]:
# k=5, ratio=0.75.
testEverythingGroupSIAM(X,Y,"id_user",5,0.75)

In [None]:
# Baseline at k=10, ratio=0 (no held-out rows are moved into the training data).
testEverythingGroupSIAM(X,Y,"id_user",10,0)

In [None]:
# k=10, ratio=0.1.
testEverythingGroupSIAM(X,Y,"id_user",10,0.1)

In [None]:
# k=10, ratio=0.25.
testEverythingGroupSIAM(X,Y,"id_user",10,0.25)

In [None]:
# k=10, ratio=0.5.
testEverythingGroupSIAM(X,Y,"id_user",10,0.5)

In [None]:
# k=10, ratio=0.75.
testEverythingGroupSIAM(X,Y,"id_user",10,0.75)