# PCA and output the tensor

In [10]:
from sklearn.decomposition import PCA 
import pickle
from tqdm import tqdm
import os

In [17]:
import numpy as np

In [11]:
pickle_file='../data/src_data_NEW_small_0428/train_ref.txtemb.pkl'
n_components=384
tensor_file='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_rand_model_19.s3cmtf_train.tensor'




def pca_pairs(pickle_file,n_components):
    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file, "rb") as f:
        pairs = pickle.load(f)

    # 2. build the all_embeddings_nparray
    print('# 2. build the all_embeddings_nparray')
    all_embeddings=[]
    for pair in pairs:
        all_embeddings.append(pair['res_emb'])
    all_embeddings_nparray=np.array(all_embeddings)

    # 3. PCA
    print('# 3. PCA')
    pca=PCA(n_components=n_components)
    pca_all_embeddings_nparray = pca.fit_transform(all_embeddings_nparray)
    print('var %:',np.sum(pca.explained_variance_ratio_))

    # 4. get the new emb
    print('# 4. get the new emb')
    for pair in pairs:
        pair['pca_res_emb']=pca_all_embeddings_nparray[pair['res_emb_idx']]
    
    return pairs

In [12]:
def pca_pairs_two_file(pickle_file,pickle_file_2,n_components):
    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file, "rb") as f:
        pairs = pickle.load(f)

    # 2. build the all_embeddings_nparray
    print('# 2. build the all_embeddings_nparray')
    all_embeddings=[]
    for pair in pairs:
        all_embeddings.append(pair['res_emb'])
    all_embeddings_nparray=np.array(all_embeddings)

    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file_2, "rb") as f:
        pairs_2 = pickle.load(f)

    # 2. build the all_embeddings_nparray
    print('# 2. build the all_embeddings_nparray')
    all_embeddings_2=[]
    for pair in pairs_2:
        all_embeddings_2.append(pair['res_emb'])
    all_embeddings_nparray_2=np.array(all_embeddings_2)

    # 3. PCA
    all_embeddings_nparray_both=np.concatenate((all_embeddings_nparray, all_embeddings_nparray_2), axis=0)
    print('# 3. PCA')
    pca=PCA(n_components=n_components)
    pca_all_embeddings_nparray = pca.fit_transform(all_embeddings_nparray_both)
    print('var %:',np.sum(pca.explained_variance_ratio_))

    # 4. get the new emb
    print('# 4. get the new emb')
    for pair in pairs:
        pair['pca_res_emb']=pca_all_embeddings_nparray[pair['res_emb_idx']]
    print('# 4. get the new emb')
    for pair in pairs_2:
        pair['pca_res_emb']=pca_all_embeddings_nparray[pair['res_emb_idx']+all_embeddings_nparray.shape[0]]
    
    
    return pairs+pairs_2

In [13]:
def pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components):
    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file, "rb") as f:
        pairs = pickle.load(f)

    # 2. build the all_embeddings_nparray
    print('# 2. build the all_embeddings_nparray')
    all_embeddings=[]
    for pair in pairs:
        all_embeddings.append(pair['res_emb'])
    all_embeddings_nparray=np.array(all_embeddings)

    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file_2, "rb") as f:
        pairs_2 = pickle.load(f)

    # 2. build the all_embeddings_nparray
    print('# 2. build the all_embeddings_nparray')
    all_embeddings_2=[]
    for pair in pairs_2:
        all_embeddings_2.append(pair['res_emb'])
    all_embeddings_nparray_2=np.array(all_embeddings_2)
    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file_3, "rb") as f:
        pairs_3 = pickle.load(f)

    # 2. build the all_embeddings_nparray
    print('# 2. build the all_embeddings_nparray')
    all_embeddings_3=[]
    for pair in pairs_3:
        all_embeddings_3.append(pair['res_emb'])
    all_embeddings_nparray_3=np.array(all_embeddings_3)

    # 3. PCA
    all_embeddings_nparray_both=np.concatenate((all_embeddings_nparray, all_embeddings_nparray_2,all_embeddings_nparray_3), axis=0)
    print('# 3. PCA')
    pca=PCA(n_components=n_components)
    pca_all_embeddings_nparray = pca.fit_transform(all_embeddings_nparray_both)
    print('var %:',np.sum(pca.explained_variance_ratio_))

    # 4. get the new emb
    print('# 4. get the new emb')
    for pair in pairs:
        pair['pca_res_emb']=pca_all_embeddings_nparray[pair['res_emb_idx']]
    print('# 4. get the new emb')
    for pair in pairs_2:
        pair['pca_res_emb']=pca_all_embeddings_nparray[pair['res_emb_idx']+all_embeddings_nparray.shape[0]]
    
    print('# 4. get the new emb')
    for pair in pairs_3:
        pair['pca_res_emb']=pca_all_embeddings_nparray[pair['res_emb_idx']+all_embeddings_nparray.shape[0]+all_embeddings_nparray_2.shape[0]]
    
    
    return pairs,pairs_2,pairs_3

In [23]:
def output_tensor(pairs,tensor_file):
    # 5. output the tensor.
    num_entries=0
    max_context_id=0
    max_user_id=0
    with open(tensor_file,'w') as fout:
        for pair in tqdm(pairs):
            context_tensor_id = pair['cid']
            if context_tensor_id > max_context_id:
                max_context_id=context_tensor_id
            
            user_idx = pair['pid']
            if user_idx>max_user_id:
                max_user_id=user_idx
            emb=pair['pca_res_emb']
            emb_dim=len(emb)
            for k in range(emb_dim):
                value = emb[k]
                fout.write('{}\t{}\t{}\t{}\n'.format(int(context_tensor_id),
                                                     int(user_idx),k+1,value))
                num_entries+=1    
    # 6. output the statistics of the tensor
    print('finished tensor at',tensor_file)
    print('tensor shape,',max_context_id,max_user_id,emb_dim,'num entries:',num_entries)
    return max_context_id,max_user_id,emb_dim,num_entries

In [31]:

def output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line):
    # output tensor
    tensor_file=os.path.join(base_dir,'s3cmtf_train.tensor')
    max_context_id,max_user_id,emb_dim,train_num_entries = output_tensor(pairs+pairs_2,tensor_file)

    tensor_file=os.path.join(base_dir,'s3cmtf_dev.tensor')
    _,_,_,dev_num_entries=output_tensor(pairs_3,tensor_file)

    # build the config file
    config_fing=os.path.join(base_dir,'s3cmtf_config.txt')
    with open(config_fing,'w') as fout:
        fout.write('3\n')
        fout.write('{} {} {}\n'.format(max_context_id,max_user_id,emb_dim))
        fout.write('{} {} {}\n'.format(dim1,dim2,dim3))
        fout.write('{}\n'.format(num_thread))
        fout.write('{}\n'.format(train_num_entries))
        fout.write('{}\n'.format(dev_num_entries))
        fout.write(user_feature_line)
        fout.write(context_feature_line)
    print('save config file at',config_fing)
    print('base dir: ',base_dir)


In [30]:

pickle_file='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/train_ref.txt.emb.pkl'
pickle_file_2='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_augmentation_ref_0.00045.txt.emb.pkl'
n_components=280
pairs=pca_pairs_two_file(pickle_file,pickle_file_2,n_components)


tensor_file='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_rand_model_19.s3cmtf_train.tensor'
output_tensor(pairs,tensor_file)

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.96534324284166
# 4. get the new emb
# 4. get the new emb


100%|██████████| 1742560/1742560 [13:08<00:00, 2208.76it/s]


finished tensor at ../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_rand_model_19.s3cmtf_train.tensor
tensor shape, 376443 5185 280 num entries: 487916800


In [41]:

pickle_file='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/train_ref.txt.emb.pkl'
pickle_file_2='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_augmentation_ref_0.00045.txt.emb.pkl'
pickle_file_3='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/val_ref.txt.emb.pkl'
n_components=280
pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)



# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9653652504763401
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


In [42]:
len(pairs_3)

5185

In [43]:
tensor_file='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_rand_model_19.s3cmtf_train.tensor'
output_tensor(pairs+pairs_2,tensor_file)

100%|██████████| 1742560/1742560 [14:31<00:00, 1999.94it/s]


finished tensor at ../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_rand_model_19.s3cmtf_train.tensor
tensor shape, 376443 5185 280 num entries: 487916800


In [44]:
tensor_file='../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_rand_model_19.s3cmtf_dev.tensor'
output_tensor(pairs_3,tensor_file)

100%|██████████| 5185/5185 [00:02<00:00, 2270.94it/s]

finished tensor at ../../S3CMTF_code/src_data_NEW_0429_PCA_dim_384/aug_rand_model_19.s3cmtf_dev.tensor
tensor shape, 376400 5185 280 num entries: 1451800





In [29]:
500000000/1742560

286.9341658249931

In [None]:
# there is still a problem here. 

# 200K dataaset

In [29]:
base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x1'
aug_file='aug_speaker+question_TF_40_model_8_round_1_x1_augmentation_ref_0.001.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA


  0%|          | 0/356048 [00:00<?, ?it/s]

var %: 0.9563737821021712
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 356048/356048 [02:13<00:00, 2668.56it/s]
  6%|▌         | 260/4724 [00:00<00:01, 2599.05it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x1/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 85451520


100%|██████████| 4724/4724 [00:01<00:00, 2658.98it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x1/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x1/s3cmtf_train.tensor
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x1





In [30]:
base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x2'
aug_file='aug_speaker+question_TF_40_model_8_round_1_x2_augmentation_ref_0.002.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9575900674610852
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 540284/540284 [03:31<00:00, 2559.53it/s]
  5%|▌         | 252/4724 [00:00<00:01, 2512.72it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x2/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 129668160


100%|██████████| 4724/4724 [00:01<00:00, 2522.11it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x2/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x2/s3cmtf_train.tensor
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x2





In [32]:
base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x5'
aug_file='aug_speaker+question_TF_40_model_8_round_1_x5_augmentation_ref_0.005.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9592585953188324
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 1097716/1097716 [12:43<00:00, 1436.98it/s]
  5%|▌         | 250/4724 [00:00<00:01, 2498.66it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x5/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 263451840


100%|██████████| 4724/4724 [00:02<00:00, 1639.91it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x5/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x5/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x5





In [33]:
base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x10'
aug_file='aug_speaker+question_TF_40_model_8_round_1_x10_augmentation_ref_0.01.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9607122996355475
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 2023620/2023620 [24:02<00:00, 1402.41it/s]


finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x10/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 485668800


100%|██████████| 4724/4724 [00:03<00:00, 1440.78it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x10/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x10/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_1/x10





In [34]:
base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/only_on_real_data'
aug_file='x1_augmentation_ref_0.001.txt.emb.pkl'


n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9550656707261022
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 356048/356048 [04:08<00:00, 1432.97it/s]
  3%|▎         | 152/4724 [00:00<00:03, 1516.29it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/only_on_real_data/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 85451520


100%|██████████| 4724/4724 [00:03<00:00, 1508.56it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/only_on_real_data/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/only_on_real_data/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/only_on_real_data





In [35]:
base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/ST(PT+FT)'
aug_file='x1_augmentation_ref_0.001.txt.emb.pkl'


n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9583137852138899
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 356048/356048 [04:08<00:00, 1432.82it/s]
  3%|▎         | 154/4724 [00:00<00:02, 1538.73it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/ST(PT+FT)/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 85451520


100%|██████████| 4724/4724 [00:03<00:00, 1367.83it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/ST(PT+FT)/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/ST(PT+FT)/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/speaker+question_TF_40_model_8_round_2/x1/ST(PT+FT)





In [37]:

def output_a_dir_only_real_on_train(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line):
    # output tensor
    tensor_file=os.path.join(base_dir,'s3cmtf_train.tensor')
    max_context_id,max_user_id,emb_dim,train_num_entries = output_tensor(pairs,tensor_file)

    tensor_file=os.path.join(base_dir,'s3cmtf_dev.tensor')
    _,_,_,dev_num_entries=output_tensor(pairs_3,tensor_file)

    # build the config file
    config_fing=os.path.join(base_dir,'s3cmtf_config.txt')
    with open(config_fing,'w') as fout:
        fout.write('3\n')
        fout.write('{} {} {}\n'.format(max_context_id,max_user_id,emb_dim))
        fout.write('{} {} {}\n'.format(dim1,dim2,dim3))
        fout.write('{}\n'.format(num_thread))
        fout.write('{}\n'.format(train_num_entries))
        fout.write('{}\n'.format(dev_num_entries))
        fout.write(user_feature_line)
        fout.write(context_feature_line)
    print('save config file at',config_fing)
    print('base dir: ',base_dir)


In [38]:
base_dir='../../S3CMTF_code/src_data_200K_May_12/PCA_240_no_CM'
aug_file='train_ref.txt.emb.pkl'


n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir_only_real_on_train(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9541338010699099
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 171812/171812 [01:56<00:00, 1468.91it/s]
  3%|▎         | 149/4724 [00:00<00:03, 1488.70it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/PCA_240_no_CM/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 41234880


100%|██████████| 4724/4724 [00:03<00:00, 1480.60it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/PCA_240_no_CM/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/PCA_240_no_CM/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/PCA_240_no_CM





In [39]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x5'

aug_file='x5_augmentation_ref_0.005.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line='1 ../../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9617193392481597
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 1097716/1097716 [08:26<00:00, 2166.86it/s]
  3%|▎         | 156/4724 [00:00<00:02, 1559.59it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x5/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 263451840


100%|██████████| 4724/4724 [00:03<00:00, 1548.07it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x5/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x5/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x5





In [40]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x10'

aug_file='x10_augmentation_ref_0.01.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=72
user_feature_line='1 ../../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0003_1_4 2 0.0003 1 4 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9635511676820506
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 2023620/2023620 [19:46<00:00, 1705.85it/s]
  0%|          | 0/4724 [00:00<?, ?it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x10/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 485668800


100%|██████████| 4724/4724 [00:01<00:00, 2552.76it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x10/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x10/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_speaker_TF_40_model_best_13_round_1/x10





In [41]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x2'

aug_file='x2_augmentation_ref_0.002.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line=''
context_feature_line=''
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 2 0.0005 1 0 100
# todo, can we do 30x30x30_0.0005_1_0 0 0.0005 1 0 100?
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 0 0.0005 1 0 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9590169202964947
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 544410/544410 [06:10<00:00, 1468.01it/s]
  3%|▎         | 154/4724 [00:00<00:02, 1536.25it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x2/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 130658400


100%|██████████| 4724/4724 [00:03<00:00, 1504.99it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x2/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x2/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x2





In [42]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x1'

aug_file='x1_augmentation_ref_0.001.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line=''
context_feature_line=''
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 2 0.0005 1 0 100
# todo, can we do 30x30x30_0.0005_1_0 0 0.0005 1 0 100?
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 0 0.0005 1 0 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9573517277869287
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 359290/359290 [04:02<00:00, 1479.72it/s]
  3%|▎         | 153/4724 [00:00<00:03, 1521.04it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x1/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 86229600


100%|██████████| 4724/4724 [00:03<00:00, 1487.84it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x1/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x1/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x1





In [43]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x5'

aug_file='x5_augmentation_ref_0.005.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line=''
context_feature_line=''
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 2 0.0005 1 0 100
# todo, can we do 30x30x30_0.0005_1_0 0 0.0005 1 0 100?
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 0 0.0005 1 0 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9612214169066727
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 1099777/1099777 [06:48<00:00, 2692.30it/s]
  6%|▌         | 272/4724 [00:00<00:01, 2714.70it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x5/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 263946480


100%|██████████| 4724/4724 [00:01<00:00, 2686.25it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x5/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x5/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_1/x5





In [44]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_2/x2'

aug_file='x2_augmentation_ref_0.002.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=36
user_feature_line=''
context_feature_line=''
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 2 0.0005 1 0 100
# todo, can we do 30x30x30_0.0005_1_0 0 0.0005 1 0 100?
# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_0 0 0.0005 1 0 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9590693224187832
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 544404/544404 [03:24<00:00, 2659.63it/s]
  6%|▌         | 270/4724 [00:00<00:01, 2697.71it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_2/x2/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 130656960


100%|██████████| 4724/4724 [00:01<00:00, 2672.09it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_2/x2/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_2/x2/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/30x30x30_0.0005_1_0_no_ppl_sampling_CM_round_2/x2





In [50]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_1/x5'

aug_file='x5_augmentation_ref_0.005.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=72
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)

# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_2_2 2 0.0005 2 2 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9621678397578588
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 1097716/1097716 [07:00<00:00, 2610.62it/s]
  6%|▌         | 260/4724 [00:00<00:01, 2596.31it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_1/x5/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 263451840


100%|██████████| 4724/4724 [00:01<00:00, 2588.47it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_1/x5/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_1/x5/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_1/x5





In [51]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_2/x0.5'

aug_file='x0.5_augmentation_ref_0.0005.txt.emb.pkl'

n_components=240
dim1,dim2,dim3=30,30,30
num_thread=72
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'

pairs,pairs_2,pairs_3=pca_pairs_three_file(pickle_file,pickle_file_2,pickle_file_3,n_components)


output_a_dir(pairs,pairs_2,pairs_3,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)

# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_2_2 2 0.0005 2 2 100

# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 1. read the pickle file
# 2. build the all_embeddings_nparray
# 3. PCA
var %: 0.9564158708445677
# 4. get the new emb
# 4. get the new emb
# 4. get the new emb


100%|██████████| 266292/266292 [03:21<00:00, 1324.14it/s]
  3%|▎         | 141/4724 [00:00<00:03, 1408.19it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_2/x0.5/s3cmtf_train.tensor
tensor shape, 39187 4724 240 num entries: 63910080


100%|██████████| 4724/4724 [00:03<00:00, 1388.80it/s]

finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_2/x0.5/s3cmtf_dev.tensor
tensor shape, 39157 4724 240 num entries: 1133760
save config file at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_2/x0.5/s3cmtf_config.txt
base dir:  ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_2/x0.5





# Non-PCA version

In [56]:
def read_two_file(pickle_file,pickle_file_2):
    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file, "rb") as f:
        pairs = pickle.load(f)


    # 1. read the pickle file
    print('# 1. read the pickle file')
    with open(pickle_file_2, "rb") as f:
        pairs_2 = pickle.load(f)

    
    return pairs,pairs_2
def output_tensor(pairs,tensor_file):
    # 5. output the tensor.
    num_entries=0
    max_context_id=0
    max_user_id=0
    with open(tensor_file,'w') as fout:
        for pair in tqdm(pairs):
            context_tensor_id = pair['cid']
            if context_tensor_id > max_context_id:
                max_context_id=context_tensor_id
            
            user_idx = pair['pid']
            if user_idx>max_user_id:
                max_user_id=user_idx
            emb=pair['res_emb']
            emb_dim=len(emb)
            for k in range(emb_dim):
                value = emb[k]
                fout.write('{}\t{}\t{}\t{}\n'.format(int(context_tensor_id),
                                                     int(user_idx),k+1,value))
                num_entries+=1    
    # 6. output the statistics of the tensor
    print('finished tensor at',tensor_file)
    print('tensor shape,',max_context_id,max_user_id,emb_dim,'num entries:',num_entries)
    

In [57]:
def output_a_dir(pairs,pairs_2,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line):
    # output tensor
    tensor_file=os.path.join(base_dir,'s3cmtf_train.tensor')
    max_context_id,max_user_id,emb_dim,train_num_entries = output_tensor(pairs+pairs_2,tensor_file)


    # build the config file
    config_fing=os.path.join(base_dir,'s3cmtf_config.txt')
    with open(config_fing,'w') as fout:
        fout.write('3\n')
        fout.write('{} {} {}\n'.format(max_context_id,max_user_id,emb_dim))
        fout.write('{} {} {}\n'.format(dim1,dim2,dim3))
        fout.write('{}\n'.format(num_thread))
        fout.write('{}\n'.format(train_num_entries))
        fout.write('{}\n'.format(3628032))
        fout.write(user_feature_line)
        fout.write(context_feature_line)
    print('save config file at',config_fing)
    print('base dir: ',base_dir)


In [58]:

base_dir='../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_1/x2_no_pca'

aug_file='x2_augmentation_ref_0.002.txt.emb.pkl'

dim1,dim2,dim3=30,30,30
num_thread=72
user_feature_line='1 ../../../../src_data_200K_May_12/s3cmtf_user_feature_onehot.mat 9041 42709684\n'
context_feature_line='2 ../../../../src_data_200K_May_12/s3cmtf_context_emb.mat 768 30095616\n'
pickle_file='../../S3CMTF_code/src_data_200K_May_12/train_ref.txt.emb.pkl'
pickle_file_2=os.path.join(base_dir,aug_file)
pickle_file_3='../../S3CMTF_code/src_data_200K_May_12/val_ref.txt.emb.pkl'


pairs,pairs_2=read_two_file(pickle_file,pickle_file_2)

output_a_dir(pairs,pairs_2,base_dir,dim1,dim2,dim3,num_thread,user_feature_line,context_feature_line)

# ./S3CMTF-opt s3cmtf_config.txt  s3cmtf_train.tensor s3cmtf_dev.tensor 30x30x30_0.0005_1_4 2 0.0005 1 4 100

# 1. read the pickle file
# 1. read the pickle file


100%|██████████| 540284/540284 [16:12<00:00, 555.56it/s]


finished tensor at ../../S3CMTF_code/src_data_200K_May_12/emb-pkl-files/aug_pca_30_factor+CMTF_40+rand_30_model_best_13_round_1/x2_no_pca/s3cmtf_train.tensor
tensor shape, 39187 4724 768 num entries: 414938112


TypeError: cannot unpack non-iterable NoneType object

In [None]:
500000000