In [1]:
import json
import numpy as np
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
from joblib import Parallel, delayed
import multiprocessing

def groupApplyParallel(dfGrouped, func_dict):
    """Add derived columns to every group in parallel and stitch the groups back together.

    Example:
        groupApplyParallel(df.groupby('abc'), {'add': lambda g: g['b'] + g['c']})

    Args:
        dfGrouped: an iterable of ``(group_name, group_frame)`` pairs, e.g. a
            pandas ``DataFrameGroupBy``.
        func_dict: mapping ``new_column_name -> callable``. Each callable receives
            the group frame (including columns added by earlier entries) and
            returns the values for the new column.

    Returns:
        A DataFrame made of all processed groups concatenated in iteration order;
        an empty DataFrame when there are no groups.
    """
    def multi_agg(group, func_dict):
        # Work on a copy so the caller's frame (or a view of it) is never mutated.
        group = group.copy()
        for name, func in func_dict.items():
            group[name] = func(group)
        # The processed group must be returned, otherwise there is nothing to concat.
        return group

    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(
        delayed(multi_agg)(group, func_dict) for _, group in dfGrouped
    )
    if not retLst:
        # pd.concat raises on an empty list; an empty frame is the natural result.
        return pd.DataFrame()
    return pd.concat(retLst)

def quant(n):
    """Build a named quantile aggregator.

    Returns a one-argument function that computes ``np.quantile(x, n)``. Its
    ``__name__`` is set to ``'quantile_<n>'`` so that results produced by
    different quantile levels can be told apart by function name.
    """
    label = 'quantile_%s' % n

    def quantile_(x):
        return np.quantile(x, n)

    quantile_.__name__ = label
    return quantile_

def groupAggParallel(dfGrouped, func_dict):
    """Aggregate every group in parallel and return one row per group.

    Example:
        groupAggParallel(df.groupby('abc'), {'mean': np.mean, 'count': len})

    Args:
        dfGrouped: a pandas GroupBy object; it is iterated as
            ``(group_name, group_frame)`` pairs.
        func_dict: mapping ``output_column_name -> callable``; each callable is
            applied to the whole group and its result becomes one cell.

    Returns:
        A DataFrame whose leading columns are the group key(s) and whose
        remaining columns follow the key order of ``func_dict``.
    """
    def multi_agg(group, group_name, func_list):
        # Multi-key groupbys yield tuple names; single-key ones yield a scalar.
        res = list(group_name) if type(group_name) in (list, tuple) else [group_name]
        for func in func_list:
            res.append(func(group))
        return res

    # Materialise the callables once so every task receives a plain list
    # instead of a dict view.
    func_list = list(func_dict.values())
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(
        delayed(multi_agg)(group, group_name, func_list) for group_name, group in dfGrouped
    )

    # The public `.grouper` attribute is deprecated in recent pandas releases;
    # prefer the private replacement when present and fall back for older versions.
    grouper = getattr(dfGrouped, '_grouper', None)
    if grouper is None:
        grouper = dfGrouped.grouper
    key_names = list(grouper.result_index.names)

    return pd.DataFrame(retLst, columns=key_names + list(func_dict.keys()))

from tqdm import tqdm as tqdm_notebook
from tqdm import tqdm
# Raise the pandas display limits so wide/long frames are shown in full
# (up to 1000 columns and 1000 rows) instead of being truncated.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
import gc
import os

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression,LogisticRegression

import seaborn as sns
import matplotlib.pyplot as plt

from chinese_calendar import is_workday
import datetime
from functools import partial
from collections import defaultdict
import random
import math

# Seed Python's built-in `random` module for repeatable runs.
# NOTE(review): only `random` is seeded here; numpy's RNG is not seeded in the
# visible code — confirm whether that is intended.
RANDOM_SEED = 1
random.seed(RANDOM_SEED)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [None]:
# Load the data
# PREFIX is the project root that the input paths below are built from.
PREFIX = './'
#df_train1 = pd.read_csv(PREFIX+'/input/AAAI2023Competition/train.csv')
#df_test1 = pd.read_csv(PREFIX+'/input/AAAI2023Competition/test.csv')
df_train1 = pd.read_feather(PREFIX+'/input/df_train.feather')
df_test1 = pd.read_feather(PREFIX+'/input/df_test.feather')
# Stack train and test rows (train first) so sequences are built from both.
df1 = pd.concat([df_train1, df_test1], axis=0)
# Question ids must be strings: they are joined with ',' below and later used as Word2Vec tokens.
df1['question'] = df1['question'].apply(lambda x: str(x))
#df1 = df1[df1['is_repeat']==0]
# One comma-separated string of question ids per uid.
df1_group = df1[['uid', 'question']].groupby('uid')['question'].apply(','.join)


try:
    import gensim
except:
    !pip install gensim
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from time import time
import multiprocessing

class MonitorCallback(CallbackAny2Vec):
    """Training callback that tracks the epoch counter and the latest reported loss.

    Attributes:
        epoch: starts at 1 and is incremented at the end of every epoch.
        loss: the value last returned by ``model.get_latest_training_loss()``
            (0 before the first epoch finishes).
    """

    def __init__(self):
        self.epoch = 1
        self.loss = 0

    def on_epoch_end(self, model):
        """Advance the epoch counter and remember the loss reported by ``model``."""
        # Optional debugging output (disabled):
        #   print("Epoch No:" + str(self.epoch))
        #   print("Model loss: ", model.get_latest_training_loss() - self.loss)
        self.epoch += 1
        self.loss = model.get_latest_training_loss()
        
def train(seqs, iters=50, size=100, window=5, negative=5, min_count=1, ns_exponent=0.75, sg=1, hs=0):
    """Train a gensim Word2Vec model on token sequences and report the elapsed time.

    Args:
        seqs: iterable of token lists (one list per sequence).
        iters: number of training epochs (passed as ``epochs``).
        size: embedding dimensionality (passed as ``vector_size``).
        window, negative, min_count, ns_exponent, sg, hs: forwarded unchanged
            to ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    t = time()
    # Callback that records the epoch count and latest training loss.
    monitor = MonitorCallback()
    # Leave one core free for the notebook, but never ask for fewer than one
    # worker: `cores - 1` would be 0 on a single-core machine.
    n_workers = max(1, cores - 1)
    model = Word2Vec(seqs, epochs=iters,
                     vector_size=size, window=window, min_count=min_count,
                     sg=sg, hs=hs, negative=negative, ns_exponent=ns_exponent,
                     workers=n_workers, compute_loss=True, callbacks=[monitor])
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
    return model


def evaluate(wv, df):
    """Score embeddings by how often a target id is among the 10 nearest neighbours.

    Args:
        wv: keyed vectors supporting ``wv[key]`` and
            ``wv.most_similar(positive=..., topn=...)``.
        df: frame with a ``positives`` column (iterable of keys per row) and a
            ``target_id`` column.

    Returns:
        A Series with two entries: the mean hit indicator (1 when the target is
        in the top 10, else 0) and the mean of ``1 / log2(rank + 2)`` (0 on a miss).
    """
    # gensim 4 replaced KeyedVectors.vocab with key_to_index; keep a fallback
    # for older objects that only expose `vocab`.
    vocab = wv.key_to_index if hasattr(wv, 'key_to_index') else wv.vocab
    # Unit-normalise every vector once, up front.
    wv_norm = {key: wv[key] / np.linalg.norm(wv[key]) for key in vocab}

    def calc_ndgc(row):
        positives = [wv_norm[positive] for positive in row['positives']]
        target_id = row['target_id']

        ranks = [key for (key, sim) in wv.most_similar(
            positive=positives, topn=10)]
        if target_id in ranks:
            hr = 1
            r = ranks.index(target_id)
            # rank 0 -> 1/log2(2) = 1.0, decaying for lower positions
            ndgc = 1.0 / math.log(r+2, 2)
        else:
            hr = 0
            ndgc = 0
        return pd.Series([hr, ndgc])

    return df.apply(calc_ndgc, axis=1).mean()

# Number of CPUs reported by the OS; `train` derives its Word2Vec worker count from this.
cores = multiprocessing.cpu_count()


def get_emb_df_wv(wv, group=False):
    """Turn keyed vectors into a per-question embedding table.

    Args:
        wv: keyed vectors exposing ``index_to_key`` and ``get_vector``; every
            key must be convertible with ``int()``.
        group: when True, merge the embeddings into the LSTM embedding table read
            from ``PREFIX + 'input/question_lstm_emb.csv'`` and add per-concept
            mean embeddings (``*_group`` columns).

    Returns:
        With ``group=False``: a frame with ``question`` (int), ``embedding``
        (comma-joined string) and one float column ``question_emb<i>`` per
        dimension. With ``group=True``: the merged LSTM table.
    """
    rows = [[int(word), ','.join(map(str, wv.get_vector(word)))] for word in wv.index_to_key]
    # Passing the column names here keeps the schema even for an empty vocabulary.
    w2v_emb = pd.DataFrame(rows, columns=['question', 'embedding'])
    if rows:
        # Split each embedding string once rather than once per dimension.
        # Values are still parsed from their string form, as before.
        parts = w2v_emb['embedding'].str.split(',', expand=True).astype(float)
        for i in parts.columns:
            w2v_emb['question_emb'+str(i)] = parts[i]
    if group:
        # Imported locally: `literal_eval` is not imported at the top of the notebook.
        from ast import literal_eval
        lstm_emb = pd.read_csv(PREFIX+'input/question_lstm_emb.csv')
        # Drop the 8 embedding columns shipped with the LSTM table so they do
        # not clash with the Word2Vec ones merged in below.
        lstm_emb = lstm_emb.drop(['question_emb'+str(i) for i in range(8)], axis=1)
        # NOTE(review): only the first 5 Word2Vec dimensions are used here, which
        # matches a model trained with size=5 — confirm for other sizes.
        lstm_emb = lstm_emb.merge(w2v_emb[['question']+['question_emb'+str(i) for i in range(5)]], how='left', on='question')
        # Concept id = last '----'-separated component of the first concept route.
        lstm_emb['concept'] = lstm_emb['concept_routes'].apply(lambda x:int(literal_eval(x)[0].split('----')[-1]))
        concept_emb = lstm_emb[['concept']+['question_emb'+str(i) for i in range(5)]].groupby('concept').agg('mean')
        lstm_emb = lstm_emb.merge(concept_emb, on='concept', how='left', suffixes=(None,'_group'))
        print("Shape of lstm_emb:", lstm_emb.shape)
        return lstm_emb
    return w2v_emb

# def test_emb(df_tr, df_val, emb, length=5, suffix='_group'):
#     df_tr = df_tr.merge(emb[['question']+['question_emb'+str(i)+suffix for i in range(length)]], how='left', on='question')
#     df_val = df_val.merge(emb[['question']+['question_emb'+str(i)+suffix for i in range(length)]], how='left', on='question')
#     gc.collect()
#     res = get_result(lgb_params, df_tr, df_val, cols+['question_emb'+str(i)+suffix for i in range(length)], 'real_response', [metric,metric2], verbose=0, weight_func=weight_func)
#     # clean up the data (drop the temporary embedding columns)
#     df_tr = df_tr.drop(['question_emb'+str(i)+suffix for i in range(length)], axis=1)
#     df_val = df_val.drop(['question_emb'+str(i)+suffix for i in range(length)], axis=1)
#     assert [col for col in df_tr.columns if 'question_emb' in col] == []
#     return res 


# Word2Vec hyper-parameters forwarded to `train`: 10 epochs, 5-dimensional
# vectors, a context window of 256 tokens, 5 negative samples, sg=1.
config = {'iters':10, 'size':5, 'window':256, 'negative':5, 'sg':1}
# Each uid's comma-joined question string becomes one training sequence.
model = train([x.split(',') for x in df1_group], **config)
wv = model.wv
# Flatten the learned vectors into a per-question table.
w2v_emb = get_emb_df_wv(wv)
#score = test_emb(df_tr, df_val, w2v_emb, length=config['size'], suffix='')


In [9]:
# Persist the embedding table for later use.
# NOTE(review): to_csv writes the frame index as an unnamed first column by default
# (it shows up as 'Unnamed: 0' in the saved output); pass index=False if readers
# do not rely on it. The path is relative to the working directory, not PREFIX.
w2v_emb.to_csv('input/question_w2v_emb_all.csv')

Unnamed: 0,question,embedding,question_emb0,question_emb1,question_emb2,question_emb3,question_emb4
0,182,"0.21612895,0.124061614,0.94540304,-0.35554838,...",0.216129,0.124062,0.945403,-0.355548,-0.010761
1,313,"-0.17996608,0.7829615,1.2150885,-0.38109714,-0...",-0.179966,0.782961,1.215089,-0.381097,-0.058842
2,181,"0.30566648,0.21535926,0.8359815,-0.44348496,-0...",0.305666,0.215359,0.835982,-0.443485,-0.174992
3,372,"-1.3390198,0.8790246,1.7617677,-0.38630235,-0....",-1.339020,0.879025,1.761768,-0.386302,-0.240272
4,409,"-3.849838,2.1938562,4.2878475,-1.0281198,-1.49...",-3.849838,2.193856,4.287847,-1.028120,-1.490355
...,...,...,...,...,...,...,...
7647,5964,"-1.2107998,-1.8678441,1.410139,-0.8014248,0.35...",-1.210800,-1.867844,1.410139,-0.801425,0.358806
7648,5965,"-1.2183412,-1.8519108,1.2442461,-0.80626017,0....",-1.218341,-1.851911,1.244246,-0.806260,0.358229
7649,5967,"-1.1887653,-1.8129925,1.3449147,-0.8082158,0.3...",-1.188765,-1.812993,1.344915,-0.808216,0.387356
7650,7206,"-0.88806415,2.5467634,-0.214021,-0.7542569,1.3...",-0.888064,2.546763,-0.214021,-0.754257,1.324391
