In [1]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()
    # answer_df가 val입니다.


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))  # 두 set에 공통으로 있는것의 길이이다.
            # true_dict는 정답이고, pred은 예측값이다.  근데 정답은 1개만 주어졌으니까 correct_matches는 최대 1이다.
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            # min(len(val), 5)는 입력인자 2개중 더 작은값을 내보냄
            individual_recalls.append(individual_recall)
            # 그래서 정답을 맞추면 0.2라는 값이 저장된다.

# 그거의 평균값
    recall = np.mean(individual_recalls)
    return recall

In [2]:
min(10, 5)

5

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# csv_path= '/content/drive/MyDrive/dacon/csv'
# result_path='/content/drive/MyDrive/dacon/result'

In [6]:
# apply_train_df = pd.read_csv(csv_path +'/apply_train.csv')
# resume_df = pd.read_csv(csv_path +'/resume.csv')
# resume_certificate_df = pd.read_csv(csv_path +'/resume_certificate.csv')
# resume_education_df = pd.read_csv(csv_path +'/resume_education.csv')
# resume_language_df = pd.read_csv(csv_path +'/resume_language.csv')
# recruitment_df = pd.read_csv(csv_path +'/recruitment.csv')
# company_df = pd.read_csv(csv_path +'/company.csv')
# sample_submission_df = pd.read_csv(csv_path +'/sample_submission.csv')

In [7]:
apply_train_df = pd.read_csv('apply_train.csv')
resume_df = pd.read_csv('resume.csv')
resume_certificate_df = pd.read_csv('resume_certificate.csv')
resume_education_df = pd.read_csv('resume_education.csv')
resume_language_df = pd.read_csv('resume_language.csv')
recruitment_df = pd.read_csv('recruitment.csv')
company_df = pd.read_csv('company.csv')
sample_submission_df = pd.read_csv('sample_submission.csv')

In [8]:
apply_train = apply_train_df

In [9]:
#학습, 검증 분리
train, val = [], []
apply_train_groupby = apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(apply_train_groupby.index.tolist(), apply_train_groupby.values.tolist()):
    for iid in iids[:-1]:
        train.append([uid,iid])
    val.append([uid, iids[-1]])

In [10]:
len(train)

49464

In [11]:
# 위에 셀을 실행하면 train은 [지원자, 지원기업]이 반복되는 리스트이다.
# val은 8482개, train은 49464의 길이를 갖는다. 두개의 총합은 57946(apply_train의 길이)

In [12]:
train = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])  #list를 dataframe으로 변경해주는 코드이다.
val = pd.DataFrame(val, columns=['resume_seq', 'recruitment_seq'])
pred = apply_train.copy()       #apply_train을 복사함,  pred의 shpae은 (57946,2)이다.

# 통합한 csv 불러오는 위치

In [14]:
res_all = pd.read_csv('resume_all.csv')
res_all_backup = res_all.copy()
rec_all = pd.read_csv('recruit_all_ch.csv')
rec_all_backup = rec_all.copy()

In [15]:
train= pd.merge(train, res_all, on='resume_seq', how='left')

In [17]:
train= pd.merge(train, rec_all, on='recruitment_seq', how='left')

In [18]:
train_user_item_matrix = train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
pred_user_item_matrix = pred.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

In [19]:
# train test 나누면서 누락된 컬럼이 있어 맞춰주는 작업 (6691) -> (6695)
apply_train2 = apply_train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
total_columns = apply_train2.columns.tolist()

for col in total_columns:
    if col not in train_user_item_matrix.columns:
        train_user_item_matrix[col] = 0

In [20]:
# merge하긴 했는데 아직 사용하진 않음, 각각연산하여 더할예정
m_train_user_item_matrix=pd.merge(train_user_item_matrix,res_all, on='resume_seq', how='left')
m_train_user_item_matrix.set_index('resume_seq', inplace=True)

In [21]:
m_train_user_item_matrix_T=pd.merge(train_user_item_matrix.T,rec_all, on='recruitment_seq', how='left')
m_train_user_item_matrix_T.set_index('recruitment_seq', inplace=True)

In [22]:
m_train_user_item_matrix

Unnamed: 0_level_0,R00001,R00002,R00003,R00004,R00005,R00006,R00007,R00008,R00009,R00010,...,univ_type_seq2,univ_transfer,univ_location,univ_major_type,univ_score,language,score,job,certificate_cnt,career
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,0,0,0,0,0,0,0,0,0,0,...,5,0,17,9,4,5,3,5,0,6
U00002,0,0,0,0,0,0,0,0,0,0,...,5,0,3,8,1,5,3,5,1,4
U00003,0,0,0,0,0,0,0,0,0,0,...,5,0,17,4,1,5,3,5,3,5
U00004,0,0,0,1,0,0,0,0,0,0,...,5,0,5,3,3,0,1,0,3,6
U00005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,10,4,0,0,4,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,0,0,0,0,0,0,0,0,0,0,...,5,0,5,19,4,5,3,0,0,2
U08479,0,0,0,0,0,0,0,0,0,0,...,6,0,3,9,4,5,3,5,1,2
U08480,0,0,0,0,0,0,0,0,0,0,...,5,0,10,9,2,5,3,3,4,6
U08481,0,0,0,0,0,0,0,0,0,0,...,5,0,5,9,4,5,3,0,1,6


In [23]:
m_train_user_item_matrix.shape ,m_train_user_item_matrix_T.shape #6715과 8497

((8482, 6719), (6695, 8497))

In [36]:
train_user_similarity = cosine_similarity(train_user_item_matrix)       #shape (8482,8482) type np.ndarray
train_item_similarity = cosine_similarity(train_user_item_matrix.T)     #shape (6691,6691)

# pred_user_similarity = cosine_similarity(pred_user_item_matrix)
# pred_item_similarity = cosine_similarity(pred_user_item_matrix.T)

# 이위에는 볼필요없습니다.

In [37]:
# res_all의 중요 feature라고 생각되는것(분석 안했고 그냥 뇌피셜)
# degree graduate_date hope_salary last_salary job_code_seq1 job career certificate_cnt

# rec_all의 중요 feature라고 생각되는것(분석 안했고 그냥 뇌피셜)
#address_seq1 recruit_code education qualifications text_keyword company_size major_task
res_all= res_all_backup.copy()
rec_all= rec_all_backup.copy()
res_all.set_index('resume_seq', inplace=True)
rec_all.set_index('recruitment_seq', inplace=True)

# 저희는 res_all 안써도 됩니다.
# res_all2= res_all[['degree', 'graduate_date', 'hope_salary', 'last_salary', 'job' ,'career' ]].copy()
# res_all.drop(columns=['degree', 'graduate_date', 'hope_salary', 'last_salary', 'job' ,'career' ], inplace= True)
nice_rec_cols=['address_seq1' ,'recruit_code' ,'education', 'qualifications' ,'major_task']
rec_all2= rec_all[nice_rec_cols].copy()
# rec_all3= rec_all[['recruit_code', 'employee','company_size','supply_kind']].copy()
rec_all.drop(columns=nice_rec_cols, inplace= True)

In [38]:
train_user_similarity1 = cosine_similarity(train_user_item_matrix)      #shape (8482,8482) type np.ndarray\
train_user_similarity2= cosine_similarity(res_all)  

train_item_similarity1 = cosine_similarity(train_user_item_matrix.T)     #shape (6691,6691)
train_item_similarity2 = cosine_similarity(rec_all)
train_item_similarity3 = cosine_similarity(rec_all2)
# train_item_similarity4 = cosine_similarity(rec_all3)

In [39]:
train_item_similarity2 = cosine_similarity(rec_all)

In [40]:
rec_all2

Unnamed: 0_level_0,address_seq1,recruit_code,education,qualifications,major_task
recruitment_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
R00001,5.0,0,2,1,2
R00002,3.0,3,3,1,2
R00003,3.0,0,3,2,2
R00004,3.0,3,3,1,2
R00005,3.0,3,3,1,2
...,...,...,...,...,...
R06691,3.0,2,3,1,2
R06692,3.0,4,3,2,2
R06693,5.0,2,4,1,2
R06694,3.0,0,4,1,2


In [41]:
rec_all

Unnamed: 0_level_0,address_seq2,address_seq3,career_end,career_start,text_keyword,check_box_keyword_cnt,company_type_seq,supply_kind,employee,company_size
recruitment_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
R00001,0.0,0.0,0,0,1,7,5.0,201.0,631.0,2
R00002,0.0,0.0,0,0,1,3,2.0,201.0,160.0,3
R00003,0.0,0.0,0,0,1,4,0.0,0.0,0.0,4
R00004,0.0,0.0,0,0,1,2,2.0,402.0,500.0,2
R00005,0.0,0.0,0,0,1,2,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...
R06691,0.0,0.0,0,0,1,3,0.0,0.0,0.0,4
R06692,0.0,0.0,0,0,1,2,4.0,402.0,150.0,3
R06693,0.0,0.0,0,0,1,2,0.0,0.0,0.0,4
R06694,0.0,0.0,0,0,1,7,0.0,0.0,0.0,4


In [42]:
# 두 유사도의 값을 비율을 맞춰주는과정
e=0
b= train_user_similarity1.mean()/ train_user_similarity2.mean()  
c = train_item_similarity1.mean() / train_item_similarity2.mean()
d = train_item_similarity1.mean() / train_item_similarity3.mean()
# e = train_item_similarity1.mean() / train_item_similarity4.mean()

In [43]:
pd.DataFrame(train_item_similarity3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6685,6686,6687,6688,6689,6690,6691,6692,6693,6694
0,1.000000,0.788241,0.908108,0.788241,0.788241,0.865768,0.714006,0.929758,0.661917,0.929758,...,0.858128,0.788241,0.788241,0.705882,0.877527,0.858128,0.714496,0.921635,0.876714,0.788241
1,0.788241,1.000000,0.832050,1.000000,1.000000,0.830868,0.990742,0.847791,0.777050,0.847791,...,0.986600,1.000000,1.000000,0.970143,0.979912,0.986600,0.981981,0.950000,0.839146,1.000000
2,0.908108,0.832050,1.000000,0.832050,0.832050,0.990044,0.753689,0.981433,0.693853,0.981433,...,0.905822,0.832050,0.832050,0.706306,0.878054,0.905822,0.786796,0.915255,0.966755,0.832050
3,0.788241,1.000000,0.832050,1.000000,1.000000,0.830868,0.990742,0.847791,0.777050,0.847791,...,0.986600,1.000000,1.000000,0.970143,0.979912,0.986600,0.981981,0.950000,0.839146,1.000000
4,0.788241,1.000000,0.832050,1.000000,1.000000,0.830868,0.990742,0.847791,0.777050,0.847791,...,0.986600,1.000000,1.000000,0.970143,0.979912,0.986600,0.981981,0.950000,0.839146,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.858128,0.986600,0.905822,0.986600,0.986600,0.904534,0.955317,0.922958,0.784047,0.922958,...,1.000000,0.986600,0.986600,0.924138,0.984732,1.000000,0.950262,0.979796,0.913547,0.986600
6691,0.714496,0.981981,0.786796,0.981981,0.981981,0.778962,0.988332,0.772187,0.744438,0.772187,...,0.950262,0.981981,0.981981,0.979124,0.954030,0.950262,1.000000,0.894693,0.760639,0.981981
6692,0.921635,0.950000,0.915255,0.950000,0.950000,0.910877,0.905822,0.943629,0.697450,0.943629,...,0.979796,0.950000,0.950000,0.873128,0.964836,0.979796,0.894693,1.000000,0.929516,0.950000
6693,0.876714,0.839146,0.966755,0.839146,0.839146,0.985245,0.760117,0.989803,0.665516,0.989803,...,0.913547,0.839146,0.839146,0.688847,0.856349,0.913547,0.760639,0.929516,1.000000,0.839146


In [44]:
rec_all2.head(30)

Unnamed: 0_level_0,address_seq1,recruit_code,education,qualifications,major_task
recruitment_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
R00001,5.0,0,2,1,2
R00002,3.0,3,3,1,2
R00003,3.0,0,3,2,2
R00004,3.0,3,3,1,2
R00005,3.0,3,3,1,2
R00006,3.0,0,4,2,2
R00007,3.0,4,3,1,2
R00008,3.0,0,3,1,2
R00009,3.0,3,2,1,8
R00010,3.0,0,3,1,2


In [45]:
# recruit_all의 feature를 사용하여 유사도 측정
# 유사도를 측정하는 의미가 없음
# b=pd.DataFrame(train_user_similarity2)
# b.sort_values(by=0,ascending=False).head(60)

In [46]:
train_item_similarity1.mean()

0.0020824550320791275

# 연산구간 <br>
아래를 새로 실행때마다 위에 셀들은 실행시킬 필요 없습니다<br>
1번 도는데 50초 걸림

In [47]:
pd.DataFrame(train_item_similarity)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6685,6686,6687,6688,6689,6690,6691,6692,6693,6694
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.100504,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# 현재 alpha값은 0으로 뒀고 추천 공고를 뽑을때 train_user_predicted_scores에 alpha를 곱했습니다.
# 이것은 train_user_predicted_scores을 사용하지 않겠다는 것이고 train_item_predicted_scores만을 사용한다는 의미입니다.
# 우리의 목적은 cosine_similarity(rec_all)을 잘 설계하기 위함이니까 이것으로부터 생성된 train_item_predicted_scores만을 고려
# alpha beta gamma 등의 값은 모두 기본값으로 두시고 진행하시면 됩니다.
# 값을 바꿔준다면 그건 파라미터 튜닝작업을 하시는건데 해보셔도 괜찮습니다.
# 현재 alpha의 len를 1로 두었기때문에 for문은 한번 돕니다.
alpha = [0] 
beta=0
gamma =0
delta =0
epsil =0
# beta = [b, b, b*0.15, b*0.1, b*0.05, b*0.08]
# gamma= [c, c, c*0.15, c*0.1, c*0.05, c*0.08]

# delta= [d, d, d*0.5,d*0.1, d*0.05]
# epsil= [e, e*0.5,e* 0.1, e*0.05, e*0.01]
recall_score=[]
best_score = 0

for i in range(len(alpha)):
    train_recommendations = []

    #train_user_similarity = (train_user_similarity1 + train_user_similarity2*beta[i]) 
    train_item_similarity = (train_item_similarity2)
    #                         train_item_similarity3*delta[i] + train_item_similarity4* epsil[i] ) /4
    #train_item_similarity = (train_item_similarity1 + train_item_similarity2*gamma[i])

    #train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
    train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)


    for idx, user in tqdm(enumerate(train_user_item_matrix.index)):     # idx는 index값을 갖는데 0~8481이다. user는 유저의 이름
        applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)   # 지원한 기업을 뽑아낸다.

        # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
        sorted_job_indices = (train_item_predicted_scores.loc[user].values).argsort()[::-1]
        #sorted_job_indices = (train_item_predicted_scores.loc[user].values + train_user_predicted_scores[idx] * alpha[i]).argsort()[::-1]
        recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

        for job in recommended_jobs:
            train_recommendations.append([user, job])
    
    val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])
    score=round(recall5(val,val_prediction),6)
    recall_score.append(score)
    if recall_score[i] > best_score:
        best_score= recall_score[i]
        best_parameter = [alpha[i], beta[i], gamma[i]]
        best_index= i
    print(f'점수{score}, alpha,beta,gamma,delta,epsilon 값: {alpha[i]}, {round(beta[i],5)}, {round(gamma[i],5)},{round(delta[i],5)},{round(epsil[i],5)}')

0it [00:00, ?it/s]

NameError: name 'beta' is not defined

In [None]:
''' 결과 기록하는곳


'''

# 밑에는 제출용이라 실행 안시켜도 됩니다.

In [None]:
# 멈추기
a

In [None]:
pred_user_similarity = cosine_similarity(pred_user_item_matrix)
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T)

pred_user_similarity = (pred_user_similarity + train_user_similarity2*best_parameter[1]) /2
pred_item_similarity = (pred_item_similarity + train_item_similarity2*best_parameter[2]) /2

In [None]:
pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [None]:
alpha = 0.98
pred_recommendations = []
for idx, user in tqdm(enumerate(pred_user_item_matrix.index)):
    applied_jobs = set(pred_user_item_matrix.loc[user][pred_user_item_matrix.loc[user] == 1].index)

    sorted_job_indices = (pred_item_predicted_scores.loc[user].values * best_parameter[0] + pred_user_predicted_scores[idx]).argsort()[::-1]
    recommended_jobs = [job for job in pred_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

    for job in recommended_jobs:
        pred_recommendations.append([user, job])

In [None]:
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
top_recommendations.to_csv('./baseline_add_item_0.98.csv', index=False)

In [None]:
best_parameter