In [1]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import csv
import itertools
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.doc2vec import Doc2Vec

# Functions

In [2]:
def correctly_read_csv(fname):
    prep_df = pd.read_csv(fname, converters={"tokens_rep": literal_eval, "tokens": literal_eval, "reference": literal_eval})
    return prep_df


def add_extra_keywords(df, key_list, n_repeat):
    """
    Takes df and keyword list, adds n_repeat occurances of keyword to new column when found in doc
    """
    df['tokens_rep_extra'] = df['tokens_rep']
    for ind in df.index:
        match_list = list()
        for keyword in key_list:
            # Making everything into a bock of text to find broken up keywords
            if keyword in "".join(df['tokens_rep'][ind]):
                match_list.append(keyword)
        if len(match_list) > 0:
            i = 0
            while i < n_repeat:
                df['tokens_rep_extra'][ind] = df['tokens_rep_extra'][ind] + match_list
                i += 1
    return df['tokens_rep_extra']

def add_extra_keywords_temp(df, key_list, n_repeat):
    """
    Takes df and keyword list, adds n_repeat occurances of keyword to new column when found in doc
    """
    df['tokens_rep_extra'] = df['tokens_rep']
    for ind in df.index:
        match_list = list()
        tokens_to_raw = "".join(df['tokens_rep'][ind])
        for keyword in key_list:
            n_instances = tokens_to_raw.count(keyword)
            i = 0
            while i < n_instances:
                match_list.append(keyword)
                i += 1
        if len(match_list) > 0:
            i = 0
            while i < n_repeat:
                df['tokens_rep_extra'][ind] = df['tokens_rep_extra'][ind] + match_list
                i += 1
    return df['tokens_rep_extra']

def only_keywords(df, key_list):
    """
    Will keep only keywords
    Right now is returning some blanks, doesn't work on all data sets
    """
    df['tokens_rep_only'] = ""
    for ind in df.index:
        match_list = list()
        for keyword in key_list:
            # Making everything into a bock of text to find broken up keywords
            if keyword in "".join(df['tokens_rep'][ind]):
                count = 0
                while count < "".join(df['tokens_rep'][ind]).count(keyword):
                    match_list.append(keyword)
                    count += 1
        if len(match_list) > 0:
            df['tokens_rep_only'][ind] = match_list
        else:
            print(df['doc_index'][ind])
            df['tokens_rep_only'][ind] = ['this is bad']
    return df['tokens_rep_only']

def fake_tokenizer(tokens):
    return tokens

def get_all_rep_token_strings(token_list):
    all_rep_token_strings = []
    for d in token_list:
        all_rep_token_strings.append(''.join(d))
    return all_rep_token_strings

def get_key_doc_dict(all_rep_token_strings, doc_ind_list):
    # key word, list of document ids that contain it
    key_doc_dict = {}
    for k in all_key_list:
        k = k.strip()
        if k:
            k_list = []
            for i in range(len(all_rep_token_strings)):
                if k in all_rep_token_strings[i]:
                    k_list.append(doc_ind_list[i])
            if k_list:
                key_doc_dict[k] = k_list
    return key_doc_dict

def get_doc_to_keyword_lst_dict(key_doc_dict):
    # dict with doc_id as key and list of keywords in it
    doc_to_keyword_lst_dict = {}
    for k,lst in key_doc_dict.items():
        for doc_id in lst:
            if doc_id in doc_to_keyword_lst_dict:
                doc_to_keyword_lst_dict[doc_id].append(k)
            else:
                doc_to_keyword_lst_dict[doc_id] = [k]
    return doc_to_keyword_lst_dict

def get_doc_to_docs_with_overlap_dict(doc_to_keyword_lst_dict):
    # get list of lists of overlapping keys per document
    doc_to_docs_with_overlap_dict = {}
    for doc_id, keywords_lst in doc_to_keyword_lst_dict.items():
        keyword_set = set(keywords_lst)
        for doc_id_next, keywords_lst_next in doc_to_keyword_lst_dict.items():
            if doc_id_next != doc_id:
                keyword_set_next = set(keywords_lst_next)
                intersection = keyword_set.intersection(keyword_set_next)
                if intersection:
                    if doc_id in doc_to_docs_with_overlap_dict:
                        doc_to_docs_with_overlap_dict[doc_id][doc_id_next] = intersection
                    else:
                        doc_to_docs_with_overlap_dict[doc_id] = {doc_id_next: intersection}
    return doc_to_docs_with_overlap_dict

def rank_most_similar_documents(input_doc_embedding, all_docs_embeddings):
    """
    Get top n matched documents plus their indexes
    Return list of tuples where first position in each tuple is index and second position is the probability
    """
    distances = cosine_similarity(input_doc_embedding, all_docs_embeddings)[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1], reverse=True)
    res_with_doc_index = []
    return results

def rank_most_similar_documents_docid(input_doc_embedding, all_docs_embeddings, doc_ids_all_embeddings):
    """
    Get top n matched documents plus their indexes
    Return list of tuples where first position in each tuple is index and second position is the probability
    """
    distances = cosine_similarity(input_doc_embedding, all_docs_embeddings)[0]
    results = zip(doc_ids_all_embeddings, distances)
    results = sorted(results, key=lambda x: x[1], reverse=True)
    res_with_doc_index = []
    return results

def rank_all_embeddings(all_embeddings, doc_index_list):
    rank_probs_all = []
    rank_inds_all = []
    for i in range(len(doc_index_list)):
        rank_results = rank_most_similar_documents(all_embeddings[i], all_embeddings)
        rank_probs = []
        rank_inds = []
        for ind,prob in rank_results:
            if ind != i:
                rank_probs.append(prob)
                rank_inds.append(doc_index_list[ind])
        rank_probs_all.append(rank_probs)
        rank_inds_all.append(rank_inds)
    return rank_probs_all, rank_inds_all

def rank_all_keyword_overlap_embeddings(search_df, doc_overlap_dict, all_embeds, id_to_ind):
    rank_probs_all = []
    rank_inds_all = []
    for i in range(len(search_df['doc_index'])):
        # get list of embeddings for just the input document
        if search_df['doc_index'][i] in doc_overlap_dict:
            overlap_inner_dict = doc_overlap_dict[search_df['doc_index'][i]]
            overlap_docs = overlap_inner_dict.keys()
            overlap_embeds = []
            for doc_id in overlap_docs:
                the_embed = all_embeds[id_to_ind[doc_id]].toarray().flatten()
                overlap_embeds.append(the_embed)
            if overlap_embeds:
                overlap_embeds = np.stack(overlap_embeds)
                #print(overlap_embeds.shape)
                rank_results = rank_most_similar_documents_docid(all_embeds[i], overlap_embeds, overlap_docs)
                rank_probs = []
                rank_inds = []
                for d_id,prob in rank_results:
                    rank_probs.append(prob)
                    rank_inds.append(d_id)
                rank_probs_all.append(rank_probs)
                rank_inds_all.append(rank_inds)
            else:
                rank_probs_all.append([])
                rank_inds_all.append([])
        else:
            rank_probs_all.append([])
            rank_inds_all.append([])
    return rank_probs_all, rank_inds_all

def create_guess_output(rank_probs, rank_inds, doc_ids, THRESH):
    """
    Returns a df that contains doc_id and matching reference docs
    THRESH controls threshold of match probability   
    """
    probs_to_save = []
    inds_to_save = []
    for i in range(len(rank_inds)):
        this_ind_probs_to_save = []
        this_ind_inds_to_save = []
        this_ind = 0
        prob = rank_probs[i][this_ind]
        while prob >= THRESH:
            this_ind_probs_to_save.append(prob)
            this_ind_inds_to_save.append(rank_inds[i][this_ind])
            this_ind += 1
            prob = rank_probs[i][this_ind]
        probs_to_save.append(this_ind_probs_to_save)
        inds_to_save.append(this_ind_inds_to_save)

    # Creating output df with index and match
    dict_list = list()
    for i in range(len(doc_ids)):
        if len(inds_to_save[i]) == 0:
            pass
        else:
            for ind in inds_to_save[i]:
                new_row = {'Test' :doc_ids[i], "Reference": ind}
                dict_list.append(new_row)
    output_df = pd.DataFrame(dict_list)
    return output_df

def check_training_results(training_labels_df, guesses_df):
    # This is stupid but it works
    # Returns f1_measure (AI cup eval metric)
    output_touples = [tuple(r) for r in guesses_df.to_numpy()]
    training_label_touples = [tuple(r) for r in training_labels_df.to_numpy()]
    n_correct_guesses = len(set(output_touples) & set(training_label_touples))
    n_correct_guesses
    n_guesses = len(output_touples)
    n_correct_answers = len(training_label_touples)
    precision = n_correct_guesses / n_guesses
    recall = n_correct_guesses / n_correct_answers
    f1_measure = 2*((precision * recall)/(precision + recall))
    # print("Correct Guesses: {}\nNumber of Guesses: {}\nPrecision: {} \n Recall: {}\n Score: {}".format(n_correct_guesses, n_guesses, precision, recall, f1_measure))
    # print(f1_measure)
    return f1_measure

In [3]:
def tfidf_cosine_matches(df, tfidf, doc_tokens_col, THRESH):
    """
    Processes df, returns guesses df with given doc_tokens_col, threshold, and tfidf
    """
    
    doc_tokens = df[doc_tokens_col]
    tfidf_embeddings = tfidf.fit_transform(doc_tokens)
    
    doc_ids = df['doc_index']
    rank_probs, rank_inds = rank_all_embeddings(tfidf_embeddings, doc_ids)
    
    pickle.dump(tfidf, open('tfidf_model.pkl', 'wb'))
    pickle.dump(tfidf_embeddings, open('tfidf_embeddings.pkl', 'wb'))

    guesses_output = create_guess_output(rank_probs, rank_inds, doc_ids, THRESH)
    return guesses_output

# Example

In [4]:
# Using previous key list to illustrate add_extra_keywords
KEYWORD_PATH = 'other_info/Keywords'
chem_syn = pd.read_excel(KEYWORD_PATH + "/02chem.list.xlsx").fillna(0)
crop_syn = pd.read_excel(KEYWORD_PATH + "/02crop.list.xlsx").fillna(0)
pest_syn = pd.read_excel(KEYWORD_PATH + "/02pest.list.xlsx").fillna(0)

# chem
s_len = chem_syn.shape[0]
chem_cols = chem_syn.columns
chem_list = list()
crop_list = list()
pest_list = list()
for i in range(s_len):
    # find the longest syn.
    base_word = chem_syn['synonym1'][i]
    for c in chem_cols:
        if chem_syn[c][i]!= 0 and len(chem_syn[c][i]) > len(base_word):
            base_word = chem_syn[c][i]
    chem_list.append(base_word)
    

# crop
s_len = crop_syn.shape[0]
crop_cols = crop_syn.columns
for i in range(s_len):
    # find the longest syn.
    base_word = crop_syn['synonym1'][i]
    for c in crop_cols:
        if crop_syn[c][i]!= 0 and len(crop_syn[c][i]) > len(base_word):
            base_word = crop_syn[c][i]
    crop_list.append(base_word)
            

# pest
s_len = pest_syn.shape[0]
pest_cols = pest_syn.columns
for i in range(s_len):
    # find the longest syn.
    base_word = pest_syn['synonym1'][i]
    for c in pest_cols:
        if pest_syn[c][i]!= 0 and len(pest_syn[c][i]) > len(base_word):
            base_word = pest_syn[c][i]
    pest_list.append(base_word)
    


key_list = (chem_list + crop_list + pest_list)


In [5]:
training_labels = pd.read_csv('other_info/TrainLabel.csv')

train_df = correctly_read_csv("processed_data_new.csv")

train_df[:1]


Unnamed: 0,doc_index,raw_text,tokens_rep,tokens_num,reference
0,1,梅雨季來臨文旦黑點病易發生請注意病徵以及早加強防治措施5月已進入梅雨季節近日連續降雨為文旦黑...,"[梅雨季, 來臨, 麻豆文旦, 黑點病, 易, 發生, 請, 注意, 病徵, 以, 及早, ...",147,[]


In [6]:
train_tokens = train_df['tokens_rep']
train_ids = train_df['doc_index']
tagged_data = [TaggedDocument(words=train_tokens[i], tags=[str(train_ids[i])]) for i in train_df.index]

In [7]:
# tagged_data

In [8]:
model = gensim.models.Doc2Vec(vector_size=30, min_count=2, epochs = 80)
model.build_vocab(tagged_data)

In [9]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=80)

In [10]:
model.save("d2v.model")

In [14]:


similar_doc = model.docvecs.most_similar('1000')
print(similar_doc)

[('1005', 0.9590003490447998), ('1007', 0.9016110897064209), ('1015', 0.8284493684768677), ('1133', 0.8246675133705139), ('598', 0.81834477186203), ('279', 0.7886989116668701), ('1369', 0.7857670187950134), ('296', 0.7828602194786072), ('588', 0.7646927237510681), ('305', 0.7568822503089905)]


  similar_doc = model.docvecs.most_similar('1000')


In [27]:
def create_guess_output_d2v(model, doc_ids, THRESH):
    """
    Returns a df that contains doc_id and matching reference docs
    THRESH controls threshold of match probability   
    """
    dict_list = list()
    for i in range(len(doc_ids)):
        similar_docs = model.docvecs.most_similar(str(doc_ids[i]), topn=len(model.dv))
        this_ind = 0
        prob = similar_docs[this_ind][1]
        while prob >= THRESH:
            new_row = {'Test' :doc_ids[i], "Reference": int(similar_docs[this_ind][0])}   
            dict_list.append(new_row)         
            this_ind += 1
            try:
                prob = similar_docs[this_ind][1]
            except:
                prob = THRESH-1
    output_df = pd.DataFrame(dict_list)
    return output_df

def check_training_results(training_labels_df, guesses_df):
    # This is stupid but it works
    # Returns f1_measure (AI cup eval metric)
    output_touples = [tuple(r) for r in guesses_df.to_numpy()]
    training_label_touples = [tuple(r) for r in training_labels_df.to_numpy()]
    n_correct_guesses = len(set(output_touples) & set(training_label_touples))
    n_correct_guesses
    n_guesses = len(output_touples)
    n_correct_answers = len(training_label_touples)

    precision = n_correct_guesses / n_guesses
    recall = n_correct_guesses / n_correct_answers
    print("Correct Guesses: {}\nNumber of Guesses: {}\nPrecision: {} \n Recall: {}\n Score: ".format(n_correct_guesses, n_guesses, precision, recall))

    f1_measure = 2*((precision * recall)/(precision + recall))
    print("Correct Guesses: {}\nNumber of Guesses: {}\nPrecision: {} \n Recall: {}\n Score: {}".format(n_correct_guesses, n_guesses, precision, recall, f1_measure))
    # print(f1_measure)
    return f1_measure


In [28]:
check_training_results(training_labels, create_guess_output_d2v(model, train_ids, .7))

  similar_docs = model.docvecs.most_similar(str(doc_ids[i]), topn=len(model.dv))


Correct Guesses: 1050
Number of Guesses: 14264
Precision: 0.07361189007291083 
 Recall: 0.7592190889370932
 Score: 
Correct Guesses: 1050
Number of Guesses: 14264
Precision: 0.07361189007291083 
 Recall: 0.7592190889370932
 Score: 0.1342110308685371


0.1342110308685371

In [29]:
import multiprocessing
import workers
from tqdm import tqdm


params = {
    'vector_size' : list(range(10, 100, 5)),
    'min_count' : [1],
    'epochs' : [10],
    'window' : [1,2,3,4,5],
    'dm' : [0],
    "hs" :[1]
    # 'threshold' : [x / 100.0 for x in range(50, 100, 10)],
}
# parameters_combos = (dict(zip(params.keys(), values)) for values in itertools.product(*params.values()))

# print(len(list(parameters_combos)))
results_list = list()


for x in tqdm([x / 100.0 for x in range(50, 100, 5)]):
    params['threshold'] = [x]
    parameters_combos = (dict(zip(params.keys(), values)) for values in itertools.product(*params.values()))
    pool = multiprocessing.Pool()
    results = pool.starmap(workers.multiprocess_doc2vec, zip(parameters_combos, itertools.repeat(tagged_data), itertools.repeat(training_labels), itertools.repeat(train_ids)))
    results_list.append(results)


100%|██████████| 10/10 [16:14<00:00, 97.46s/it]


In [30]:
results_list_new = list()
for n in range(len(results_list)):
    for i in range(len(results_list[n])):
        results_list_new.append(results_list[n][i])
results_list_new
results_df = pd.DataFrame(results_list_new)
results_df.nlargest(n=50, columns=['score'])
results_df_sorted = results_df.sort_values(by=['score'], ascending=False)
results_df_sorted[0:50]

Unnamed: 0,vector_size,min_count,epochs,window,dm,hs,threshold,score,precision,recall
3334,30,1,10,4,0,1,0.95,0.389506,0.364151,0.418655
3142,75,1,10,1,0,1,0.9,0.388552,0.323699,0.4859
3446,60,1,10,2,0,1,0.95,0.3885,0.396241,0.381056
3462,65,1,10,1,0,1,0.95,0.388231,0.411669,0.367317
3366,40,1,10,2,0,1,0.95,0.388156,0.383099,0.393348
3198,85,1,10,5,0,1,0.9,0.387583,0.323643,0.483008
3430,55,1,10,3,0,1,0.95,0.386933,0.388484,0.385394
3486,70,1,10,2,0,1,0.95,0.386925,0.407853,0.36804
3362,40,1,10,1,0,1,0.95,0.386911,0.384835,0.389009
3442,60,1,10,1,0,1,0.95,0.386782,0.402344,0.372379
