In [2]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import csv
import itertools

# Functions

In [3]:
def correctly_read_csv(fname):
    prep_df = pd.read_csv(fname, converters={"tokens_rep": literal_eval, "tokens": literal_eval, "reference": literal_eval})
    return prep_df

def add_extra_keywords(df, key_list, n_repeat):
    """
    Takes df and keyword list, adds n_repeat occurances of keyword to new column when found in doc
    """
    df['tokens_rep_extra'] = df['tokens_rep']
    for ind in df.index:
        match_list = list()
        tokens_to_raw = "".join(df['tokens_rep'][ind])
        for keyword in key_list:
            n_instances = tokens_to_raw.count(keyword)
            i = 0
            while i < n_instances:
                match_list.append(keyword)
                i += 1
        if len(match_list) > 0:
            i = 0
            while i < n_repeat:
                df['tokens_rep_extra'][ind] = df['tokens_rep_extra'][ind] + match_list
                i += 1
    return df['tokens_rep_extra']

def only_keywords(df, key_list):
    """
    Will keep only keywords
    Right now is returning some blanks, doesn't work on all data sets
    """
    df['tokens_rep_only'] = ""
    for ind in df.index:
        match_list = list()
        for keyword in key_list:
            # Making everything into a bock of text to find broken up keywords
            if keyword in "".join(df['tokens_rep'][ind]):
                count = 0
                while count < "".join(df['tokens_rep'][ind]).count(keyword):
                    match_list.append(keyword)
                    count += 1
        if len(match_list) > 0:
            df['tokens_rep_only'][ind] = match_list
        else:
            print(df['doc_index'][ind])
            df['tokens_rep_only'][ind] = ['this is bad']
    return df['tokens_rep_only']

def fake_tokenizer(tokens):
    return tokens

def get_all_rep_token_strings(token_list):
    all_rep_token_strings = []
    for d in token_list:
        all_rep_token_strings.append(''.join(d))
    return all_rep_token_strings

def get_key_doc_dict(all_rep_token_strings, doc_ind_list):
    # key word, list of document ids that contain it
    key_doc_dict = {}
    for k in all_key_list:
        k = k.strip()
        if k:
            k_list = []
            for i in range(len(all_rep_token_strings)):
                if k in all_rep_token_strings[i]:
                    k_list.append(doc_ind_list[i])
            if k_list:
                key_doc_dict[k] = k_list
    return key_doc_dict

def get_doc_to_keyword_lst_dict(key_doc_dict):
    # dict with doc_id as key and list of keywords in it
    doc_to_keyword_lst_dict = {}
    for k,lst in key_doc_dict.items():
        for doc_id in lst:
            if doc_id in doc_to_keyword_lst_dict:
                doc_to_keyword_lst_dict[doc_id].append(k)
            else:
                doc_to_keyword_lst_dict[doc_id] = [k]
    return doc_to_keyword_lst_dict

def get_doc_to_docs_with_overlap_dict(doc_to_keyword_lst_dict):
    # get list of lists of overlapping keys per document
    doc_to_docs_with_overlap_dict = {}
    for doc_id, keywords_lst in doc_to_keyword_lst_dict.items():
        keyword_set = set(keywords_lst)
        for doc_id_next, keywords_lst_next in doc_to_keyword_lst_dict.items():
            if doc_id_next != doc_id:
                keyword_set_next = set(keywords_lst_next)
                intersection = keyword_set.intersection(keyword_set_next)
                if intersection:
                    if doc_id in doc_to_docs_with_overlap_dict:
                        doc_to_docs_with_overlap_dict[doc_id][doc_id_next] = intersection
                    else:
                        doc_to_docs_with_overlap_dict[doc_id] = {doc_id_next: intersection}
    return doc_to_docs_with_overlap_dict

def rank_most_similar_documents(input_doc_embedding, all_docs_embeddings):
    """
    Get top n matched documents plus their indexes
    Return list of tuples where first position in each tuple is index and second position is the probability
    """
    distances = cosine_similarity(input_doc_embedding, all_docs_embeddings)[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1], reverse=True)
    res_with_doc_index = []
    return results

def rank_most_similar_documents_docid(input_doc_embedding, all_docs_embeddings, doc_ids_all_embeddings):
    """
    Get top n matched documents plus their indexes
    Return list of tuples where first position in each tuple is index and second position is the probability
    """
    distances = cosine_similarity(input_doc_embedding, all_docs_embeddings)[0]
    results = zip(doc_ids_all_embeddings, distances)
    results = sorted(results, key=lambda x: x[1], reverse=True)
    res_with_doc_index = []
    return results

def rank_all_embeddings(all_embeddings, doc_index_list):
    rank_probs_all = []
    rank_inds_all = []
    for i in range(len(doc_index_list)):
        rank_results = rank_most_similar_documents(all_embeddings[i], all_embeddings)
        rank_probs = []
        rank_inds = []
        for ind,prob in rank_results:
            if ind != i:
                rank_probs.append(prob)
                rank_inds.append(doc_index_list[ind])
        rank_probs_all.append(rank_probs)
        rank_inds_all.append(rank_inds)
    return rank_probs_all, rank_inds_all

def rank_all_keyword_overlap_embeddings(search_df, doc_overlap_dict, all_embeds, id_to_ind):
    rank_probs_all = []
    rank_inds_all = []
    for i in range(len(search_df['doc_index'])):
        # get list of embeddings for just the input document
        if search_df['doc_index'][i] in doc_overlap_dict:
            overlap_inner_dict = doc_overlap_dict[search_df['doc_index'][i]]
            overlap_docs = overlap_inner_dict.keys()
            overlap_embeds = []
            for doc_id in overlap_docs:
                the_embed = all_embeds[id_to_ind[doc_id]].toarray().flatten()
                overlap_embeds.append(the_embed)
            if overlap_embeds:
                overlap_embeds = np.stack(overlap_embeds)
                #print(overlap_embeds.shape)
                rank_results = rank_most_similar_documents_docid(all_embeds[i], overlap_embeds, overlap_docs)
                rank_probs = []
                rank_inds = []
                for d_id,prob in rank_results:
                    rank_probs.append(prob)
                    rank_inds.append(d_id)
                rank_probs_all.append(rank_probs)
                rank_inds_all.append(rank_inds)
            else:
                rank_probs_all.append([])
                rank_inds_all.append([])
        else:
            rank_probs_all.append([])
            rank_inds_all.append([])
    return rank_probs_all, rank_inds_all

def create_guess_output(rank_probs, rank_inds, doc_ids, THRESH):
    """
    Returns a df that contains doc_id and matching reference docs
    THRESH controls threshold of match probability   
    """
    probs_to_save = []
    inds_to_save = []
    for i in range(len(rank_inds)):
        this_ind_probs_to_save = []
        this_ind_inds_to_save = []
        this_ind = 0
        prob = rank_probs[i][this_ind]
        while prob >= THRESH:
            this_ind_probs_to_save.append(prob)
            this_ind_inds_to_save.append(rank_inds[i][this_ind])
            this_ind += 1
            prob = rank_probs[i][this_ind]
        probs_to_save.append(this_ind_probs_to_save)
        inds_to_save.append(this_ind_inds_to_save)

    # Creating output df with index and match
    dict_list = list()
    for i in range(len(doc_ids)):
        if len(inds_to_save[i]) == 0:
            pass
        else:
            for ind in inds_to_save[i]:
                new_row = {'Test' :doc_ids[i], "Reference": ind}
                dict_list.append(new_row)
    output_df = pd.DataFrame(dict_list)
    return output_df

def check_training_results(training_labels_df, guesses_df):
    # This is stupid but it works
    # Returns f1_measure (AI cup eval metric)
    output_touples = [tuple(r) for r in guesses_df.to_numpy()]
    training_label_touples = [tuple(r) for r in training_labels_df.to_numpy()]
    n_correct_guesses = len(set(output_touples) & set(training_label_touples))
    n_correct_guesses
    n_guesses = len(output_touples)
    n_correct_answers = len(training_label_touples)
    precision = n_correct_guesses / n_guesses
    recall = n_correct_guesses / n_correct_answers
    f1_measure = 2*((precision * recall)/(precision + recall))
    # print("Correct Guesses: {}\nNumber of Guesses: {}\nPrecision: {} \n Recall: {}\n Score: {}".format(n_correct_guesses, n_guesses, precision, recall, f1_measure))
    # print(f1_measure)
    return f1_measure

In [4]:
def tfidf_cosine_matches(df, tfidf, doc_tokens_col, THRESH):
    """
    Processes df, returns guesses df with given doc_tokens_col, threshold, and tfidf
    """
    
    doc_tokens = df[doc_tokens_col]
    tfidf_embeddings = tfidf.fit_transform(doc_tokens)
    
    doc_ids = df['doc_index']
    rank_probs, rank_inds = rank_all_embeddings(tfidf_embeddings, doc_ids)
    
    pickle.dump(tfidf, open('tfidf_model.pkl', 'wb'))
    pickle.dump(tfidf_embeddings, open('tfidf_embeddings.pkl', 'wb'))

    guesses_output = create_guess_output(rank_probs, rank_inds, doc_ids, THRESH)
    return guesses_output

# Example

In [5]:
training_labels = pd.read_csv('other_info/TrainLabel.csv')

train_df = correctly_read_csv("processed_data.csv")
tfidf = TfidfVectorizer(sublinear_tf=True, tokenizer=fake_tokenizer, lowercase=False)

training_guess = tfidf_cosine_matches(train_df, tfidf, "tokens_rep", .5)

check_training_results(training_labels, training_guess)

0.39215686274509803

In [6]:
# Using previous key list to illustrate add_extra_keywords
KEYWORD_PATH = 'other_info/Keywords'
chem_syn = pd.read_excel(KEYWORD_PATH + "/02chem.list.xlsx").fillna(0)
crop_syn = pd.read_excel(KEYWORD_PATH + "/02crop.list.xlsx").fillna(0)
pest_syn = pd.read_excel(KEYWORD_PATH + "/02pest.list.xlsx").fillna(0)

# chem
s_len = chem_syn.shape[0]
chem_cols = chem_syn.columns
chem_list = list()
crop_list = list()
pest_list = list()
for i in range(s_len):
    # find the longest syn.
    base_word = chem_syn['synonym1'][i]
    for c in chem_cols:
        if chem_syn[c][i]!= 0 and len(chem_syn[c][i]) > len(base_word):
            base_word = chem_syn[c][i]
    chem_list.append(base_word)
    

# crop
s_len = crop_syn.shape[0]
crop_cols = crop_syn.columns
for i in range(s_len):
    # find the longest syn.
    base_word = crop_syn['synonym1'][i]
    for c in crop_cols:
        if crop_syn[c][i]!= 0 and len(crop_syn[c][i]) > len(base_word):
            base_word = crop_syn[c][i]
    crop_list.append(base_word)
            

# pest
s_len = pest_syn.shape[0]
pest_cols = pest_syn.columns
for i in range(s_len):
    # find the longest syn.
    base_word = pest_syn['synonym1'][i]
    for c in pest_cols:
        if pest_syn[c][i]!= 0 and len(pest_syn[c][i]) > len(base_word):
            base_word = pest_syn[c][i]
    pest_list.append(base_word)
    


key_list = (chem_list + crop_list + pest_list)


In [7]:
# train_df['tokens_rep_extra'] = add_extra_keywords(train_df, key_list, n_repeat = 2)
# training_guess_2extra = tfidf_cosine_matches(train_df, tfidf, "tokens_rep_extra", .8)

# check_training_results(training_labels, training_guess_2extra)

In [8]:
params = {
    'ngram_range' : [(1,1), (1,2), (2,2)],
    'max_df' : [x / 100.0 for x in range(80, 105, 5)],
    'min_df' : [x for x in range(5)],
    # # # Might test max_features later
    # # # "max_features" : list(range(100, 600, 50)),
    # 'binary' : [True, False],
    # 'norm' : ['l1', 'l2'],
    # 'use_idf' : ['True', 'False'],
    # # 'smooth_idf' : ['True', 'False'],
    'sublinear_tf' : ['True','False'],
    'threshold' : [x / 100.0 for x in range(50, 100, 5)],
}

parameters_combos = (dict(zip(params.keys(), values)) for values in itertools.product(*params.values()))

In [9]:
def multiprocess_tfidf(params_combo, df):
    best_score = 0
    best_combo = ""
    for combo in parameters_combos:
        print(combo)
        tfidf = TfidfVectorizer(
            # ngram_range = combo['ngram_range'],
            # max_df = combo['max_df'],
            # min_df = combo['min_df'],
            # binary = combo['binary'],
            # norm = combo['norm'],
            # use_idf = combo['use_idf'],
            # smooth_idf = combo['smooth_idf'],
            # sublinear_tf = combo['sublinear_tf'],
            tokenizer=fake_tokenizer,
            lowercase=False
        )
        try:
            training_guess = tfidf_cosine_matches(df, tfidf, "tokens_rep", THRESH = combo['threshold'])
            score = check_training_results(training_labels, training_guess)
            combo['score'] = score
            if score > best_score:
                best_score = score
                best_combo = [combo, score]
                print("new best! {}".format(best_combo))
        except:
            pass
    return {"best_combo" : best_combo, "best_score": best_score, "full_results" : params_combo}

In [49]:
import multiprocessing
import workers
from tqdm import tqdm

# results = multiprocess_tfidf(parameters_combos, train_df)

# print(len(parameters_combos))


# pool = multiprocessing.Pool()

# results = pool.starmap(workers.multiprocess_tfidf, zip(parameters_combos, itertools.repeat(train_df), itertools.repeat(training_labels)))

results_list = list()
for i in tqdm(range(50, 100, 25)):
    parameters_combos = (dict(zip(params.keys(), values)) for values in itertools.product(*params.values()))
    train_df['tokens_rep_extra'] = add_extra_keywords(train_df, key_list, n_repeat = i)
    pool = multiprocessing.Pool()

    results = pool.starmap(workers.multiprocess_tfidf, zip(parameters_combos, itertools.repeat(train_df), itertools.repeat(training_labels)))
    results_temp_df = pd.DataFrame(results)
    results_temp_df["n_repeat"] = i
    results_list.append(results_temp_df)
 





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens_rep_extra'][ind] = df['tokens_rep_extra'][ind] + match_list
100%|██████████| 2/2 [14:05<00:00, 422.94s/it]


In [60]:
# results_df_2 = pd.concat(results_list)
# results_df_2.nlargest(n=50, columns=['score'])
results_df_sorted2 = results_df2.sort_values(by=['score'], ascending=False)
results_df_sorted2[0:50]

NameError: name 'results_df2' is not defined

In [51]:
results_df = pd.concat(results_list)
results_df.nlargest(n=50, columns=['score'])
results_df[results_df['n_repeat'] == 21]
results_df_sorted = results_df.sort_values(by=['score'], ascending=False)
results_df_sorted[0:50]
# results_df_sorted[results_df_sorted['n_repeat'] == 21][0:50]

Unnamed: 0,ngram_range,max_df,min_df,sublinear_tf,threshold,score,precision,recall,n_repeat
955,"(1, 2)",1.0,2,False,0.75,0.49328,0.49562,0.490962,50
125,"(1, 1)",0.85,1,True,0.75,0.49328,0.49562,0.490962,50
255,"(1, 1)",0.9,2,False,0.75,0.49328,0.49562,0.490962,50
805,"(1, 2)",0.95,0,True,0.75,0.49328,0.49562,0.490962,50
265,"(1, 1)",0.9,3,True,0.75,0.49328,0.49562,0.490962,50
1185,"(2, 2)",0.85,4,True,0.75,0.49328,0.49562,0.490962,50
1195,"(2, 2)",0.85,4,False,0.75,0.49328,0.49562,0.490962,50
275,"(1, 1)",0.9,3,False,0.75,0.49328,0.49562,0.490962,50
795,"(1, 2)",0.9,4,False,0.75,0.49328,0.49562,0.490962,50
285,"(1, 1)",0.9,4,True,0.75,0.49328,0.49562,0.490962,50


In [None]:
training_labels = pd.read_csv('other_info/TrainLabel.csv')

train_df = correctly_read_csv("processed_data.csv")
tfidf = TfidfVectorizer(sublinear_tf=True, 
binary = False, 
tokenizer=fake_tokenizer, 
lowercase=False,
smooth_idf=True,
# max_df = .5,
# min_df = .4,
norm= "l1"
)

training_guess = tfidf_cosine_matches(train_df, tfidf, "tokens_rep", .6)

check_training_results(training_labels, training_guess)



0.40382493563810223

In [43]:
# df = train_df
# df['match'] = 0
# all_pairs = list()
# for keyword in tqdm(key_list):
#     keyword_match = list()
#     for ind in df.index:
#         if keyword in "".join(df['tokens_rep'][ind]):
#             df['match'][ind] = 1
#     keyword_match = df[df['match']>0]['doc_index'].tolist()
#     keyword_match_pairs = list(itertools.permutations(keyword_match, 2))
#     all_pairs = all_pairs + keyword_match_pairs

# all_pairs

all_keywords = pd.concat([chem_syn, crop_syn, pest_syn])
all_key_list = all_keywords.synonym1.tolist() + all_keywords.synonym2.tolist() + all_keywords.synonym3.tolist() + all_keywords.synonym4.tolist() + all_keywords.synonym5.tolist() + all_keywords.synonym6.tolist() + all_keywords.synonym7.tolist()
all_key_list = [k for k in all_key_list if (k != 0 and type(k) != float)]
all_key_set = set(all_key_list)


test_tokens = train_df['tokens_rep']
joined_docs = get_all_rep_token_strings(test_tokens)    
key_doc_dict = get_key_doc_dict(joined_docs, train_df['doc_index'])
doc_to_keyword_lst_dict = get_doc_to_keyword_lst_dict(key_doc_dict)
doc_to_docs_with_overlap_dict = get_doc_to_docs_with_overlap_dict(doc_to_keyword_lst_dict)           
id_to_ind = [(train_df['doc_index'][i],i) for i in range(len(train_df['doc_index']))]
id_to_ind
ind_to_id = [(id_to_ind[i][1], id_to_ind[i][0]) for i in range(len(id_to_ind))]
ind_to_id
all_matches = ind_to_id + id_to_ind
all_matches
# TODO all permutations of IDs that match
# TODO change number of matches


[(0, 1),
 (1, 10),
 (2, 1000),
 (3, 1005),
 (4, 1007),
 (5, 1010),
 (6, 1011),
 (7, 1015),
 (8, 1016),
 (9, 1023),
 (10, 1025),
 (11, 103),
 (12, 1033),
 (13, 1037),
 (14, 1039),
 (15, 104),
 (16, 1041),
 (17, 1049),
 (18, 105),
 (19, 1052),
 (20, 1054),
 (21, 1056),
 (22, 1066),
 (23, 1067),
 (24, 1068),
 (25, 107),
 (26, 1073),
 (27, 1074),
 (28, 1075),
 (29, 1076),
 (30, 108),
 (31, 1082),
 (32, 1086),
 (33, 1087),
 (34, 1088),
 (35, 1093),
 (36, 110),
 (37, 1103),
 (38, 1111),
 (39, 1114),
 (40, 1116),
 (41, 1118),
 (42, 1119),
 (43, 1120),
 (44, 1125),
 (45, 1126),
 (46, 1127),
 (47, 1128),
 (48, 113),
 (49, 1130),
 (50, 1132),
 (51, 1133),
 (52, 1141),
 (53, 1142),
 (54, 1146),
 (55, 1147),
 (56, 1148),
 (57, 1151),
 (58, 1153),
 (59, 1155),
 (60, 1157),
 (61, 1159),
 (62, 1161),
 (63, 1165),
 (64, 1166),
 (65, 1167),
 (66, 117),
 (67, 1170),
 (68, 1172),
 (69, 1173),
 (70, 1176),
 (71, 1178),
 (72, 1180),
 (73, 1185),
 (74, 1186),
 (75, 1187),
 (76, 119),
 (77, 1190),
 (78, 1191

In [44]:
def create_guess_output_neg_match(rank_probs, rank_inds, doc_ids, THRESH, matches_list):
    """
    Returns a df that contains doc_id and matching reference docs
    THRESH controls threshold of match probability   
    """
    probs_to_save = []
    inds_to_save = []
    for i in range(len(rank_inds)):
        this_ind_probs_to_save = []
        this_ind_inds_to_save = []
        this_ind = 0
        prob = rank_probs[i][this_ind]
        while prob >= THRESH:
            this_ind_probs_to_save.append(prob)
            this_ind_inds_to_save.append(rank_inds[i][this_ind])
            this_ind += 1
            prob = rank_probs[i][this_ind]
        probs_to_save.append(this_ind_probs_to_save)
        inds_to_save.append(this_ind_inds_to_save)

    # Creating output df with index and match
    dict_list = list()
    for i in range(len(doc_ids)):
        if len(inds_to_save[i]) == 0:
            pass
        else:
            for ind in inds_to_save[i]:
                new_row = {'Test' :doc_ids[i], "Reference": ind}
                dict_list.append(new_row)
    output_df = pd.DataFrame(dict_list)
    
    return output_df

In [45]:

doc_ids = train_df['doc_index']
doc_tokens = train_df['tokens_rep']
tfidf_embeddings = tfidf.fit_transform(doc_tokens)
rank_probs, rank_inds = rank_all_embeddings(tfidf_embeddings, doc_ids)
check = create_guess_output_neg_match(rank_probs, rank_inds, doc_ids, THRESH = 0.4, matches_list = [])


In [46]:
check_copy = check.copy()
for ind in check.index:
    if (check['Test'][ind], check['Reference'][ind]) in all_matches:
        pass
    else:
        check = check.drop(index=ind)
print(len(check_copy), len(check))

3030 6


In [47]:
check

Unnamed: 0,Test,Reference
1658,141,171
1692,171,141
1732,222,268
1784,268,222
2351,552,977
3019,977,552


In [48]:
len(all_matches)

1120