# In [9]:
import pandas as pd
import csv
import json

# TODO: CLEAN DATES
# TODO: CLEAN STRANGE SYMBOLS
# TODO: AIRFLOW
# WHY JSON

def read_csv_to_dict(csv_filename):
    """Read a CSV file into a list of row dictionaries.

    The first line of the file supplies the column names; each following
    row becomes one dict keyed by those names (``csv.DictReader``).

    Args:
        csv_filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order. A file that
        contains only a header line yields an empty list.
    """
    # The ``with`` block guarantees the file handle is closed, and
    # ``newline=""`` lets the csv module do its own line-ending handling
    # (required for quoted fields that contain line breaks).
    with open(csv_filename, newline="") as csv_file:
        return list(csv.DictReader(csv_file))

def search_word_in_text(word, text):
    """Return whether ``word`` occurs in ``text``, ignoring case.

    The check is a plain substring test on the lower-cased strings, so
    ``word`` also matches when it appears inside a longer word.

    Args:
        word: The term to look for. ``None`` or an empty string never
            matches.
        text: The text to search in. ``None`` or an empty string never
            matches.

    Returns:
        ``True`` if ``word`` is found in ``text``, otherwise ``False``.
    """
    # An empty term is a substring of every string; treating it as a match
    # would report a mention for every row, so reject it up front. The same
    # guard covers None on either side (e.g. a missing CSV cell).
    if not word or not text:
        return False
    return word.lower() in text.lower()

def find_mentions_list_dict(input_list_dict, input_column_id, search_list_dict, search_column_id, data_type):
    """Build one entry per (input row, search row) pair where the input term is mentioned.

    For every row of ``input_list_dict`` the value of ``input_column_id`` is
    searched (case-insensitively, via ``search_word_in_text``) in the
    ``search_column_id`` value of every row of ``search_list_dict``.

    Args:
        input_list_dict: List of dicts holding the terms to look for
            (e.g. the drugs dataset).
        input_column_id: Key of the term inside each input dict.
        search_list_dict: List of dicts to search in (e.g. pubmd or
            clinical trials rows). Each row must have a ``'date'`` key, and
            a ``'journal'`` key when ``data_type`` is ``'journal'``.
        search_column_id: Key of the text to search inside each search dict.
        data_type: Label stored under ``'type'`` in every entry. The value
            ``'journal'`` makes the entry carry the row's journal instead of
            its searched text.

    Returns:
        A list of dicts. Each entry is a copy of the matching input row
        extended with ``'type'``, ``'date_mention'`` and either ``'journal'``
        (when ``data_type == 'journal'``) or ``'title'`` (the searched text).

    Raises:
        KeyError: If a required key is missing from an input or search row.

    Example entry (for an input row with ``atccode`` and ``drug`` columns):
        {
            "atccode": "A04AD",
            "drug": "DIPHENHYDRAMINE",
            "type": "pubmd",
            "date_mention": "01/01/2019",
            "title": "A 44-year-old man with erythema of the face diphenhydramine, neck, and chest, weakness, and palpitations"
        }
    """
    mention_list = []
    for input_dict in input_list_dict:
        term = input_dict[input_column_id]

        for search_dict in search_list_dict:
            searched_text = search_dict[search_column_id]
            if not search_word_in_text(term, searched_text):
                continue

            # Work on a copy so the caller's input rows are never mutated.
            entry_dict = input_dict.copy()
            entry_dict['type'] = data_type
            entry_dict['date_mention'] = search_dict['date']

            if data_type == 'journal':
                entry_dict['journal'] = search_dict['journal']
            else:
                entry_dict['title'] = searched_text

            mention_list.append(entry_dict)

    return mention_list

def dict_to_json(input_dict, out_path):
    """Serialize ``input_dict`` as JSON into the file at ``out_path``.

    The file is created or overwritten, and the JSON is pretty-printed with
    a four-space indent; keys keep their existing order.

    Args:
        input_dict: The JSON-serializable object to save.
        out_path: Path of the file to write.
    """
    with open(out_path, "w") as json_file:
        json.dump(obj=input_dict, fp=json_file, indent=4)


# Load the three input datasets as lists of row dicts

drugs_list_dict = read_csv_to_dict('data/drugs.csv')
pubmd_list_dict = read_csv_to_dict('data/pubmed.csv')
clinical_trials_list_dict = read_csv_to_dict('data/clinical_trials.csv')

# Drug names mentioned in pubmd titles

pubmd_mentions_list_of_dicts = find_mentions_list_dict(
    drugs_list_dict, 'drug', pubmd_list_dict, 'title', 'pubmd')

# Drug names mentioned in clinical trial scientific titles

clinical_trial_mentions_list_of_dicts = find_mentions_list_dict(
    drugs_list_dict, 'drug', clinical_trials_list_dict, 'scientific_title', 'clinical_trials')

# Journal mentions: same search over pubmd titles, tagged with the 'journal' type

journal_mentions_list_of_dicts = find_mentions_list_dict(
    drugs_list_dict, 'drug', pubmd_list_dict, 'title', 'journal')

# Concatenate the three result lists, keeping their order
all_mentions_list_of_dicts = [
    *pubmd_mentions_list_of_dicts,
    *clinical_trial_mentions_list_of_dicts,
    *journal_mentions_list_of_dicts,
]

# Write the merged mentions out as JSON
dict_to_json(all_mentions_list_of_dicts, "output/graph_entries.json")
