# Recommending ICPSR datasets

Exploring the relationships between datasets based on their available variables.

## Import Modules

In [10]:
import os, json, csv
from tqdm import tqdm
tqdm.pandas()
import string
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import numpy as np
import spacy
print('spaCy Version: %s' % (spacy.__version__))
nlp = spacy.load('en')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dogrdon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dogrdon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
spaCy Version: 2.0.18
Number of stop words: 305
First ten stop words: ['sometime', 'latterly', 'whoever', 'now', 'to', 'twelve', 'these', 'did', 'while', 'about']


## Loading the data

In [2]:
studies_metadata_json = '../input_metadata/ICPSR_studies_metadata.json' # 23MB
vars_metadata_csv = '../input_metadata/all_vars.csv'                    # 1.3GB
doi_xwalk_csv = '../input_metadata/doi_xwalk.csv'                       # 509KB

### Functions

In [3]:
def get_study_url(var_url):
    '''Get url to dataset for var'''
    study_url= var_url.split('/datasets/')[0].replace('ssvd/', '')
    return study_url


def xwalk_dict(xwalk_csv):
    '''Load xwalk csv into a native dictionary for faster lookup'''
    xwalk_lookup = {}
    with open( xwalk_csv, 'r', encoding='utf-8' ) as ifile:
        reader = csv.reader(ifile)
        header = next(reader)
        for row in reader:
            xwalk_lookup[row[0]] = row[1]
    return xwalk_lookup

def trim_dataset_name(dataset_name):
    try:
        dataset_name = dataset_name.replace("Taken from: ", "").strip()
    except AttributeError as ae:
        print("{}: {}".format(ae, dataset_name))
    return dataset_name

def make_clean_tokens_nltk(label):
    '''Remove punctuation, stopwords and do tokenization with NLTK'''
    stopset = set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in label if not w in stopset]
    return tokens

def make_clean_tokens_spacy(label):
    '''This is mean to remove punctuation and tokenize but it takes waaay too long.'''
    clean_label_tokens = []
    try:
        table = str.maketrans({key: None for key in string.punctuation})
        # first remove punctuation
        label_no_punc = label.translate(table) 
        # then tokenize and remove stopwords 
        doc = nlp(label_no_punc)
        clean_label_tokens = [token.text for token in doc if not token.is_stop]
    except AttributeError as ae:
        print("{}: {}".format(ae, label))
    return clean_label_tokens

In [4]:
vars_df = pd.read_csv(vars_metadata_csv)
vars_df = vars_df.replace(np.nan, '', regex=True)

In [5]:
vars_df.head()

Unnamed: 0,var_id,var_name,var_url,var_label,var_dataset
0,1,DRG7ING4,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Ingredient code #4 for medication #7,Taken from: National Hospital Ambulatory Medic...
1,2,DRG7ING5,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Ingredient code #5 for medication #7,Taken from: National Hospital Ambulatory Medic...
2,3,GEN8,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Generic name code for medication #8,Taken from: National Hospital Ambulatory Medic...
3,4,PRESCR8,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Prescription status code for medication #8,Taken from: National Hospital Ambulatory Medic...
4,5,CONTSUB8,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Controlled substance code for medication #8,Taken from: National Hospital Ambulatory Medic...


In [6]:
print("Number of records: {}".format(len(vars_df)))
print("Number of unique variable urls: {}".format(len(vars_df.var_url.unique())))
print("Number of unique variable names: {}".format(len(vars_df.var_name.unique())))

Number of records: 4960317
Number of unique variable urls: 4960317
Number of unique variable names: 1470415


In [7]:
vars_df['study_url'] = vars_df.progress_apply(lambda x: get_study_url(x['var_url']), axis=1)

In [8]:
vars_df['dataset_name'] = vars_df.progress_apply(lambda x: trim_dataset_name(x['var_dataset']), axis=1)

In [11]:
vars_df['label_processed'] = vars_df.progress_apply(lambda x: make_clean_tokens_nltk(x['var_label']), axis=1)

100%|██████████| 4960317/4960317 [36:14<00:00, 2280.88it/s] 


In [12]:
vars_df.head()

Unnamed: 0,var_id,var_name,var_url,var_label,var_dataset,study_url,dataset_name,label_processed
0,1,DRG7ING4,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Ingredient code #4 for medication #7,Taken from: National Hospital Ambulatory Medic...,https://www.icpsr.umich.edu/icpsrweb/ICPSR/stu...,National Hospital Ambulatory Medical Care Surv...,"[I, n, g, r, e, e, n, , c, e, , , 4, , f, r..."
1,2,DRG7ING5,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Ingredient code #5 for medication #7,Taken from: National Hospital Ambulatory Medic...,https://www.icpsr.umich.edu/icpsrweb/ICPSR/stu...,National Hospital Ambulatory Medical Care Surv...,"[I, n, g, r, e, e, n, , c, e, , , 5, , f, r..."
2,3,GEN8,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Generic name code for medication #8,Taken from: National Hospital Ambulatory Medic...,https://www.icpsr.umich.edu/icpsrweb/ICPSR/stu...,National Hospital Ambulatory Medical Care Surv...,"[G, e, n, e, r, c, , n, e, , c, e, , f, r, ..."
3,4,PRESCR8,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Prescription status code for medication #8,Taken from: National Hospital Ambulatory Medic...,https://www.icpsr.umich.edu/icpsrweb/ICPSR/stu...,National Hospital Ambulatory Medical Care Surv...,"[P, r, e, c, r, p, n, , u, , c, e, , f, r, ..."
4,5,CONTSUB8,https://www.icpsr.umich.edu/icpsrweb/ICPSR/ssv...,Controlled substance code for medication #8,Taken from: National Hospital Ambulatory Medic...,https://www.icpsr.umich.edu/icpsrweb/ICPSR/stu...,National Hospital Ambulatory Medical Care Surv...,"[C, n, r, l, l, e, , u, b, n, c, e, , c, e, ..."


#### Load xwalk dictionary

In [None]:
xwalk_dict = xwalk_dict(doi_xwalk_csv)

In [None]:
xwalk_dict

In [None]:
list(vars_df.study_url.head())

## Explore the data

Rough exploration of variables related to datasets (studies)

In [None]:
datasets_df = pd.read_json(studies_metadata_json)

In [None]:
datasets_df.head()

## Approaches To Determining Similarity between datasets

1.**TF-IDF vectors**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel #from https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity
import numpy as np

In [None]:
documents = vars_df[['study_url','dataset_name', 'var_name', 'var_label']]

In [None]:
documents.head()

We are only going to develop a vector space composed of dataset variable descriptions.

In [None]:
tfidf_vec = TfidfVectorizer()

In [None]:
descriptions = documents['var_label']

In [None]:
descriptions = descriptions.fillna('')

In [None]:
descriptions_list = list(descriptions)

In [None]:
tfidf_matrix = tfidf_vec.fit_transform(descriptions_list)

In [None]:
print(tfidf_matrix.shape)

2. **Select a random document by index**

In [None]:
DOCUMENT_TARGET = 4488
#documents.iloc[ DOCUMENT_TARGET , : ]

In [None]:
#sample = list(documents.iloc[ DOCUMENT_TARGET , : ])

In [None]:
documents_list = documents.values.tolist()
sample = documents_list[DOCUMENT_TARGET]

In [None]:
sample

In [None]:
cos_sim = linear_kernel(tfidf_matrix[DOCUMENT_TARGET], tfidf_matrix).flatten()

In [None]:
filt_vars = np.asarray([x for x in cos_sim.tolist() if x > 0.10 and x < 0.9999], dtype=np.float32)

In [None]:
related_vars = cos_sim.argsort()[:-25:-1]

In [None]:
related_vars

In [None]:
for i in related_vars:
    print(cos_sim[i])

In [None]:
print("Most similar to: {}".format(documents_list[DOCUMENT_TARGET]))
column_id = documents_list[DOCUMENT_TARGET][0]
for i in related_vars:
    # remove any results where the variable is in the same dataset
    # and remove any scores that are above 0.90 (as they are probably the same variable)
    if cos_sim[i] < 0.90 and column_id != documents_list[i][0]:
        print("score: {} \n Document: {}".format(cos_sim[i], documents_list[i]))

These results aren't great. Appears that something like 1000 is what is making the relationship. We may need additional text such as the dataset description. We probably also want to do some basic text processing to the descriptions first (remove stopwords, lemmatize/stem, remove unecessary characters (`#`, `$`, etc.).

### To try: SpaCy (https://spacy.io/usage/vectors-similarity)

In [None]:
doc = nlp("This is my sentence where i am talking about the thing that thing that's being descussed. Or: you know! 123# $100.00()")
tokens = [token.text for token in doc if not token.is_stop]

print(tokens)

In [None]:
def my_component(doc):
    print("After tokenization, this doc has %s tokens." % len(doc))
    if len(doc) < 10:
        print("This is a pretty short document.")
    return doc

nlp.add_pipe(my_component, name='print_info', first=True)
print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner']
doc = nlp(u"This is a sentence.")