In [129]:
import datasets
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import gensim
import re

# Import NLTK for Text Preprocessing
from nltk.tokenize import regexp_tokenize
from nltk import pos_tag, download
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report

### Load Latest LDA Model

In [4]:
!pwd

/Users/spenno_fr/Projects/nlp-glg-mas/notebooks


In [7]:
filename = '../models/LDA/0.31/embedding/id2word_v0.3.pkl'
file = open(filename,'rb')
id2word = pickle.load(file)

lda_model = gensim.models.ldamodel.LdaModel.load('../models/LDA/0.31/model/lda_model_v0.31')
                                                 
filename = '../models/LDA/0.31/topic_mapping/topics_dict_v0.31.pkl'
file = open(filename,'rb')
topics_dict = pickle.load(file)                                                                                                                                                  

In [185]:
topics_dict

{0: 'Mexican Telecommunications',
 1: 'Education & Student Life',
 2: '',
 3: 'International Big Business',
 4: 'Drugs, Clinical Trials, Approvals',
 5: 'Robots & Robotics',
 6: 'Indian Tech & Business',
 7: 'Meat Substitutes',
 8: 'Fish',
 9: 'Israeli Tech & Business',
 10: 'Tech M&A',
 11: '',
 12: 'Vacation Rental',
 13: 'Laptops, Mobile Devices, Gadgets',
 14: 'Audio Tech',
 15: 'Climate Science',
 16: 'International Govt Relations',
 17: 'Digital Advertising',
 18: '',
 19: 'Marijuana & CBD',
 20: 'Ridesharing Services, E-Scooters & E-Bikes',
 21: '',
 22: 'Wireless Charging Technology',
 23: 'Diversity & Discrimination',
 24: 'Basketball',
 25: 'Mass Transit',
 26: 'E-Commerce & Online Delivery',
 27: 'European Alternative Energy',
 28: 'Menstrual Health',
 29: 'Cancer Treatments & Trials',
 30: '',
 31: 'Apple Devices',
 32: 'Motor Racing',
 33: 'Food Poisoning, Allergies, Household Chemicals',
 34: 'Health Insurance & Washington Legislation',
 35: 'Asian Business',
 36: 'Astrop

In [37]:
tech_labels = set(['Robots & Robotics','Indian Tech & Business','Israeli Tech & Business','Tech M&A',
               'Laptops, Mobile Devices, Gadgets','Laptops, Mobile Devices, Gadgets','Audio Tech',
               'Climate Science','Digital Advertising','Ridesharing Services, E-Scooters & E-Bikes',
               'Wireless Charging Technology','E-Commerce & Online Delivery',
               'Apple Devices','Astrophysics','Automotive Technology','Virtual Reality','Audio Streaming And Apps',
               'Photo & Video Sharing','Virtual Assistants, Smart Home Tech','Data Breaches & Data Privacy',
               'Drones & Unmanned Aerial Tech','Mobile Networks, 5G','Autonomous Vehicles','Facebook Govt Intervention',
               'Hacks & Data Theft','Gaming Platforms','Astrophysics','Battery Technology',
               'Tech Investments, VC, New Initiatives',
               'New Initiatives','Space Exploration','Big Tech Govt Intervention',
               'Fake News, Misinformation, Dangerous Online Content','Twitter','Cybersecurity','Wearable Devices',
               'Cutting-Edge Mobile Devices, Gadgets','Quantum Computing','Fintech & Payments','Microsoft Apps, OS, Devices',
               'Telecoms Networks & Providers','Smartphone Tech','FAANG','Climate Science & Technology',
               'Automotive Technology','Artificial Intelligence','Chips & Processors','Astronomy & Astrophysics',
               'Software/Service Glitches, Bugs & Outages','Video Games','Nuclear Tech','Messaging Platforms',
               'Aviation & Aerospace','Amazon','Code & Software','Computer Peripherals','Satellites & Rockets',
               'Dark Web & Online Privacy','Surveillance Tech','Mobile Networks, Carriers & Eqpt','Space Exploration',
               'Apps, Gadgets & Devices'])
health_labels = (['Drugs, Clinical Trials, Approvals','Meat Substitutes','Menstrual Health',
                 'Cancer Treatments & Trials','Food Poisoning, Allergies, Household Chemicals',
                 'Health Insurance & Washington Legislation','Organ Donation','Skincare','Food & Diet', 'Product Recalls',
                 'Fitness, Exercise & Diet','Mood Disorders','Sleep Health','Infectious Diseases',
                 'Health Aspects Of Body Art','Infant Nutrition','Genetic Engineering','Bloodborne Diseases & Vaccines',
                 'Medical Devices','Bacteria & Viruses','The Human Brain','Blood And Blood Diseases',
                 'Sex And Relationships','Clinical Care','Birth Control & Reproductive Health',"Children's Health",
                 'Mental Health & Disorders','Prescription Drugs','Experimental Studies'])


# tech_health_labels = tech_labels + health_labels

In [127]:
# Predict Topics
def topic_predict(tokenized_query, embedding, lda_model, topics_dict):  

    # Clean up the text into a corpus
    # tokenized_input = clean_query(query)

    # Mapped embedding from Cleaned corpus text in list form
    corpus = embedding.doc2bow(tokenized_query)

    np.random.seed(4)

    # Model Predicts on cleaned corpus
    output = list(lda_model[corpus])
    
    # Edge Case: Garbage in gets mapped to 83 so that it predicts the 'other' class
    if len(output) == 0:
        return [83]

    # Post-Process Output for Display
    ordered = sorted(output,key=lambda x:x[1],reverse=True)


    # DEBUGGING
#     print(len(ordered), ordered)


    # Issue Here
    primary_topic = ordered[0][0]

    threshold = 0.5

    secondary_topics = [pair[0] for pair in ordered[1:] if pair[1] / ordered[0][1] > threshold]

    # Promote Secondary Topics in case the Primary Topic was NULL
    # Check LDA output and try to bump any non-empty-string 2ndary topics to 1ary if necessary
    primary_topic, secondary_topics = check_topics(topics_dict, primary_topic, secondary_topics)

    # Will return integer topics
    return primary_topic, secondary_topics

In [54]:
# Lemmatize Text
def func_lemmatize(words, wnl):
    lemmatized = []
    
    for word, tag in pos_tag(words):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a','r','n','v'] else None
        
        # Call to 'wnl' object
        lemma = wnl.lemmatize(word,wntag) if wntag else word
        
        lemmatized.append(lemma)
    return lemmatized


# Return Cleaned Words
def clean_query(query, pattern, stop, wnl):

    # Call to regex pattern
    tokenized = regexp_tokenize(query, pattern)
    indiv_words = [word for word in tokenized if word.isalpha()]
    lemmatized = func_lemmatize(indiv_words, wnl)
    
    # Call to 'stop' for stopword removal
    words = [word.lower() for word in lemmatized if word not in stop]

    return words


# Run Text Cleaning Routine

#### Q: Mukesh/Gaurav: THIS SHOULD BE RUN ON INSTANCE STARTUP?
# Instantiate Lemmatizer, Stopwords Object, and Regex Pattern
def session_nltk_objects_init():

	wnl = WordNetLemmatizer()
	stop = set(stopwords.words('english'))
	pattern = r'(\w+)'

	return wnl, stop, pattern


# NER Preprocessing Functions
class NERSentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]


def prep_query_for_NER(phrase):
    split_query = re.findall(r"[\w']+|[.,!?;]", phrase)
    
    pos_tags = pos_tag(split_query)
    
    df_query = pd.DataFrame({'Sentence #':['Sentence: 1'] * len(pos_tags),
                            'Word':[pair[0] for pair in pos_tags],
                            'POS':[pair[1] for pair in pos_tags],
                            'Tag':[None] * len(pos_tags)})
       
    return df_query

In [55]:
def check_topics(topics_dict, primary_topic, secondary_topics):
    while True:
        # If the primary topic is an empty string and there are secondary topics...
        if (len(topics_dict[primary_topic]) == 0) & (len(secondary_topics) !=0):
            # Iterate through secondary topics
            for i in range(len(secondary_topics)):
                # Find the first secondary topic whose name is not an empty string
                if len(topics_dict[secondary_topics[i]]) != 0:
                    # Set that topic to primary status
                    primary_topic = secondary_topics.pop(i)
                    break
            else:
                break
        else:
            break
    return primary_topic, secondary_topics

In [14]:
wnl = WordNetLemmatizer()
wnl.lemmatize('cats')
stop = set(stopwords.words('english'))
pattern = r'(\w+)'

### Load Health Fact Dataset

In [159]:
# hf = datasets.load_dataset('health_fact')

### Load ag_news Dataset which has Sci/Tech Topics
---
- label: a classification label, with possible values including World (0), Sports (1), Business (2), Sci/Tech (3).


In [16]:
ag_news = datasets.load_dataset('ag_news')

Using custom data configuration default
Reusing dataset ag_news (/Users/spenno_fr/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


In [61]:
ag_news

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [59]:
ag_df = pd.DataFrame(ag_news['test'])

In [63]:
# 1900 Test Examples
ag_df[ag_df.label == 3].text.iloc[0]

'The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket.'

In [153]:
cleaned_ag_text_sample = ag_df.text.apply(lambda raw_text: clean_query(raw_text, pattern, stop, wnl))

In [154]:
predicted_topic_ids = cleaned_ag_text_sample.apply(lambda cleaned_text: topic_predict(cleaned_text, id2word, lda_model, topics_dict)[0])

In [66]:
# 0.65% out of 1900 training examples from ag_news on the 'sci/tech' category
predicted_topic_ids.apply(lambda topic_id: 1 if topics_dict[topic_id] in tech_labels else 0).sum()/predicted_topic_ids.shape[0]

0.6478947368421053

In [152]:
ag_df['y_true'] = ag_df.label.apply(lambda val: np.where(val == 3, 1, 0))

In [147]:
y_true = np.ones(ag_df[ag_df.label == 3].shape[0])

In [156]:
ag_df['y_pred'] = predicted_topic_ids.apply(lambda topic_id: 1 if topics_dict[topic_id] in tech_labels else 0)

In [157]:
target_names = ['Other', 'Technology']
print(classification_report(ag_df['y_true'], ag_df['y_pred'], target_names=target_names))

              precision    recall  f1-score   support

       Other       0.88      0.86      0.87      5700
  Technology       0.61      0.64      0.63      1900

    accuracy                           0.81      7600
   macro avg       0.74      0.75      0.75      7600
weighted avg       0.81      0.81      0.81      7600



### Load s2orc Dataset

In [184]:
# s2orc = datasets.load_dataset('s2orc')

### Load Grail QA

In [70]:
grail_qa = datasets.load_dataset('grail_qa')

Using custom data configuration default
Reusing dataset grail_qa (/Users/spenno_fr/.cache/huggingface/datasets/grail_qa/default/0.0.0/2be99c6902e842f3ba87dd36fa96a2236206438ebb67c5e5464a36e4049fd3fb)


In [71]:
grail_qa

DatasetDict({
    train: Dataset({
        features: ['qid', 'question', 'answer', 'function', 'num_node', 'num_edge', 'graph_query', 'sparql_query', 'domains', 'level', 's_expression'],
        num_rows: 44337
    })
    validation: Dataset({
        features: ['qid', 'question', 'answer', 'function', 'num_node', 'num_edge', 'graph_query', 'sparql_query', 'domains', 'level', 's_expression'],
        num_rows: 6763
    })
    test: Dataset({
        features: ['qid', 'question', 'answer', 'function', 'num_node', 'num_edge', 'graph_query', 'sparql_query', 'domains', 'level', 's_expression'],
        num_rows: 13231
    })
})

In [74]:
grail_qa_df = pd.DataFrame(grail_qa['validation'])

In [122]:
grail_qa_df.shape

(6763, 16)

In [92]:
# Clean the Domain Label
grail_qa_df['cleaned_domain'] = grail_qa_df.domains.apply(lambda domain_list: domain_list[0])

In [96]:
def parse_healthcare_map(domain_string):
    
    if type(gqa_healthcare_mapper[domain_string]) == int:
        prediction = gqa_healthcare_mapper[domain_string]
    else:
        prediction = 0

    return prediction

In [104]:
def parse_technology_map(domain_string):
    
    if type(gqa_technology_mapper[domain_string]) == int:
        prediction = gqa_technology_mapper[domain_string]
    else:
        prediction = 0

    return prediction

In [100]:
grail_qa_df['healthcare_label'] = grail_qa_df.cleaned_domain.apply(lambda domain_string: parse_healthcare_map(domain_string))

In [105]:
grail_qa_df['technology_label'] = grail_qa_df.cleaned_domain.apply(lambda domain_string: parse_technology_map(domain_string))

In [132]:
# Parse Away the list brackets
grail_qa_df['domains_clean'] = grail_qa_df.domains.apply(lambda val: ', '.join([label for label in val]))
# str.replace('[','').head()

In [111]:
grail_qa_df['cleaned_text'] = grail_qa_df.question.apply(lambda raw_text: clean_query(raw_text, pattern, stop, wnl))

In [128]:
grail_qa_df['y_hat_topic_id'] = grail_qa_df['cleaned_text'].apply(lambda cleaned_text: topic_predict(cleaned_text, id2word, lda_model, topics_dict)[0])

In [136]:
# Performance Report
grail_qa_df['y_hat'] = grail_qa_df['y_hat_topic_id'].apply(lambda topic_id: np.where(topics_dict[topic_id] in health_labels, 0, np.where(topics_dict[topic_id] in tech_labels, 1, 2)))

In [139]:
target_names = ['Healthcare', 'Technology', 'Other']
print(classification_report(grail_qa_df['y'], grail_qa_df['y_hat'], target_names=target_names))

              precision    recall  f1-score   support

  Healthcare       0.26      0.66      0.38       304
  Technology       0.17      0.59      0.26       869
       Other       0.86      0.45      0.59      5590

    accuracy                           0.48      6763
   macro avg       0.43      0.57      0.41      6763
weighted avg       0.75      0.48      0.54      6763



In [175]:
from sklearn.metrics import accuracy_score
accuracy_score(grail_qa_df['y'], grail_qa_df['y_hat'])

0.4786337424220021

In [67]:
# Export Columns for Mapping to Technology, Healthcare, Other Domains
# grail_qa_df.domains_clean.value_counts().to_csv('./grail_qa_topics.csv')

In [85]:
grail_qa_healthcare_map = {'music':0,
'fictional_universe':0,
'book':0,
'medicine':1,
'computer':0,
'astronomy':0,
'people':0,
'sports':0,
'spaceflight':0,
'tv':0,
'biology':1,
'government':0,
'food':0,
'comic_books':0,
'education':0,
'aviation':0,
'business':0,
'time':0,
'architecture':0,
'religion':0,
'automotive':0,
'meteorology':0,
'amusement_parks':0,
'broadcast':0,
'digicams':0,
'law':0,
'geography':0,
'media_common':0,
'visual_art':0,
'theater':0,
'olympics':0,
'internet':0,
'boats':0,
'cricket':0,
'organization':0,
'travel':0,
'royalty':0,
'engineering':0,
'wine':0,
'opera':0,
'language':0,
'soccer':0,
'distilled_spirits':0,
'exhibitions':0,
'military':0,
'tennis':0,
'interests':0,
'skiing':0,
'protected_sites':0,
'ice_hockey':0,
'martial_arts':0,
'freebase':0,
'dining':0,
'basketball':0,
'transportation':0,
'base.lightweight':0,
'zoos':0,
'projects':0,
'physics':0,
'symbols':0,
'base.exoplanetology':0,
'common':0,
'bicycles':0,
'education, people':0,
'comic_strips':0,
'time, religion':0,
'celebrities':0,
'geology':0,
'fashion':0,
'chess':0,
'boxing':0,
'religion, people':0,
'fictional_universe, comic_books':0,
'food, wine':0,
'library':0,
'book, education':0,
'royalty, people':0,
'business, organization':0,
'soccer, time':0,
'dining, food':0,
'fictional_universe, media_common':0,
'book, media_common':0,
'book, periodicals':0,
'business, digicams':0,
'book, people':0,
'influence':0,
'dining, travel':0,
'media_common, tv':0,
'music, tv':0,
'business, people':0,
'theater, media_common':0,
'opera, music':0,
'book, fictional_universe':0,
'astronomy, base.exoplanetology':0,
'media_common, comic_books':0,
'user.patrick.default_domain':0,
'martial_arts, education, people':0,
'media_common, people':0,
'periodicals':0,
'dining, people':0,
'government, organization':0,
'transportation, travel':0,
'travel, organization':0,
'sports, broadcast, people':0,
'biology, people':0,
'architecture, travel':0,
'basketball, media_common':0,
'business, medicine':1,
'food, dining':0,
'theater, architecture':0,
'fictional_universe, sports':0,
'automotive, organization':0,
'time, military':0,
'broadcast, tv':0,
'language, tv':0,
'book, music':0,
'soccer, sports':0,
'interests, people':0,
'common, tv':0,
'user.jonathanwlowe.location':0,
'type':0,
'broadcast, education':0,
'engineering, projects':0,
'sports, ice_hockey':0,
'book, language':0,
'fictional_universe, people':0,
'fictional_universe, media_common, comic_books':0,
'biology, geology':0,
'time, media_common, spaceflight':0,
'medicine, organization':1,
'people, law':0,
'aviation, common':0,
'aviation, time':0,
'boxing, sports':0,
'astronomy, education':0,
'type, people':0,
'medicine, people':1,
'book, travel':0,
'comic_strips, media_common':0,
'soccer, media_common':0,
'comedy':0,
'opera, people':0,
'people, geography':0,
'time, olympics':0,
'biology, zoos':0,
'food, people':0}

gqa_healthcare_mapper = defaultdict(list, grail_qa_healthcare_map)

In [102]:
# gqa_healthcare_mapper['biologicalsciences']

In [103]:
grail_qa_technology_map = {'music':0,
'fictional_universe':0,
'book':0,
'medicine':0,
'computer':1,
'astronomy':1,
'people':0,
'sports':0,
'spaceflight':1,
'tv':1,
'biology':0,
'government':0,
'food':0,
'comic_books':0,
'education':0,
'aviation':1,
'business':0,
'time':0,
'architecture':1,
'religion':0,
'automotive':1,
'meteorology':1,
'amusement_parks':0,
'broadcast':1,
'digicams':1,
'law':0,
'geography':0,
'media_common':0,
'visual_art':0,
'theater':0,
'olympics':0,
'internet':1,
'boats':0,
'cricket':0,
'organization':0,
'travel':0,
'royalty':0,
'engineering':1,
'wine':0,
'opera':0,
'language':0,
'soccer':0,
'distilled_spirits':0,
'exhibitions':0,
'military':0,
'tennis':0,
'interests':0,
'skiing':0,
'protected_sites':0,
'ice_hockey':0,
'martial_arts':0,
'freebase':0,
'dining':0,
'basketball':0,
'transportation':0,
'base.lightweight':0,
'zoos':0,
'projects':0,
'physics':1,
'symbols':0,
'base.exoplanetology':1,
'common':0,
'bicycles':0,
'education, people':0,
'comic_strips':0,
'time, religion':0,
'celebrities':0,
'geology':0,
'fashion':0,
'chess':0,
'boxing':0,
'religion, people':0,
'fictional_universe, comic_books':0,
'food, wine':0,
'library':0,
'book, education':0,
'royalty, people':0,
'business, organization':0,
'soccer, time':0,
'dining, food':0,
'fictional_universe, media_common':0,
'book, media_common':0,
'book, periodicals':0,
'business, digicams':0,
'book, people':0,
'influence':0,
'dining, travel':0,
'media_common, tv':1,
'music, tv':0,
'business, people':0,
'theater, media_common':0,
'opera, music':0,
'book, fictional_universe':0,
'astronomy, base.exoplanetology':1,
'media_common, comic_books':0,
'user.patrick.default_domain':0,
'martial_arts, education, people':0,
'media_common, people':0,
'periodicals':0,
'dining, people':0,
'government, organization':0,
'transportation, travel':0,
'travel, organization':0,
'sports, broadcast, people':0,
'biology, people':0,
'architecture, travel':0,
'basketball, media_common':0,
'business, medicine':0,
'food, dining':0,
'theater, architecture':0,
'fictional_universe, sports':0,
'automotive, organization':0,
'time, military':0,
'broadcast, tv':1,
'language, tv':0,
'book, music':0,
'soccer, sports':0,
'interests, people':0,
'common, tv':0,
'user.jonathanwlowe.location':0,
'type':0,
'broadcast, education':0,
'engineering, projects':1,
'sports, ice_hockey':0,
'book, language':0,
'fictional_universe, people':0,
'fictional_universe, media_common, comic_books':0,
'biology, geology':0,
'time, media_common, spaceflight':0,
'medicine, organization':0,
'people, law':0,
'aviation, common':0,
'aviation, time':1,
'boxing, sports':0,
'astronomy, education':1,
'type, people':0,
'medicine, people':0,
'book, travel':0,
'comic_strips, media_common':0,
'soccer, media_common':0,
'comedy':0,
'opera, people':0,
'people, geography':0,
'time, olympics':0,
'biology, zoos':0,
'food, people':0}

gqa_technology_mapper = defaultdict(list, grail_qa_technology_map)

In [107]:
# Build One Column for y_label
grail_qa_df['y'] = np.where(grail_qa_df['healthcare_label'] == 1, 0, 
                     np.where(grail_qa_df['technology_label'] == 1, 1, 2))

In [None]:
# class_map = {'healthcare':0, 'technology':1, 'other':2}

### Load sd-nlp
---
- The dataset is pre-tokenized with the roberta-base tokenizer

In [102]:
embo_ner = datasets.load_dataset('EMBO/sd-nlp')

Downloading:   0%|          | 0.00/8.22k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.90k [00:00<?, ?B/s]

No config specified, defaulting to: source_data_nlp/NER


Downloading and preparing dataset source_data_nlp/NER (download: 21.46 MiB, generated: 56.37 MiB, post-processed: Unknown size, total: 77.83 MiB) to /Users/spenno_fr/.cache/huggingface/datasets/source_data_nlp/NER/0.0.1/1ae41bd5b05b4d21841533f784520bd1db4cf6e954412d017d51db42395908ae...


Downloading:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset source_data_nlp downloaded and prepared to /Users/spenno_fr/.cache/huggingface/datasets/source_data_nlp/NER/0.0.1/1ae41bd5b05b4d21841533f784520bd1db4cf6e954412d017d51db42395908ae. Subsequent calls will reuse this data.


In [104]:
embo_ner

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'tag_mask'],
        num_rows: 31410
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'tag_mask'],
        num_rows: 4224
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'tag_mask'],
        num_rows: 8861
    })
})

In [105]:
embo_ner_df = pd.DataFrame(embo_ner['train'])

In [106]:
embo_ner_df.head()

Unnamed: 0,input_ids,labels,tag_mask
0,"[0, 1640, 387, 43, 3385, 261, 13886, 4590, 58,...","[0, 0, 0, 0, 8, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 250, 12, 347, 27490, 41601, 39287, 672, 36...","[0, 0, 0, 0, 0, 6, 5, 0, 0, 0, 0, 2, 1, 1, 0, ...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, ..."
2,"[0, 6842, 597, 14660, 21, 3032, 19, 1308, 438,...","[0, 0, 0, 0, 0, 0, 0, 4, 3, 0, 4, 3, 3, 3, 4, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, ..."
3,"[0, 2716, 12, 406, 102, 34496, 13458, 5, 83, 1...","[0, 4, 3, 3, 3, 0, 0, 0, 4, 3, 3, 14, 13, 13, ...","[0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, ..."
4,"[0, 1640, 571, 43, 4680, 42825, 24756, 246, 12...","[0, 0, 0, 0, 0, 0, 4, 3, 0, 0, 0, 0, 8, 7, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, ..."


### Load medal Dataset

In [177]:
medal = datasets.load_dataset('medal')

In [181]:
# !wget https://drive.google.com/file/d/1eTtRs5cUlBP5dXsx-FTAlmXuB6JQi2qj/view

### Load Hybrid QA Dataset

In [160]:
hqa = datasets.load_dataset('hybrid_qa')

Reusing dataset hybrid_qa (/Users/spenno_fr/.cache/huggingface/datasets/hybrid_qa/hybrid_qa/1.0.0/fabdc38783449dd6cb1acd25621af97b871e218fc3ab608191d492b408a93ab8)


In [161]:
hqa

DatasetDict({
    train: Dataset({
        features: ['question_id', 'question', 'table_id', 'answer_text', 'question_postag', 'table'],
        num_rows: 62682
    })
    validation: Dataset({
        features: ['question_id', 'question', 'table_id', 'answer_text', 'question_postag', 'table'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['question_id', 'question', 'table_id', 'answer_text', 'question_postag', 'table'],
        num_rows: 3463
    })
})

In [162]:
hqa_df = pd.DataFrame(hqa['train'])

In [163]:
hqa_df.head()

Unnamed: 0,question_id,question,table_id,answer_text,question_postag,table
0,00009b9649d0dd0a,Who were the builders of the mosque in Herat w...,List_of_mosques_in_Afghanistan_0,Ghurids,WP VBD DT NNS IN DT NN IN NNP IN NN NNS .,{'url': 'https://en.wikipedia.org/wiki/List_of...
1,00013190d4370f73,After what season did the number 7 competitor ...,1999_Kyalami_Superbike_World_Championship_round_0,2006,IN WP NN VBD DT NN CD NN NN .,{'url': 'https://en.wikipedia.org/wiki/1999_Ky...
2,00023988273478d0,What year was the 1971-72 ECAC Hockey Player o...,List_of_ECAC_Hockey_Player_of_the_Year_0,1950,WP NN VBD DT JJ NNP NNP NNP IN DT NNP VBN .,{'url': 'https://en.wikipedia.org/wiki/List_of...
3,000356071ebf888a,What battle did the man born on 7 December 183...,List_of_Zulu_War_Victoria_Cross_recipients_0,Battle of Hlobane,WP NN VBD DT NN VBN IN CD NNP CD NN IN .,{'url': 'https://en.wikipedia.org/wiki/List_of...
4,0003d159df86ed53,What is the borough in which Kia Oval is locat...,List_of_sports_venues_with_sole_naming_rights_36,Lambeth,WP VBZ DT NN IN WDT NNP NNP VBZ VBN .,{'url': 'https://en.wikipedia.org/wiki/List_of...


### Load mqp Dataset

In [71]:
mqp = datasets.load_dataset('medical_questions_pairs')

Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/569 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset medical_questions_pairs/default (download: 650.09 KiB, generated: 685.21 KiB, post-processed: Unknown size, total: 1.30 MiB) to /Users/spenno_fr/.cache/huggingface/datasets/medical_questions_pairs/default/0.0.0/db30a35b934dceb7abed5ef6b73a432bb59682d00e26f9a1acd960635333bc80...


Downloading:   0%|          | 0.00/174k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

Dataset medical_questions_pairs downloaded and prepared to /Users/spenno_fr/.cache/huggingface/datasets/medical_questions_pairs/default/0.0.0/db30a35b934dceb7abed5ef6b73a432bb59682d00e26f9a1acd960635333bc80. Subsequent calls will reuse this data.


In [74]:
mqp_df = pd.DataFrame(mqp['train'])

In [75]:
mqp_df

Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0
...,...,...,...,...
3043,11,15 million sperm can be a father?,My sperm count is 15 millions and is there a c...,1
3044,11,26 yo m with history of progressive venous ins...,"Hello doctor, can you please tell me some of t...",0
3045,11,26 yo m with history of progressive venous ins...,"Hello doctor, I am 26 year old male wth progre...",1
3046,11,32 weeks pregnant. Headache strange leg weakne...,I am 32 weeks pregnant and had severe headache...,0


### Load XGLUE Dataset with `NC` News Classification

In [164]:
xglue = datasets.load_dataset('xglue', 'nc')

Reusing dataset x_glue (/Users/spenno_fr/.cache/huggingface/datasets/x_glue/nc/1.0.0/9d181dd4413e6dfb0502b603c5ced10d2988d7bc97dd4c899eabee4397234139)


In [165]:
xglue.shape

{'train': (100000, 3),
 'validation.en': (10000, 3),
 'validation.de': (10000, 3),
 'validation.es': (10000, 3),
 'validation.fr': (10000, 3),
 'validation.ru': (10000, 3),
 'test.en': (10000, 3),
 'test.de': (10000, 3),
 'test.es': (10000, 3),
 'test.fr': (10000, 3),
 'test.ru': (10000, 3)}

In [166]:
# Example Instance
xglue_df = pd.DataFrame(xglue['train'])

In [169]:
xglue_df.head()

Unnamed: 0,news_title,news_body,news_category
0,Lebanese Grilled Fish,Lebanese Grilled Fish,0
1,How to line a tart tin,Lining a tart tin is an important step in the ...,0
2,How to make pancake batter,Great British Chefs demonstrates how to make p...,0
3,How to poach an egg,Master the art of poaching an egg with the hel...,0
4,How to ice a cupcake,"In this video, Great British Chefs demonstrate...",0


In [167]:
# Classification Dictionary
{0:'cooking', 1:'sports', 2:'travel', 3:'business_news', 4:'unknown', 5:'entertainment', 6:'celebrities', 7:'healthcare', 8:'', 9:'transportation'}

{0: 'cooking',
 1: 'sports',
 2: 'travel',
 3: 'business_news',
 4: 'unknown',
 5: 'entertainment',
 6: 'celebrities',
 7: 'healthcare',
 8: '',
 9: 'transportation'}

In [173]:
# 
xglue_df[xglue_df['news_category'] == 9]['news_body'][0:50]

45     Have you ever flipped through an airline magaz...
50     Twenty years ago, in the August 1994 issue of ...
53     Because the current Jeep Wrangler suffers from...
54     I spend a lot of time commuting from home to t...
55     When we attended the first media event for the...
56     Nissan has released details of the updates to ...
59     The 2015 Volvo XC90 tease continues , this tim...
60     While recent internet rumors suggested the Sub...
61     The two turbos hanging off the F80 M3's 3.0-li...
62     Take a look at the nose of the new Bentley Fly...
63     All the attention has been focused on the 2015...
64     Official power figures for the 2014 Callaway C...
66     Although the Mitsubishi Lancer Evolution has b...
67     A picture of Michelangelo's David, even in the...
68     Vehicle Overview Minor changes are in store fo...
69     NOTE: This story first appeared in the June 19...
70     Like other Toyota trucks and SUVs, the 2010 4R...
73     The new Infiniti M, seen

### NER Dataset

In [8]:
test_sentence = xglue['train']['words'][0]

In [6]:
from transformers import BertTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# help(tokenizer)

In [12]:
# Tokenize the sample test sentence intp a numpy array with HuggingFace BertTokenizer
tokenizer(test_sentence, padding="longest", truncation=True, max_length=512, return_tensors="np")

{'input_ids': array([[  101,  7327,   102],
       [  101, 19164,   102],
       [  101,  2446,   102],
       [  101,  2655,   102],
       [  101,  2000,   102],
       [  101, 17757,   102],
       [  101,  2329,   102],
       [  101, 12559,   102],
       [  101,  1012,   102]]), 'token_type_ids': array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]]), 'attention_mask': array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])}

In [1]:
!pwd

/Users/spenno_fr/Projects/nlp-glg-mas/notebooks
