In [None]:
"""
Spacy has different models: en_core_web_lg, en_core_web_md, en_core_web_sm.
To download one of them:

python -m spacy download en_core_web_sm 
"""

In [134]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

text = 'Fake news has altered society in negative ways in politics and culture'
doc = nlp(text)

tokens = []

for token in doc:
    t = token 
    obj = {
        # Tokenization
        'text': t.text,
        # Lemmatization
        'lemma_': t.lemma_,
        # POS tagging
        'tag_': t.tag_, # Fine grained
        'pos_': t.pos_, # Coarse grained
        'dep_': t.dep_,
        'shape_': t.shape_,
        'is_alpha': t.is_alpha,
        'is_stop': t.is_stop,
        # Morphology
        'morph': str(t.morph),
        # Parse tree navigation 
        'children': str(t.children)
    }
    tokens.append(obj)


# Mode of lemmatization
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)
# Tokens 
print(json.dumps(tokens, indent=2))
displacy.render(doc, jupyter=True)

rule
[
  {
    "text": "Fake",
    "lemma_": "fake",
    "tag_": "JJ",
    "pos_": "ADJ",
    "dep_": "amod",
    "shape_": "Xxxx",
    "is_alpha": true,
    "is_stop": false,
    "morph": "Degree=Pos",
    "children": "<generator object at 0x17bb3d180>"
  },
  {
    "text": "news",
    "lemma_": "news",
    "tag_": "NN",
    "pos_": "NOUN",
    "dep_": "nsubj",
    "shape_": "xxxx",
    "is_alpha": true,
    "is_stop": false,
    "morph": "Number=Sing",
    "children": "<generator object at 0x17bb3d180>"
  },
  {
    "text": "has",
    "lemma_": "have",
    "tag_": "VBZ",
    "pos_": "AUX",
    "dep_": "aux",
    "shape_": "xxx",
    "is_alpha": true,
    "is_stop": true,
    "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
    "children": "<generator object at 0x17bb3d180>"
  },
  {
    "text": "altered",
    "lemma_": "alter",
    "tag_": "VBN",
    "pos_": "VERB",
    "dep_": "ROOT",
    "shape_": "xxxx",
    "is_alpha": true,
    "is_stop": false,
    "morph": "As

In [135]:
# Noun chuncks 

import spacy
nlp = spacy.load("en_core_web_sm")

text = 'Fake news has altered society in negative ways in politics and culture'
doc = nlp(text)

chunks = []

for chunk in doc.noun_chunks:
    c = chunk 
    obj = {
        'text': c.text,
        'root': {
            'text': c.root.text,
            'dep_': c.root.dep_,
            'head': {
                'text': c.root.head.text
            }
        }
    }
    chunks.append(obj)
    
print(json.dumps(chunks, indent=2))

[
  {
    "text": "Fake news",
    "root": {
      "text": "news",
      "dep_": "nsubj",
      "head": {
        "text": "altered"
      }
    }
  },
  {
    "text": "society",
    "root": {
      "text": "society",
      "dep_": "dobj",
      "head": {
        "text": "altered"
      }
    }
  },
  {
    "text": "negative ways",
    "root": {
      "text": "ways",
      "dep_": "pobj",
      "head": {
        "text": "in"
      }
    }
  },
  {
    "text": "politics",
    "root": {
      "text": "politics",
      "dep_": "pobj",
      "head": {
        "text": "in"
      }
    }
  },
  {
    "text": "culture",
    "root": {
      "text": "culture",
      "dep_": "conj",
      "head": {
        "text": "politics"
      }
    }
  }
]


In [136]:
# Named Entity Recognition

import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# Document Level Entities
displacy.render(doc.ents, jupyter=True, style='ent')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [137]:
# Sentences splitting
import spacy
nlp = spacy.load("en_core_web_sm")

text = 'Fake news has altered society in negative ways in politics and culture. It has adversely affected both online social network systems as well as offline communities and conversations. Using automatic machine learning classification models is an efficient way to combat the widespread dissemination of fake news. However, a lack of effective, comprehensive datasets has been a problem for fake news research and detection model development. Prior fake news datasets do not provide multimodal text and image data, metadata, comment data, and fine-grained fake news categorization at the scale and breadth of our dataset. We present Fakeddit, a novel multimodal dataset consisting of over 1 million samples from multiple categories of fake news. After being processed through several stages of review, the samples are labeled according to 2-way, 3-way, and 6-way classification categories through distant supervision. We construct hybrid text+image models and perform extensive experiments for multiple variations of classification, demonstrating the importance of the novel aspect of multimodality and fine-grained classification unique to Fakeddit.'
doc = nlp(text)
for i, sent in enumerate(doc.sents):
    print(i, sent.text)

0 Fake news has altered society in negative ways in politics and culture.
1 It has adversely affected both online social network systems as well as offline communities and conversations.
2 Using automatic machine learning classification models is an efficient way to combat the widespread dissemination of fake news.
3 However, a lack of effective, comprehensive datasets has been a problem for fake news research and detection model development.
4 Prior fake news datasets do not provide multimodal text and image data, metadata, comment data, and fine-grained fake news categorization at the scale and breadth of our dataset.
5 We present Fakeddit, a novel multimodal dataset consisting of over 1 million samples from multiple categories of fake news.
6 After being processed through several stages of review, the samples are labeled according to 2-way, 3-way, and 6-way classification categories through distant supervision.
7 We construct hybrid text+image models and perform extensive experiment

In [143]:
# Triplet exptraction (entity, relation, entity)

import spacy
nlp = spacy.load("en_core_web_sm")
from spacy.matcher import Matcher 
from spacy.tokens import Span 

text = "Fake news has altered society in negative ways in politics and culture."

#for tok in nlp(text):
#    print(tok.text, "...", tok.dep_)

def get_entities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    prefix = ""
    modifier = ""

    #############################################################

    for tok in nlp(sent):
        ## chunk 2
        # if token is a punctuation mark then move on to the next token
        if tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
            # if the previous word was also a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " "+ tok.text

            # check: token is a modifier or not
            if tok.dep_.endswith("mod") == True:
                modifier = tok.text
            # if the previous word was also a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " "+ tok.text

            ## chunk 3
            if tok.dep_.find("subj") == True:
                ent1 = modifier +" "+ prefix + " "+ tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""      

            ## chunk 4
            if tok.dep_.find("obj") == True:
                ent2 = modifier +" "+ prefix +" "+ tok.text

            ## chunk 5  
            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
    #############################################################

    return [ent1.strip(), ent2.strip()]

def get_relation(sent):
    doc = nlp(sent)
    matcher = Matcher(nlp.vocab)
    #define the pattern 
    pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

    matcher.add("matching_1", [pattern]) 
    matches = matcher(doc)
    k = len(matches) - 1
    span = doc[matches[k][1]:matches[k][2]] 
    return(span.text)

relation = get_relation(text)
entities = get_entities(text)
triplet = [entities[0], relation, entities[1]]
print(triplet)

['Fake  news', 'altered', 'negative  politics']


In [106]:
# Word counting 
import spacy
from collections import Counter
doc = nlp('The importance of building semantic parsers which can be applied to new domains and generate programs unseen at training has long been acknowledged, and datasets testing out-of-domain performance are becoming increasingly available. However, little or no attention has been devoted to learning algorithms or objectives which promote domain generalization, with virtually all existing approaches relying on standard supervised learning. In this work, we use a meta-learning framework which targets zero-shot domain generalization for semantic parsing. We apply a model-agnostic training algorithm that simulates zero-shot parsing by constructing virtual train and test sets from disjoint domains. The learning objective capitalizes on the intuition that gradient steps that improve source-domain performance should also improve target-domain performance, thus encouraging a parser to generalize to unseen target domains. Experimental results on the (English) Spider and Chinese Spider datasets show that the meta-learning objective significantly boosts the performance of a baseline parser.')
# Remove stop words and punctuation symbols
words = [token.text for token in doc if not token.is_stop and not token.is_punct]
word_freq = Counter(words)
# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(10)
common_words

[('domain', 5),
 ('learning', 5),
 ('performance', 4),
 ('domains', 3),
 ('semantic', 2),
 ('unseen', 2),
 ('training', 2),
 ('datasets', 2),
 ('generalization', 2),
 ('meta', 2)]

In [176]:
# Hot words 

import spacy
from collections import Counter
from string import punctuation

def get_hotwords(text, most_common=10):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] # 1
    doc = nlp(text.lower()) # 2
    for token in doc:
        # 3
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        # 4
        if(token.pos_ in pos_tag):
            result.append(token.text)
                
    return Counter(result).most_common(10) # 5

# Title of this abstract text: Meta-Learning for Domain Generalization in Semantic Parsing
text = 'The importance of building semantic parsers which can be applied to new domains and generate programs unseen at training has long been acknowledged, and datasets testing out-of-domain performance are becoming increasingly available. However, little or no attention has been devoted to learning algorithms or objectives which promote domain generalization, with virtually all existing approaches relying on standard supervised learning. In this work, we use a meta-learning framework which targets zero-shot domain generalization for semantic parsing. We apply a model-agnostic training algorithm that simulates zero-shot parsing by constructing virtual train and test sets from disjoint domains. The learning objective capitalizes on the intuition that gradient steps that improve source-domain performance should also improve target-domain performance, thus encouraging a parser to generalize to unseen target domains. Experimental results on the (English) Spider and Chinese Spider datasets show that the meta-learning objective significantly boosts the performance of a baseline parser.'
get_hotwords(text)



[('domain', 5),
 ('performance', 4),
 ('domains', 3),
 ('semantic', 2),
 ('unseen', 2),
 ('training', 2),
 ('datasets', 2),
 ('generalization', 2),
 ('meta', 2),
 ('shot', 2)]

In [130]:
# Extractive QA (very slow, uses BERT)
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

nlp = pipeline('question-answering', model=model_name, tokenizer=model_name) 
res = nlp({
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
})
res

{'score': 0.21171444654464722,
 'start': 59,
 'end': 84,
 'answer': 'gives freedom to the user'}

In [177]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

doc = nlp("The importance of building semantic parsers which can be applied to new domains and generate programs unseen at training has long been acknowledged, and datasets testing out-of-domain performance are becoming increasingly available.")
displacy.render(doc, style="dep")
doc = nlp("Beatutiful big blue ideas")
displacy.render(doc, style="dep")
doc = nlp("The color can be red")
displacy.render(doc, style="dep")

In [168]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Add neural coref to SpaCy's pipe
import neuralcoref
neuralcoref.add_to_pipe(nlp)

# You're done. You can now use NeuralCoref as you usually manipulate a SpaCy document annotations.
doc = nlp(u'My sister has a dog. She loves him.')

doc._.has_coref
doc._.coref_clusters

ModuleNotFoundError: No module named 'neuralcoref'

In [172]:
print(1,2) 
print(2)

1 2
2
