In [26]:
import spacy
import pandas as pd
from itertools import combinations as combs
from spacy.matcher import Matcher

## Data Pipeline

In [21]:
nlp = spacy.load('en_core_web_sm')

doc1  = nlp(u'An Englishman, a Scotsman and an Irishman walk into a bar. The Englishman wanted to go so they all had to leave. #Brexitjokes')
doc2  = nlp(u'Why do we need any colour passport? We should just be able to shout, “British! Less of your nonsense!” and stroll straight through.')
doc3  = nlp(u'Q: With Britain leaving the EU how much space was created? A: Exactly 1GB')
doc4  = nlp(u'VOTERS: we want to give a boat a ridiculous name UK: no VOTERS: we want to break up the EU and trash the world economy UK: fine')
doc5  = nlp(u'#BrexitJokes How did the Brexit chicken cross the road? \"I never said there was a road. Or a chicken\".')
doc6  = nlp(u'After #brexit, when rapper 50 cent performs in GBR he\'ll appear as 10.00 pounds. #brexitjokes')
doc7  = nlp(u'I long for the simpler days when #Brexit was just a term for leaving brunch early.')
doc8  = nlp(u'Say goodbye to croissants, people. Delicious croissants. We\'re stuck with crumpets FOREVER.')
doc9  = nlp(u'Hello, I am from Britain, you know, the one that got tricked by a bus')
doc10 = nlp(u'How many Brexiteers does it take to change a light bulb? None, they are all walked out because they didn’t like the way the electrician did it.')

docs = [
    doc1,
    doc2,
    doc3,
    doc4,
    doc5,
    doc6,
    doc7,
    doc8,
    doc9,
    doc10]

## Part of Speach Tagging

## Named Entity Recognition

In [16]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')
    else:
        print('No entites found')

In [23]:
tweet_no = 1
for doc in docs:
    print(f'Tweet: {tweet_no}')
    show_ents(doc)
    print('\n')
    tweet_no += 1

Tweet: 1
Scotsman - PERSON - People, including fictional
Irishman - NORP - Nationalities or religious or political groups
Englishman - PERSON - People, including fictional


Tweet: 2
British - NORP - Nationalities or religious or political groups


Tweet: 3
Britain - GPE - Countries, cities, states
EU - ORG - Companies, agencies, institutions, etc.


Tweet: 4
UK - GPE - Countries, cities, states
EU - ORG - Companies, agencies, institutions, etc.


Tweet: 5
Brexit - PERSON - People, including fictional


Tweet: 6
50 cent - MONEY - Monetary values, including unit
10.00 pounds - MONEY - Monetary values, including unit


Tweet: 7
the simpler days - DATE - Absolute or relative dates or periods
Brexit - PERSON - People, including fictional


Tweet: 8
FOREVER - WORK_OF_ART - Titles of books, songs, etc.


Tweet: 9
Britain - GPE - Countries, cities, states


Tweet: 10
Brexiteers - WORK_OF_ART - Titles of books, songs, etc.




## Feature Extraction

## Sentiment Analysis

## Tweet Similarity Scoring

In [8]:
spans = {}

In [9]:
for j,doc in enumerate(docs):
    named_entity_span = [doc[i].text for i in range(len(doc)) if doc[i].ent_type != 0]
    print(named_entity_span)
    named_entity_span = ' '.join(named_entity_span)
    named_entity_span = nlp(named_entity_span)
    spans.update({j:named_entity_span})

['Scotsman', 'Irishman', 'Englishman']
['British']
['Britain', 'EU']
['UK', 'EU']
['Brexit']
['50', 'cent', '10.000', 'pounds']
['the', 'simpler', 'days', 'Brexit']
['FOREVER']
['Britain']
['Brexiteers']


In [15]:
df = pd.DataFrame()

tweet_id = [i for i in range(1,11)]
id_combs = list(combs(tweet_id, 2))

for each_pair in id_combs:
    similarity = spans[each_pair[0]-1].similarity(spans[each_pair[1]-1])
    #print(f'doc{each_pair[0]} is similar to doc{each_pair[1]} by: {similarity}') #Un-comment if you want to see individual scores printed.
    results = {
        'tweet1': int(each_pair[0]),
        'tweet2': int(each_pair[1]),
        'similarity': similarity
    }
    
    df = df.append(results, ignore_index=True)

  similarity = spans[each_pair[0]-1].similarity(spans[each_pair[1]-1])


In [11]:
# Chaning Data Types
df['tweet1'] = df['tweet1'].astype(int)
df['tweet2'] = df['tweet2'].astype(int)

In [None]:
# Saving to/loading from CSV
#df = pd.read_csv('similarity_scores.csv') #Uncomment to load.
#df.to_csv('similarity_scores.csv') #Uncomment to resave.

In [None]:
df_ordered = df.sort_values(by=['similarity'], ascending=False)

In [12]:
# Display the Top 10 Simialr Combinations 
df_ordered.head(10)

Unnamed: 0,similarity,tweet1,tweet2
17,0.857896,3,4
1,0.788178,1,3
33,0.771924,5,9
2,0.720223,1,4
18,0.68895,3,5
22,0.64652,3,9
24,0.598866,4,5
16,0.549264,2,10
7,0.51066,1,9
3,0.510251,1,5


In [13]:
# Display the Bottom 10 Simialr Combinations 
df_ordered.tail(10)

Unnamed: 0,similarity,tweet1,tweet2
6,0.198919,1,8
30,0.196068,5,6
39,0.185533,7,8
19,0.132649,3,6
4,0.128754,1,6
41,0.124216,7,10
25,0.090598,4,6
38,0.069899,6,10
36,0.055461,6,8
12,0.001826,2,6


## Utterence Pattern Matching

In [24]:
def dep_pattern(doc):
    for i in range(len(doc)-1):
        if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'aux' and doc[i+2].dep_ == 'ROOT':
            for tok in doc[i+2].children:
                if tok.dep_ == 'dobj':
                    return True
    else:
        return False

In [25]:
for i in docs:
    if dep_pattern(i):
        print('Found')
    else:
        print('Not Found')

Not Found
Not Found
Not Found
Not Found
Not Found
Found
Not Found
Not Found
Not Found
Not Found


## Finding Word Sequence Patterns

In [30]:
matcher = Matcher(nlp.vocab)
pattern = [{'DEP':"nsubj"}, {"DEP":"aux"}, {"DEP":"ROOT"}]
matcher.add("NsubjAuxRoot", [pattern])

tweet_no = 1

for doc in docs:
    matches = matcher(doc)
    print(f'Tweet: {tweet_no}')
    for match_id, start, end in matches:
        span = doc[start:end]
        print(f"Span: {span.text}")
        print(f"The position in the doc are: {start} - {end}\n")
    else:
        print("None found.\n")
    tweet_no += 1

Tweet: 1
None found.

Tweet: 2
None found.

Tweet: 3
None found.

Tweet: 4
None found.

Tweet: 5
None found.

Tweet: 6
Span: he'll appear
The position in the doc are: 11 - 14

None found.

Tweet: 7
None found.

Tweet: 8
None found.

Tweet: 9
None found.

Tweet: 10
None found.

