# Explore **spacy** package
Felix Zaussinger | 06.01.2021

## Core Analysis Goal(s)
1. Discover core functionalities of spacy library
2. Test pre-processing chains on example documents from BBC Monitoring
3. Test simple models

## Key Insight(s)
1.
2.
3.

In [11]:
%load_ext autoreload
%autoreload 2

import os
import sys
import logging
from pathlib import Path
import numpy as np
import scipy as sp
import statsmodels.api as sm
from statsmodels.formula.api import ols
import glob
import os
import spacy
import re
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from pprint import pprint
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_context("poster")
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style("ticks")

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Define directory structure

In [2]:
# project directory
abspath = os.path.abspath('')
project_dir = str(Path(abspath).parents[0])

# sub-directories
data_raw = os.path.join(project_dir, "data", "raw")
data_interim = os.path.join(project_dir, "data", "interim")
data_processed = os.path.join(project_dir, "data", "processed")
figure_dir = os.path.join(project_dir, "plots")

Code ...

In [3]:
# init language object
nlp = spacy.load('en_core_web_lg')

In [18]:
# build file list
dp_string = os.path.join(data_raw, "BBC_2007_07_04_TXT", "*.txt")
file_list = glob.glob(dp_string)
file_list = sorted(file_list)

# read text
n = 5
corpus = []
for file_path in file_list[:n]:
    with open(file_path) as f_input:

        # read
        text = f_input.read()

        # remove new lines
        text = re.sub('\s+', ' ', text)

        # remove urls
        text = re.sub(r'http\S+', '', text)

        # remove non-letter chars
        text = re.sub('[^a-zA-Z]', ' ', text)

        # remove single quotes
        text = re.sub("\'", '', text)

        #append
        corpus.append(text)

https://gist.github.com/SandieIJ/69fc80c372e823fecfd4eeeda2156936

In [19]:
#break down sentences into words
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True, min_len=2, max_len=15)

doc = corpus[0]

# tokenize
data_words = list(sent_to_words(doc))

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[doc], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[text] for text in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[text]] for text in texts]

# Form Bigrams
data_words_bigrams = make_bigrams(data_words)

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO:gensim.models.phrases:collected 0 word types from a corpus of 0 words (unigram + bigrams) and 3878 sentences
INFO:gensim.models.phrases:using 0 counts as vocab in Phrases<0 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO:gensim.models.phrases:collected 49 word types from a corpus of 3878 words (unigram + bigrams) and 3878 sentences
INFO:gensim.models.phrases:using 49 counts as vocab in Phrases<0 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
INFO:gensim.models.phrases:source_vocab length 0
INFO:gensim.models.phrases:Phraser built with 0 phrasegrams
INFO:gensim.models.phrases:source_vocab length 49
INFO:gensim.models.phrases:Phraser built with 0 phrase

In [21]:
len(data_words_bigrams)

3878

In [6]:
%%time
doc = nlp(corpus[0])
for doc in nlp.pipe(corpus, disable=["tagger", "parser"]):
    # Do something with the doc here
    print([(ent.text, ent.label_) for ent in doc.ents])

[('Nigeria', 'GPE'), ('Pan-Niger Delta', 'NORP'), ('Akpa Esajere', 'PERSON'), ('Federalism Tops Talks As Pan-Niger Delta Summit Begins', 'WORK_OF_ART'), ('Nigerian', 'NORP'), ('Guardian', 'ORG'), ('20 October', 'DATE'), ('100 per cent', 'MONEY'), ('yesterday', 'DATE'), ('first', 'ORDINAL'), ('Pan-Niger Delta', 'ORG'), ('Calabar', 'GPE'), ('Expectedly', 'ORDINAL'), ('University of Calabar', 'ORG'), ('100 per cent', 'MONEY'), ('six', 'CARDINAL'), ('the First Republic', 'GPE'), ('Ethnic Nationality Forum', 'ORG'), ('the Niger Delta Area', 'LOC'), ('David Dafinone', 'PERSON'), ('100 per cent', 'MONEY'), ('Cross River', 'LOC'), ('Donald Duke', 'PERSON'), ('Tony Momoh', 'PERSON'), ('Bayelsa Traditional Rulers', 'ORG'), ('Raph Iwowari Mein VII', 'PERSON'), ('Margaret Ekpo', 'PERSON'), ('OMPADEC', 'ORG'), ('Albert Horsfall', 'PERSON'), ('Federal House of Representative', 'ORG'), ('Nduese Essien', 'PERSON'), ('ECOMOG', 'ORG'), ('Maj-Gen Felix Mujakper', 'PERSON'), ('First', 'ORDINAL'), ('Mfio A

'Nigeria: Pan-Niger Delta summit calls for full control of resources\u2028\u2028Excerpt from article by Akpa Esajere entitled: "Federalism Tops Talks As Pan-Niger Delta Summit Begins" published by Nigerian newspaper The Guardian web site on 20 October\nCalls for a competitive federalism where federating units will have 100 per cent control of resources in their domain and pay tax to the central government formed the thrust of opinions yesterday as the first Pan-Niger Delta conference of Ethnic Nationalities got under way in Calabar.\nExpectedly, speakers at the conference taking place in the Chinua Achebe Arts theatre, University of Calabar, also harped on familiar themes, notably restructuring and resource control.\nThe participants canvassed 100 per cent control of resources.\nImportantly, they want the country restructured into six zones and made functional like the regions of the First Republic.\nPresident of Ethnic Nationality Forum for the Niger Delta Area Chief David Dafinone in