# Explore the **spaCy** package
Felix Zaussinger | 06.01.2021

## Core Analysis Goal(s)
1. Discover the core functionality of the spaCy library
2. Test pre-processing chains on example documents from BBC Monitoring
3. Test simple models

## Key Insight(s)
1.
2.
3.

In [1]:
import os
import sys
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns

# Configure the seaborn theme in a single call. sns.set() resets the plotting
# context to its default ("notebook") unless one is passed explicitly, so a
# separate sns.set_context("poster") issued before it is silently discarded.
# Passing context and style here makes the "poster" context actually apply,
# together with the "ticks" style and the 16x9 default figure size.
sns.set(context="poster", style="ticks", rc={'figure.figsize': (16, 9.)})

import pandas as pd

# Raise both pandas display limits (rows and columns) to 120 in one call;
# set_option accepts alternating option-name / value pairs.
pd.set_option(
    "display.max_rows", 120,
    "display.max_columns", 120,
)

# Root logger: emit records of level INFO and above to standard output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

Define directory structure

In [2]:
# Project root: the parent of the directory this notebook is executed from.
abspath = os.path.abspath('')
project_dir = str(Path(abspath).parents[0])

# Data folders all live under <project_dir>/data/<stage>.
data_raw, data_interim, data_processed = (
    os.path.join(project_dir, "data", stage)
    for stage in ("raw", "interim", "processed")
)

# Folder for figures, directly under the project root.
figure_dir = os.path.join(project_dir, "plots")

Load the example documents (BBC Monitoring text files) and process them with spaCy

In [3]:
import glob
import os

# Glob pattern matching every .txt file in the BBC_2007_07_04_TXT folder
# of the raw data directory.
dp_string = os.path.join(data_raw, "BBC_2007_07_04_TXT", "*.txt")

# glob yields matches in arbitrary order; sort so the list is reproducible.
file_list = sorted(glob.glob(dp_string))

In [4]:
# Number of documents to load from the front of the sorted file list.
n = 10

# Read each of the first n files fully into memory, one string per document.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (which differs between operating systems).
# NOTE(review): assumes the corpus files are UTF-8 encoded; confirm against
# the data source and adjust the encoding if they are not.
corpus = []
for file_path in file_list[:n]:
    with open(file_path, encoding="utf-8") as f_input:
        corpus.append(f_input.read())

In [5]:
import spacy

# Initialise the language object used by the following cells by loading the
# `en_core_web_lg` pipeline. The alternative below loads `en_core_web_sm`
# instead and is kept for quick switching:
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load('en_core_web_lg')

In [6]:
# Run the loaded pipeline on the first document of the corpus; the processed
# result is kept in `doc` for the inspection cells below.
doc = nlp(corpus[0])

In [9]:
# Dump the annotations of every token in the processed document,
# two output lines per token.
for token in doc:
    # First line: text, lemma, pos/tag/dep labels, shape, and the
    # is_alpha / is_stop flags.
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    # Second line: word-vector information for the same token.
    print(
        token.has_vector,
        token.vector_norm,
        token.is_oov,
    )

Nigeria Nigeria PROPN NNP ROOT Xxxxx True False
True 6.989754 False
: : PUNCT : punct : False False
True 5.474056 False
Pan Pan PROPN NNP compound Xxx True False
True 7.387207 False
- - PROPN NNP punct - False False
True 5.6033444 False
Niger Niger PROPN NNP compound Xxxxx True False
True 7.3060813 False
Delta Delta PROPN NNP compound Xxxxx True False
True 6.704845 False
summit summit NOUN NN nsubj xxxx True False
True 7.074887 False
calls call VERB VBZ ROOT xxxx True False
True 5.695754 False
for for ADP IN prep xxx True True
True 4.8435082 False
full full ADJ JJ amod xxxx True True
True 5.2046375 False
control control NOUN NN pobj xxxx True False
True 6.4051557 False
of of ADP IN prep xx True True
True 4.97793 False
resources resource NOUN NNS nmod xxxx True False
True 6.2951474 False
      SPACE _SP     False False
False 0.0 True
Excerpt Excerpt PROPN NNP pobj Xxxxx True False
True 5.3811 False
from from ADP IN prep xxxx True True
True 5.093107 False
article article NOUN NN pobj xxx

In [8]:
# List each entity of the processed document on its own line:
# entity text, start and end character offsets, and entity label.
for ent in doc.ents:
    print(
        ent.text,
        ent.start_char,
        ent.end_char,
        ent.label_,
    )

Nigeria 0 7 GPE
Pan-Niger Delta 9 24 NORP
Akpa Esajere 93 105 PERSON
Federalism Tops Talks 117 138 WORK_OF_ART
Pan-Niger 142 151 NORP
Nigerian 186 194 NORP
Guardian 209 217 ORG
20 October 230 240 DATE
100 per cent 309 321 MONEY
yesterday 427 436 DATE
first 444 449 ORDINAL
Pan-Niger Delta 450 465 ORG
Calabar 518 525 GPE
Expectedly 527 537 ORDINAL
University of Calabar 614 635 ORG
100 per cent 740 752 MONEY
six 828 831 CARDINAL
the First Republic 878 896 GPE
Ethnic Nationality Forum 911 935 ORG
the Niger Delta Area 940 960 LOC
David Dafinone 967 981 PERSON
100 per cent 1078 1090 MONEY
Cross River 1561 1572 LOC
Donald Duke 1582 1593 PERSON
Tony Momoh 1623 1633 PERSON
Bayelsa Traditional Rulers 1647 1673 ORG
Raph Iwowari Mein VII 1689 1710 PERSON
Margaret Ekpo 1746 1759 PERSON
OMPADEC 1780 1787 ORG
Albert Horsfall 1794 1809 PERSON
Federal House of Representative 1819 1850 ORG
Nduese Essien 1857 1870 PERSON
ECOMOG 1879 1885 ORG
Maj-Gen Felix Mujakper 1896 1918 PERSON
First 1920 1925 ORDINAL

In [10]:
# Switch to the `en_core_web_sm` pipeline. This rebinds the notebook-wide
# name `nlp`, so cells run afterwards use this pipeline.
nlp = spacy.load("en_core_web_sm")

# Two short example sentences containing monetary amounts.
texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

# Stream the texts through the pipeline with the tagger and parser components
# disabled, and print the (text, label) pair of every entity in each result.
# The loop also rebinds the notebook-wide name `doc`.
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
    print([(span.text, span.label_) for span in doc.ents])

[('$9.4 million', 'MONEY'), ('the prior year', 'DATE'), ('$2.7 million', 'MONEY')]
[('twelve billion dollars', 'MONEY'), ('1b', 'MONEY')]
