In [1]:
# 导入工具包和英文模型
# python -m spacy download en

import spacy
nlp = spacy.load("en_core_web_sm")

2023-06-06 17:42:44.325645: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 文本处理

In [5]:
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')

In [6]:
# 分词
for token in doc:
    print(token)

Weather
is
good
,
very
windy
and
sunny
.
We
have
no
classes
in
the
afternoon
.


In [7]:
# 分句
for sent in doc.sents:
    print(sent)

Weather is good, very windy and sunny.
We have no classes in the afternoon.


## 词性

In [10]:
for token in doc:
    print('{}-{}'.format(token.token_pos_))

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'token_pos_'

## 命名体的识别

In [11]:
doc_2 = nlp("I want to Pairs where I met my old friend Jack from uni.")

In [12]:
from spacy import displacy
doc = nlp('I want to Pairs where I met my old friend Jack uni.')
displacy.render(doc, style='ent', jupyter=True)

## 找到书中所有人物名字

In [None]:
def read_file(filename):
    with open(filename, 'r') as file:
        return file.read()

In [None]:
# 加载文本数据
text = read_file('./data/pride_and_prejudice.txt')
process_text = nlp(text)

In [None]:
sentences = [sent for sent in process_text.sents]
print(len(sentences))

In [None]:
sentences[:5]

In [None]:
from collections import Counter, defaultdict

def find_person(doc):
    c = Counter()
    for ent in process_text.ents:
        if ent.label_ == 'PERSON':
            c[ent.lemma_]+=1
    return c.most_common(10)

print(find_person(process_text))

## 恐怖袭击分析

In [None]:
def read_file_to_list(filename):
    with open(filename, 'r') as file:
        return file.readlines()

In [None]:
terrorism_articles = read_file_to_list('data/rand-terrorism-dataset.txt')

In [None]:
terrorism_articles[:5]

In [None]:
terrorism_articles_nlp = [nlp(art) for art in terrorism_articles]

In [None]:
common_terrorist_groups = [
    'taliban',
    'a1 - qaeda',
    'hamas',
    'fatah',
    'plo',
    'bilad a1 - rafidayn'
]

common_locations = [
    'iraq',
    'baghdad',
    'kirkuk',
    'mosul',
    'afshanistan',
    'kabul',
    'basra',
    'palestine',
    'gaza',
    'israel',
    'istanbul',
    'beirut',
    'pakistan'
]

In [3]:
location_entity_dict = defaultdict(Counter)

for article in terrorism_articles_nlp:
    article_terrorist_groups = [ent.lemma_ for ent in article.ents if ent.label_ == 'PERSON' or ent.label_ == 'ORG'] # 人或者组织
    article_locations = [ent.lemma_ for ent in article.ents if ent.label_ == 'GPE'] # 地点
    terrorist_common = [ent for ent in article_terrorist_groups if ent in common_terrorist_groups]
    locations_common = [ent for ent in article_locations if ent in common_locations]

    for found_entity  in terrorist_common:
        for found_location in locations_common:
            location_entity_dict[found_entity][found_location] += 1

location_entity_dict

SyntaxError: unexpected EOF while parsing (1835749569.py, line 4)

In [None]:
import  pandas as pd
location_entity_df = pd.DataFrame.from_dict(dict(location_entity_dict), dtype=int)
location_entity_df = location_entity_df.fillna(value=0).astype(int)
location_entity_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 10))
hmap = sns.heatmap(location_entity_df, annot=True, fmt='d', cmap='Y1GnBu', cbar=False)

# add information
plt.title('Global Incidents by Terrorist group')
plt.xticks(rotation=30)
plt.show()