In [1]:
!pip install natasha
!pip install navec
!pip install ipymarkup
!pip install seaborn
!pip install geopy
!pip install branca
!pip install jinja2
!pip install requests
!pip install folium
!pip install intervaltree

import pandas as pd
import string
import os
import sys
import nltk



In [2]:
from natasha import (
    Segmenter,
    MorphVocab,   
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger, 
    PER,
    NamesExtractor,
    AddrExtractor,
    Doc
)

# Build the natasha components used by the rest of the notebook.
segmenter = Segmenter()
morph_vocab = MorphVocab()
# A single embedding instance is shared by the three News* models below.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
# Both extractors are constructed from the same morphological vocabulary.
names_extractor = NamesExtractor(morph_vocab)
addr_extractor = AddrExtractor(morph_vocab)

# Read the source text. The encoding is passed explicitly so that the read does
# not depend on the platform's default locale (assumes the file is UTF-8 --
# TODO confirm). The `with` block closes the file, so no manual close() is needed.
with open('master_marg.txt', encoding = 'utf-8') as f:
    contents = f.read()
doc = Doc(contents)
# divides doc into tokens and sents, given start and stop properties
doc.segment(segmenter)
# every token is morphologically tagged, given pos and feats properties
doc.tag_morph(morph_tagger)
# named entity recognition
doc.tag_ner(ner_tagger)

In [3]:
import folium
from folium import plugins
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
from collections import (
    ChainMap,
    Counter,
    OrderedDict,
    UserDict,
    UserList,
    UserString,
    defaultdict,
    deque,
    namedtuple
)

# nouns = []
# adjs = []
# for token in doc.tokens:
#     if token.pos == 'NOUN':
#     if token.pos == 'ADJ':
#         token.lemmatize(morph_vocab)
#         adjs.append(token.lemma)
# noun_count = Counter(nouns)
# adj_count = Counter(adjs)

# Collect the normalized form of every PER span and count how often each occurs.
person_spans = [entity for entity in doc.spans if entity.type == 'PER']
for entity in person_spans:
    entity.normalize(morph_vocab)
    entity.extract_fact(names_extractor)
people = [entity.normal for entity in person_spans]
person_count = Counter(people)

# One row per person, most frequent first; show the first 50 rows.
df = pd.DataFrame(person_count.most_common(), columns = ['person', 'count'])
df.head(50)

Unnamed: 0,person,count
0,Маргарита,481
1,Иван,250
2,Воланд,229
3,Коровьев,191
4,Пилат,130
5,Азазелло,124
6,Берлиоз,115
7,Никанор Иванович,107
8,Варенуха,90
9,Степа,76


In [9]:
import ipymarkup
from ipymarkup import *
from ipymarkup.palette import *

# Renders the NER spans of a doc as ipymarkup box markup.
def getMarkup(doc):
    """Display `doc.text` with every span in `doc.spans` drawn as a coloured box.

    Each span is normalized with `morph_vocab` and passed through
    `names_extractor` before its (start, stop, type) triple is collected, so
    calling this function also updates the spans of `doc`.

    Colours: PER -> blue, LOC -> purple, ORG -> orange.
    Returns nothing; the markup is shown by `show_span_box_markup`.
    """
    boxes = []
    for entity in doc.spans:
        entity.normalize(morph_vocab)
        entity.extract_fact(names_extractor)
        boxes.append((entity.start, entity.stop, entity.type))
    colours = palette({'PER':'blue','LOC':'purple', 'ORG':'orange'})
    show_span_box_markup(doc.text, boxes, palette = colours)

# getMarkup(doc)
# Lists the names defined in the ipymarkup.palette module (the cell's displayed output).
dir(ipymarkup.palette)

['BLUE',
 'BROWN',
 'Color',
 'GREEN',
 'GREY',
 'MaterialRgb',
 'ORANGE',
 'PALETTE',
 'PURPLE',
 'Palette',
 'RED',
 'Record',
 'Rgb',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'material',
 'palette',
 'prepare_color',
 're']

In [27]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import operator
from geopy.geocoders import Nominatim
from collections import UserList
import sys

# Imported here because this cell is the only place that needs it.
from geopy.extra.rate_limiter import RateLimiter

# Nominatim geocoder for OpenStreetMap data
geolocator = Nominatim(user_agent = 'data_viz')
# Throttle requests to one per second (the public Nominatim service asks for
# this). With its default settings RateLimiter also retries failed requests and
# returns None if they keep failing, so a network error no longer aborts the cell.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds = 1)

# set number of locations to be analyzed (sorted by freq)
num_loc = 30

# normalized text of every LOC span found by the NER tagger
locations = []
for span in doc.spans:
    if span.type == 'LOC':
        span.normalize(morph_vocab)
        span.extract_fact(addr_extractor)
        locations.append(span.normal)
# list of (location, count) pairs, most frequent first
loc_count = Counter(locations).most_common(num_loc)
# wraps list in order to add more functionality
sorted_locations = UserList(loc_count)

# @Emma: using loc_count instead of sorted_locations below causes errors - any idea why?

addresses = []
coordinates = []
for place, _freq in sorted_locations:
    # geocode the location name; the result is None when nothing was found
    location = geocode(place, language = 'en')
    if location is None:
        # Keep one entry per location so the lists stay aligned with
        # sorted_locations. (0, 0) / 0 are the placeholders the rest of the
        # notebook already expects for a failed lookup.
        coordinates.append((0, 0))
        addresses.append(0)
    else:
        coordinates.append((location.latitude, location.longitude))
        addresses.append(location.address)

df1 = pd.DataFrame(sorted_locations, columns = ['location', 'freq'])
df2 = pd.DataFrame(coordinates, columns = ['latitude', 'longitude'])
df3 = pd.DataFrame(addresses, columns = ['address'])

# joins the three dataframes horizontally by setting axis = 1
table = pd.concat([df1, df2, df3], axis=1)
# number the rows from 1 so the index reads as a frequency rank
table.index = table.index + 1

In [28]:
# Put the columns in display order, then show the table.
table = table.loc[:, ['location', 'freq', 'latitude', 'longitude', 'address']]
table

Unnamed: 0,location,freq,latitude,longitude,address
1,Москва,97,55.750446,37.617494,"Moscow, Central Federal District, Russia"
2,Ершалаим,39,0.0,0.0,0
3,Варьете,30,47.911187,106.882857,"Голомт банк Варьете тооцооны төв, Замчдын гуда..."
4,Ялта,29,44.497071,34.158687,"Yalta, Yalta city municipality, Republic of Cr..."
5,Садовая,22,59.927603,30.319382,"Садовая, Сенная площадь, Сенной округ, Saint P..."
6,Патриаршие пруды,17,55.764348,37.591875,"Patriarch Ponds, 48, Presnensky District, Mosc..."
7,Грибоедов,15,40.113191,44.271,"Griboyedov, Vagharshapat region, Armavir Provi..."
8,Кириаф,13,0.0,0.0,0
9,Патриарших,12,55.764583,37.59537,"Немецкая стоматология на Патриарших, Bolshoy K..."
10,Иудея,11,31.947164,35.381571,"Judea and Samaria, Palestinian Territory"


In [14]:
# Map centred on Moscow with one circle per geocoded location, sized by frequency.
map1 = folium.Map(location = [55.750446, 37.617494], zoom_start = 6)

# Locations that could not be geocoded are stored with the placeholder
# coordinates (0, 0); leave them (and any missing coordinates) off the map
# instead of drawing them at 0°N 0°E.
geocoded = table.dropna(subset = ['latitude', 'longitude'])
geocoded = geocoded[(geocoded['latitude'] != 0) | (geocoded['longitude'] != 0)]

for row in geocoded.itertuples(index = False):
    folium.CircleMarker(
        location = [row.latitude, row.longitude],
        # radius must be a single number (pixels), not a list
        radius = float(row.freq),
        popup = row.location
    ).add_to(map1)

map1

In [8]:
# Heat map of the geocoded locations, centred on Moscow.
map2 = folium.Map(location = [55.750446, 37.617494], zoom_start = 5)

# Work on a copy of the relevant rows/columns so `table` itself is not modified.
heat_df = table[table['freq'] != 0]
# ensures you are providing float inputs
heat_df = heat_df[['latitude', 'longitude']].astype(float)
heat_df = heat_df.dropna(axis = 0, subset = ['latitude','longitude'])
# Failed lookups are stored as (0, 0), which dropna does not catch; remove them
# so they do not produce a false hotspot at 0°N 0°E.
heat_df = heat_df[(heat_df['latitude'] != 0) | (heat_df['longitude'] != 0)]

# list of [latitude, longitude] pairs, one per remaining row
heat_data = heat_df.values.tolist()
HeatMap(heat_data).add_to(map2)

map2