In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer 
from nameparser import HumanName
import spacy

from geotext import GeoText
import geonamescache

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import folium
import polyline

from collections import Counter
import regex

from IPython.core.display import display, HTML

In [2]:
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
eng_stopwords = stopwords.words("english")
gc = geonamescache.GeonamesCache()
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")
eng_stopwords = stopwords.words("english")
Tokenizer = RegexpTokenizer(r"[A-Za-z0-9-]+")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dariu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Around the world in 80 days

In [4]:
url = 'https://www.gutenberg.org/files/103/103-0.txt'

# Scrapping website to get text divided into chapters

In [5]:
def get_request(url):
    """
    Gets html text from url
    
    Input:
        url: string - website url
    
    Output:
        soup: bs4.BeautifulSoup - bs4 object with html text
    """
    request = urlopen(url)
    soup = BeautifulSoup(request.read(), 'lxml')
    
    return soup

In [6]:
def divide_text_for_chapters(soup):
    """
    Finds all chapters of a book
    
    Input:
        soup: bs4.BeautifulSoup - bs4 object with html text
        
    Output:
        chapters: list - list of all chapters from a book Around the World in 80 days
    """
    chapters = soup.text.split('CHAPTER')[38:]
    chapters[-1] = chapters[-1].split('***')[0]
    
    for i, chapter in enumerate(chapters):
        chapters[i] = chapter.replace('\r', '').replace('\n', ' ').split('   ', 1)[1].replace('No. ', '')
        
    return chapters

In [7]:
chapters = divide_text_for_chapters(get_request(url))

## Number of chapters in "Around the world in 80 days"

In [8]:
len(chapters)

37

## Extracting words from chapters

In [9]:
def analyzy_syntax_spacy(text):
    """
    Finds words, which are classified as Geopolitical Entity and Locations by spacy library
    
    Input:
        text: str - sentence in which entities are searched
        
    Output:
        results: list - list of words classified as GPE or LOC
    """
    results = []
    doc = nlp(text)
    for entity in doc.ents:
        if entity.label_ == 'GPE' or entity.label_ == 'LOC':
            results.append(entity.text)
    
    if results == []:
        return None
    
    return results

In [10]:
def create_cities(lista, names):
    """
    Finds cities using GeoText library
    
    Input:
        text: str - sentence in which cities are looking for
        names: list - list of scrapped names by nltk library to exclude misclassifications
        
    Output:
        results: list - list of words classified as a city
    """
    res = GeoText(lista).cities 
    if len(res) > 0:
        res = [notname for notname in res if notname not in names]
        if len(res) == 0:
            return None
        else:
            return res
    else:
        return None

In [11]:
def get_human_names(text):
    """
    Fidns all names and surnames in a book using nltk library
    
    Input:
        text: str - a single chapter of a book "Around the world in 80 days"
        
    Output:
        person_list: list - list of words classified as a city    
    """
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary = False)
    person_list = []
    person = []
    name = ""
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        if len(person) > 1:
            for part in person:
                name += part + ' '
            if name[:-1] not in person_list:
                person_list.append(name[:-1])
            name = ''
        person = []

    return person_list

In [12]:
def filter_human_names(chapters):
    """
    Creates lists of all names and surnames in a book using nltk library
    
    Input:
        chapters: list - list of all chapters from a book Around the World in 80 days
        
    Output:
        people_list: list - list of all people present in a book
    """
    list_of_names = []
    for chapter in chapters:
        names = get_human_names(chapter)
        for name in names: 
            list_of_names.extend([HumanName(name).last, HumanName(name).first])
             
    people_list = Counter(list_of_names).most_common()
    people_list = [i[0] for i in people_list if i[1] > 2]
    
    return people_list

### Finds all human names to be excluded later

In [13]:
people_list = filter_human_names(chapters)

## Creating dataframe

In [14]:
def create_dataframe(chapters):
    """
    Creates dataframe, where every row is represented by a single sentence. There are information about number of chapter for every sentence and order of sentences inside one chapter.
    Additionaly, there are names of cities, which were found using two different functions analyzy_syntax_spacy and create_cities.
    
    Input:
        chapters: list - list of all chapters from a book Around the World in 80 days
        
    Output:
        df: pandas.DataFrame - a dataframe with all sentences in a book
    """
    
    df = pd.DataFrame({'Chapter': [], 'Sentence index': [], 'Sentence': []})
    for i, chapter in enumerate(chapters):
        a_list = nltk.tokenize.sent_tokenize(chapter)
        df = df.append(pd.DataFrame({'Chapter': [i+1]*len(a_list), 'Sentence index': range(1, len(a_list)+1), 'Sentence': a_list}), ignore_index=True)

    df['Chapter'] = df['Chapter'].astype(int)
    df['Sentence index'] = df['Sentence index'].astype(int)    
    
    
    df['Cities1'] = df.apply(lambda row: create_cities(row.Sentence, people_list), axis=1)
    df['Cities2'] = df.apply(lambda row: analyzy_syntax_spacy(row.Sentence), axis=1)
    
    return df

In [15]:
df = create_dataframe(chapters)

In [16]:
df

Unnamed: 0,Chapter,Sentence index,Sentence,Cities1,Cities2
0,1,1,"Mr. Phileas Fogg lived, in 1872, at 7, Saville...",,
1,1,2,He was one of the most noticeable members of t...,,
2,1,3,People said that he resembled Byron—at least t...,,
3,1,4,"Certainly an Englishman, it was more doubtful ...",,
4,1,5,"He was never seen on ’Change, nor at the Bank,...","[Temple, Lincoln]",[London]
...,...,...,...,...,...
2799,37,48,What had he really gained by all this trouble?,,
2800,37,49,What had he brought back from this long and we...,,
2801,37,50,"Nothing, say you?",,
2802,37,51,"Perhaps so; nothing but a charming woman, who,...",,


## Dropping rows where Cities1 and Cities2 are null at the same time

In [17]:
def drop_nulls(df):
    return df[~df['Cities1'].isnull() | ~df['Cities2'].isnull()]

In [18]:
df = drop_nulls(df)

In [19]:
df

Unnamed: 0,Chapter,Sentence index,Sentence,Cities1,Cities2
4,1,5,"He was never seen on ’Change, nor at the Bank,...","[Temple, Lincoln]",[London]
23,1,24,It was at least certain that Phileas Fogg had ...,[London],[London]
30,1,31,"He lived alone in his house in Saville Row, wh...",,[Saville Row]
34,1,35,He passed ten hours out of the twenty-four in ...,,[Saville Row]
38,1,39,"The mansion in Saville Row, though not sumptuo...",,[Saville Row]
...,...,...,...,...,...
2777,37,26,"In other words, while Phileas Fogg, going east...",[London],[London]
2779,37,28,"And Passepartout’s famous family watch, which ...",[London],[London]
2787,37,36,"If you had not suggested our marriage, my serv...",[Wilson],
2793,37,42,"Why, I’ve just this instant found out—” “What...",,[India]


## Exploding every cities from one row to seperate entries

In [20]:
def explode_rows(df):
    df = df.explode('Cities1')
    df = df.explode('Cities2')
    df = df.reset_index()
    
    return df

In [21]:
df = explode_rows(df)

In [22]:
df

Unnamed: 0,index,Chapter,Sentence index,Sentence,Cities1,Cities2
0,4,1,5,"He was never seen on ’Change, nor at the Bank,...",Temple,London
1,4,1,5,"He was never seen on ’Change, nor at the Bank,...",Lincoln,London
2,23,1,24,It was at least certain that Phileas Fogg had ...,London,London
3,30,1,31,"He lived alone in his house in Saville Row, wh...",,Saville Row
4,34,1,35,He passed ten hours out of the twenty-four in ...,,Saville Row
...,...,...,...,...,...,...
1444,2777,37,26,"In other words, while Phileas Fogg, going east...",London,London
1445,2779,37,28,"And Passepartout’s famous family watch, which ...",London,London
1446,2787,37,36,"If you had not suggested our marriage, my serv...",Wilson,
1447,2793,37,42,"Why, I’ve just this instant found out—” “What...",,India


## Using GeonamesCache() let's check if values from Cities1 and Cities2 are real cities

In [23]:
def validate_city(city):
    return gc.search_cities(city, case_sensitive=False)

In [24]:
def find_real_cities(df):
    real_cities = []
    weirdos = []
    for city in list(set(df['Cities1']).union(set(df['Cities2']))):
        if city != None:           
            res = validate_city(city)
        if len(res) == 0:
            weirdos.append(city)
        else:
            real_cities.append((res[0]['latitude'], res[0]['longitude']))
            
    return real_cities, weirdos

In [25]:
real_cities, weirdos = find_real_cities(df)

## Deleting non-cities objects from dataframe

In [26]:
for i in range(len(df)):
    if df['Cities1'].iloc[i] in weirdos:
        df.at[i, 'Cities1'] = None
    if df['Cities2'].iloc[i] in weirdos:
        df.at[i, 'Cities2'] = None
        
df = df[~df['Cities1'].isnull() | ~df['Cities2'].isnull()]

In [27]:
df

Unnamed: 0,index,Chapter,Sentence index,Sentence,Cities1,Cities2
0,4,1,5,"He was never seen on ’Change, nor at the Bank,...",Temple,London
1,4,1,5,"He was never seen on ’Change, nor at the Bank,...",Lincoln,London
2,23,1,24,It was at least certain that Phileas Fogg had ...,London,London
6,49,1,50,"Then I got to be a professor of gymnastics, so...",Paris,Paris
12,62,2,1,"“Faith,” muttered Passepartout, somewhat flurr...",London,London
...,...,...,...,...,...,...
1444,2777,37,26,"In other words, while Phileas Fogg, going east...",London,London
1445,2779,37,28,"And Passepartout’s famous family watch, which ...",London,London
1446,2787,37,36,"If you had not suggested our marriage, my serv...",Wilson,
1447,2793,37,42,"Why, I’ve just this instant found out—” “What...",,India


## Function to create a map

In [28]:
def get_map(cities, polylines=False):
    m = folium.Map(location=[0, 0], zoom_start=2)

    if polylines:
        folium.PolyLine(
            cities,
            weight=8,
            color='blue',
            opacity=0.6
        ).add_to(m)

    for i in range(len(cities)):
        folium.Marker(
            location=[cities[i][0], cities[i][1]],
            icon=folium.Icon(color="orange",icon="fas fa-bolt", prefix='fa')
        ).add_to(m)

    return m

## Let's take a look for a map with cities, which we obtain

In [29]:
get_map(real_cities)

## Let's take a look at specific words in sentences in which cities appeared

In [30]:
def cleaning_comments(comments):
    cleaned_comments = []
    ps = PorterStemmer()
    wl=WordNetLemmatizer()
    for comment in comments:
        comment = comment.lower()
        comment = Tokenizer.tokenize(comment)
        comment = [word for word in comment if word not in eng_stopwords]
        comment = [word for word in comment if len(word)>2] 
        comment = [word for word in comment if not word.isnumeric()]
        comment = [wl.lemmatize(i) for i in comment]
        comment = [ps.stem(i) for i in comment]

        cleaned_comments.append(comment)

    cleaned_comments_combined = [word for sublist in cleaned_comments for word in sublist]
    
    return cleaned_comments_combined

## Creating new column with lemmatized and stemmed words

In [31]:
df['Cleaning'] = df.apply(lambda row: cleaning_comments([row.Sentence]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [32]:
df

Unnamed: 0,index,Chapter,Sentence index,Sentence,Cities1,Cities2,Cleaning
0,4,1,5,"He was never seen on ’Change, nor at the Bank,...",Temple,London,"[never, seen, chang, bank, counting-room, citi..."
1,4,1,5,"He was never seen on ’Change, nor at the Bank,...",Lincoln,London,"[never, seen, chang, bank, counting-room, citi..."
2,23,1,24,It was at least certain that Phileas Fogg had ...,London,London,"[least, certain, philea, fogg, absent, london,..."
6,49,1,50,"Then I got to be a professor of gymnastics, so...",Paris,Paris,"[got, professor, gymnast, make, better, use, t..."
12,62,2,1,"“Faith,” muttered Passepartout, somewhat flurr...",London,London,"[faith, mutter, passepartout, somewhat, flurri..."
...,...,...,...,...,...,...,...
1444,2777,37,26,"In other words, while Phileas Fogg, going east...",London,London,"[word, philea, fogg, go, eastward, saw, sun, p..."
1445,2779,37,28,"And Passepartout’s famous family watch, which ...",London,London,"[passepartout, famou, famili, watch, alway, ke..."
1446,2787,37,36,"If you had not suggested our marriage, my serv...",Wilson,,"[suggest, marriag, servant, would, gone, rever..."
1447,2793,37,42,"Why, I’ve just this instant found out—” “What...",,India,"[instant, found, might, made, tour, world, sev..."


## Words connected to travelling

In [33]:
synonyms = ['arrive', 'travel', 'depart', 'leave', 'movement', 'move', 'trip', 
            'navigation', 'ride', 'drive', 'embark',
            'walk', 'disembark', 'reach', 'stop']

synonyms = cleaning_comments(synonyms)

In [34]:
def find_words(lista):
    res = [word for word in lista if word in synonyms]  
    if res == []:
        return None
    else:
        return set(res)

In [35]:
df['Words'] = df.apply(lambda row: find_words(row.Cleaning), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
df[~df['Words'].isnull()]

Unnamed: 0,index,Chapter,Sentence index,Sentence,Cities1,Cities2,Cleaning,Words
48,131,3,28,Detectives were also charged with narrowly wat...,London,London,"[detect, also, charg, narrowli, watch, arriv, ...",{arriv}
175,247,5,14,"He might, perhaps, reckon on the arrival of tr...",,India,"[might, perhap, reckon, arriv, train, design, ...",{arriv}
205,274,6,7,"This was Fix, one of the detectives who had be...",Suez,,"[fix, one, detect, dispatch, england, search, ...",{arriv}
206,274,6,7,"This was Fix, one of the detectives who had be...",Suez,London,"[fix, one, detect, dispatch, england, search, ...",{arriv}
207,274,6,7,"This was Fix, one of the detectives who had be...",London,,"[fix, one, detect, dispatch, england, search, ...",{arriv}
...,...,...,...,...,...,...,...,...
1438,2722,36,25,When the clock indicated twenty minutes past e...,Liverpool,Liverpool,"[clock, indic, twenti, minut, past, eight, and...",{arriv}
1439,2729,36,32,"You know, besides, that the ‘China’—the only s...",New York,New York,"[know, besid, china, steamer, could, taken, ne...",{arriv}
1441,2753,37,2,The reader will remember that at five minutes ...,London,London,"[reader, rememb, five, minut, past, eight, eve...","{travel, arriv}"
1442,2753,37,2,The reader will remember that at five minutes ...,Wilson,London,"[reader, rememb, five, minut, past, eight, eve...","{travel, arriv}"


## Removing continents

In [37]:
continents = []
geocache = gc.get_continents()
for continent in geocache:
    continents.append((geocache[continent]['toponymName']))

In [38]:
df = df[~df['Cities1'].isin(continents)]
df = df[~df['Cities2'].isin(continents)]

## Creating ordered list of cities

In [39]:
df = df[df['Chapter'] != 37]

In [40]:
ordered_cities = ['London']
for i in range(len(df[~df['Words'].isnull()]['Cities1'])):
    if df[~df['Words'].isnull()]['Cities1'].iloc[i] not in ordered_cities:
        ordered_cities.append(df[~df['Words'].isnull()]['Cities1'].iloc[i])
    if df[~df['Words'].isnull()]['Cities2'].iloc[i] not in ordered_cities:
        ordered_cities.append(df[~df['Words'].isnull()]['Cities2'].iloc[i])
        
ordered_cities.remove(None)

In [41]:
for dest in ordered_cities:
    alternatenames = gc.get_cities_by_name(dest)
    if len(alternatenames) > 0:
        alternatenames = gc.get_cities_by_name(dest)[0][list(gc.get_cities_by_name(dest)[0].keys())[0]]['alternatenames'] 
    try:
        gc.get_countries_by_names()[dest]
        if len(alternatenames) > 0:
            alternatenames = gc.get_cities_by_name(dest)[0][list(gc.get_cities_by_name(dest)[0].keys())[0]]['alternatenames'] 
        if dest in alternatenames:
            pass
        else:
            ordered_cities.remove(dest)
    except KeyError:
        pass

In [42]:
ordered_cities_2 = []
for city in ordered_cities:
    if ' '.join(chapters).count(city) > 2:
        ordered_cities_2.append(city)
        
ordered_cities = ordered_cities_2

In [43]:
ordered_cities

['London',
 'Suez',
 'Brindisi',
 'Paris',
 'Bombay',
 'Calcutta',
 'Singapore',
 'Hong Kong',
 'Yokohama',
 'San Francisco',
 'New York',
 'Aden',
 'Allahabad',
 'Nagasaki',
 'Shanghai',
 'Union',
 'Omaha',
 'Salt Lake City',
 'Sacramento',
 'Salt Lake',
 'Ogden',
 'Missouri',
 'Kearney',
 'Liverpool',
 'Chicago',
 'Hudson',
 'Queenstown',
 'Dublin']

In [44]:
len(ordered_cities)

28

## 'Missouri' and 'Hudson' are rivers, we need to remove them

In [45]:
rivers = []
for city in ordered_cities:
    values = wn.synsets(city)
    for i in range(len(values)):
        if 'river' in values[i].definition()[:25]:
            rivers.append(city)

for item in rivers:
    ordered_cities.remove(item)

## Adding longitude and latitude by all cities from ordered list

In [46]:
destinations = []
for row in ordered_cities:
    results = gc.search_cities(row, case_sensitive=False)
    if row == 'Queenstown':
        destinations.append([-8.29924, 51.85237, 'Cobh (Queenstown)'])  
    elif len(results) > 1:
        place = results[np.argmax([i['population'] for i in results])]
        destinations.append([place['longitude'], place['latitude'], row])
    else:
        destinations.append([results[0]['longitude'], results[0]['latitude'], row])

In [47]:
destinations

[[-0.12574, 51.50853, 'London'],
 [32.52627, 29.97371, 'Suez'],
 [17.93607, 40.63215, 'Brindisi'],
 [2.3488, 48.85341, 'Paris'],
 [72.88261, 19.07283, 'Bombay'],
 [88.36304, 22.56263, 'Calcutta'],
 [103.85007, 1.28967, 'Singapore'],
 [114.17469, 22.27832, 'Hong Kong'],
 [139.65, 35.43333, 'Yokohama'],
 [-122.41942, 37.77493, 'San Francisco'],
 [-74.00597, 40.71427, 'New York'],
 [45.03667, 12.77944, 'Aden'],
 [81.84322, 25.44478, 'Allahabad'],
 [129.88333, 32.75, 'Nagasaki'],
 [121.45806, 31.22222, 'Shanghai'],
 [-74.2632, 40.6976, 'Union'],
 [-95.94043, 41.25626, 'Omaha'],
 [-111.89105, 40.76078, 'Salt Lake City'],
 [-121.4944, 38.58157, 'Sacramento'],
 [-111.89105, 40.76078, 'Salt Lake'],
 [-111.97383, 41.223, 'Ogden'],
 [-74.14542, 40.76843, 'Kearney'],
 [-2.97794, 53.41058, 'Liverpool'],
 [-87.65005, 41.85003, 'Chicago'],
 [-8.29924, 51.85237, 'Cobh (Queenstown)'],
 [-6.24889, 53.33306, 'Dublin']]

## Sorting all locations and find connections between them

In [48]:
destinations.sort()

In [49]:
to_delete = []
for i in range(len(destinations)):
    distances = []
    for j in range(len(destinations)):
        if i<j:
            if ((destinations[i][0] - destinations[j][0])**2 + (destinations[i][1] - destinations[j][1])**2)**(1/2) < 0.1:
                to_delete.append(destinations[i])
    
    
for i in range(len(to_delete)):
    destinations.remove(to_delete[i])

In [50]:
destinations.sort()

## Final cities and map

In [56]:
destinations

[[-122.41942, 37.77493, 'San Francisco'],
 [-121.4944, 38.58157, 'Sacramento'],
 [-111.97383, 41.223, 'Ogden'],
 [-111.89105, 40.76078, 'Salt Lake City'],
 [-95.94043, 41.25626, 'Omaha'],
 [-87.65005, 41.85003, 'Chicago'],
 [-74.2632, 40.6976, 'Union'],
 [-74.14542, 40.76843, 'Kearney'],
 [-74.00597, 40.71427, 'New York'],
 [-8.29924, 51.85237, 'Cobh (Queenstown)'],
 [-6.24889, 53.33306, 'Dublin'],
 [-2.97794, 53.41058, 'Liverpool'],
 [-0.12574, 51.50853, 'London'],
 [2.3488, 48.85341, 'Paris'],
 [17.93607, 40.63215, 'Brindisi'],
 [32.52627, 29.97371, 'Suez'],
 [45.03667, 12.77944, 'Aden'],
 [72.88261, 19.07283, 'Bombay'],
 [81.84322, 25.44478, 'Allahabad'],
 [88.36304, 22.56263, 'Calcutta'],
 [103.85007, 1.28967, 'Singapore'],
 [114.17469, 22.27832, 'Hong Kong'],
 [121.45806, 31.22222, 'Shanghai'],
 [129.88333, 32.75, 'Nagasaki'],
 [139.65, 35.43333, 'Yokohama']]

In [53]:
get_map([(k[1], k[0]) for k in destinations], polylines=True)

In [54]:
[(k[0], k[1]) for k in destinations]

[(-122.41942, 37.77493),
 (-121.4944, 38.58157),
 (-111.97383, 41.223),
 (-111.89105, 40.76078),
 (-95.94043, 41.25626),
 (-87.65005, 41.85003),
 (-74.2632, 40.6976),
 (-74.14542, 40.76843),
 (-74.00597, 40.71427),
 (-8.29924, 51.85237),
 (-6.24889, 53.33306),
 (-2.97794, 53.41058),
 (-0.12574, 51.50853),
 (2.3488, 48.85341),
 (17.93607, 40.63215),
 (32.52627, 29.97371),
 (45.03667, 12.77944),
 (72.88261, 19.07283),
 (81.84322, 25.44478),
 (88.36304, 22.56263),
 (103.85007, 1.28967),
 (114.17469, 22.27832),
 (121.45806, 31.22222),
 (129.88333, 32.75),
 (139.65, 35.43333)]