# Web Scraper for Wikipedia Cities

In [None]:
#Import Libraries
import requests
import bs4
import lxml

#Create List of Links to City Tables
res = requests.get("https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants")
soup = bs4.BeautifulSoup(res.text, "lxml")

a_s = soup.findAll('a')


pages = []

for a in a_s:
    href = a.get('href')
    if 'List_of_towns_and_cities_with_100,000_or_more_inhabitants/country:_' in str(href) and href not in pages:
        pages.append(href)

#Create Separate Lists of City Names and Links to Cities
cities = []
city_names = []

for page in pages:
    link = 'https://en.wikipedia.org' + page

    res = requests.get(link)
    soup = bs4.BeautifulSoup(res.text, "lxml")
    tables = soup.findAll('table')
    
    for table in tables:

        tbody = table.find('tbody')
        rows = tbody.findAll('tr')

        for row in rows[1:]:
            cell = row.find('td')
            a = cell.find('a')
            cities.append(a.get('href'))
            city_names.append(a.get('title'))

#Scrape All Text From Each City Page to List
city_text = []

for city in cities:
    link = 'https://en.wikipedia.org' + city

    res = requests.get(link)
    soup = bs4.BeautifulSoup(res.text, "lxml")
    body = soup.find('body')
    
    all_text = ''
    for p in body.findAll('p'):
        all_text += p.text
        all_text = all_text.replace('\n', '')
        
        chars_to_replace = '''!()-[]{};:'"\,<>./?@#$%^&*_~0123456789'''
 
        for char in chars_to_replace:
            all_text = all_text.replace(char, "")
            all_text = all_text.lower()
        
    city_text.append(all_text)

In [None]:
#Save scraped files
import pickle

with open('city_names', 'wb') as fp:
    pickle.dump(city_names, fp)

with open('city_text', 'wb') as fp:
    pickle.dump(city_text, fp)

# Parsimonious Language Model from scratch

In [1]:
#Import Libraries and Scraped City Names and Wikipedia Page Text
import pandas as pd
import itertools
from decimal import Decimal

with open('city_names', 'rb') as fp:
    city_names = pickle.load(fp)
    
with open('city_text', 'rb') as fp:
    city_text = pickle.load(fp)

In [None]:
#Find indices for UK cities over 100,000 population
print(city_names.index("Basildon"))
print(city_names.index("Swansea"))

In [2]:
#Test that indices for names and text match
print(city_text[3849][:20])
print(city_text[3921][:20])

basildon ˈbæzɪldən b
swansea ˈswɒnzi wels


In [3]:
#Create document collection
uk_city_names = city_names[3849:3922]
uk_city_text = city_text[3849:3922]

In [None]:
#Create Collection Model
def collection_probabilities(collection):
        
    collection = ' '.join(collection)
    all_terms = set(collection.split())
    
    term_probs = []
    for term in all_terms:
        term_probs.append(collection.count(term) / len(collection))
    
    return all_terms, term_probs

In [None]:
#Assign collection terms and collection term probabilities
all_terms, term_probs = collection_probabilities(uk_city_text)

In [None]:
#Save collection terms and collection term probabilities
with open('all_terms', 'wb') as fp:
    pickle.dump(all_terms, fp)

with open('term_probs', 'wb') as fp:
    pickle.dump(term_probs, fp)

In [None]:
#Open collection terms and collection term probabilities
with open('all_terms', 'rb') as fp:
    city_names = pickle.load(fp)
    
with open('term_probs', 'rb') as fp:
    city_text = pickle.load(fp)

In [None]:
#Create dataframe for collection terms and collection term probabilities
col_prob_table = pd.DataFrame(all_terms, columns=['collection terms'])
col_prob_table['collection probability'] = term_probs

In [None]:
#Save dataframe
col_prob_table.to_pickle("col_prob_table.pkl")

In [None]:
#Open dataframe
col_prob_table = pd.read_pickle("col_prob_table.pkl")

In [None]:
#Apply Expectation Maximization Algorithm
def expectation_maximization(document, all_terms, term_probs):
    lam = 0.1
    iterations = 10
    
    doc_probs = []
    for term in all_terms:
        doc_probs.append(document.count(term) / len(document))

    #E-step
    while iterations != 0:
        e_t_values = []
        for i in range(len(all_terms)):
            e_t_values.append((doc_probs[i] * len(document)) * ((lam * doc_probs[i]) / 
                                                                ((1 - lam) * term_probs[i] + lam * doc_probs[i])))
    #M-step
        doc_probs = [(e_t / sum(e_t_values)) for e_t in e_t_values]
        iterations -= 1
        
    for i in range(len(doc_probs)):
        if doc_probs[i] < 0.0001:
            doc_probs[i] = 0
                
    
    return doc_probs

In [None]:
#Run the EM function to index the document models parsimoniously
for i in range(0, 73):
    document = uk_city_text[i]
    city_name = uk_city_names[i]
    doc_probs = expectation_maximization(document, all_terms, term_probs)
    
    col_prob_table[city_name] = doc_probs

In [None]:
prob_table = col_prob_table.sort_values('collection probability', ascending=False)
prob_table[:10]

In [None]:
#Set the index to collection terms to make it searchable
prob_table = prob_table.set_index('collection terms')

In [None]:
#Save dataframe
prob_table.to_pickle("prob_table.pkl")

In [4]:
#Open dataframe
prob_table = pd.read_pickle("prob_table.pkl")

In [5]:
prob_table = prob_table.sort_values('Basildon', ascending=False)
prob_table[:10]

Unnamed: 0_level_0,collection probability,Basildon,Birmingham,Blackburn,Blackpool,Bolton,"Bournemouth, Christchurch and Poole",Bradford,Brighton and Hove,Bristol,...,"Worcester, England",York,Belfast,Aberdeen,Dundee,Edinburgh,Glasgow,Cardiff,"Newport, Wales",Swansea
collection terms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
basil,3.1e-05,0.127294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
basildon,2.8e-05,0.126867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bas,0.000287,0.063447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pitsea,6e-06,0.028275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pits,9e-06,0.026937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
don,0.000636,0.022927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
laindon,3e-06,0.015551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vange,7e-06,0.009567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
essex,3.3e-05,0.00892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
billericay,2e-06,0.008483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#Search Results Using Unigram Model for Ranking Travel Destinations
def parsimonious_search(probability_index_model, city_names, num_results):
    query = input("Search: ").lower()
    q_list = query.split()
    
    #Score calculator
    lam = 0.9
    city_score = []
    for city in city_names[:-1]:
        p = 1
        #Unigram model with smoothing
        for i in q_list:
            p *= (1-lam) * prob_table.loc[i]['collection probability'] + lam * prob_table.loc[i][city]

        city_score.append(p)
    
    
    #Store and sort results
    results = dict(zip(city_names[:-1], city_score))
    results_sorted = dict(reversed(sorted(results.items(), key=lambda item: item[1])))
    
    #Print specified number of results
    results_to_show = dict(itertools.islice(results_sorted.items(),num_results))
    print('You should try visiting:')
    for k, v in results_to_show.items():
        print(f'{k}, it scored {Decimal(v):.2E}')

In [7]:
parsimonious_search(prob_table, uk_city_names, 5)

Search: Seaside town with a castle
You should try visiting:
Newcastle upon Tyne, it scored 5.49E-18
Blackpool, it scored 5.02E-18
Bournemouth, Christchurch and Poole, it scored 2.41E-18
Southend-on-Sea, it scored 9.77E-19
Newport, Wales, it scored 1.47E-21
