In [2]:
import pandas as pd
import wikipedia
import re

We are going to scrape and clean the data from a table in Wikipedia.

In [3]:
webpage = "https://en.wikipedia.org/wiki/List_of_sovereign_states"
countries = pd.read_html(webpage)
#print(countries)
countries = countries[0]["Common and formal names"]
#print(countries)
countries.dropna(inplace=True)
#print(countries)
countries = pd.DataFrame({"Country" : [re.split("– | → ", country)[0] for country in countries.values]})
#print(countries)
countries.drop(countries.tail(12).index, inplace=True)
#print(countries)
countries.drop(countries.head(1).index, inplace=True)
#print(countries)
# countries.drop(countries.tail(214).index, inplace=True) # Comment out
print(countries)

                    Country
1                  Abkhazia
2              Afghanistan 
3                  Albania 
4                  Algeria 
5                  Andorra 
6                   Angola 
7       Antigua and Barbuda
8                Argentina 
9                  Armenia 
10                  Artsakh
11               Australia 
12                 Austria 
13              Azerbaijan 
14            Bahamas, The 
15                 Bahrain 
16              Bangladesh 
17                 Barbados
18                 Belarus 
19                 Belgium 
20                   Belize
21                   Benin 
22                  Bhutan 
23                 Bolivia 
24   Bosnia and Herzegovina
25                Botswana 
26                  Brazil 
27                  Brunei 
28                Bulgaria 
29             Burkina Faso
30                    Burma
..                      ...
195                  Syria 
196                  Taiwan
197             Tajikistan 
198               Ta

In [None]:
#ny = wikipedia.page("New York")


try:
    ny = wikipedia.page("New York")
except wikipedia.exceptions.DisambiguationError as e:
    ny = wikipedia.page(e.options[0])

print(ny.content)


In [None]:
# We are going to scrape the text to associate with our countries

texts = []
for country in countries["Country"]:
    try:
        page = wikipedia.page(country)
    except wikipedia.exceptions.DisambiguationError as e:
        page = wikipedia.page(e.options[0])
    texts.append(page.content)
    print(country)


In [None]:
print(texts[1])

In [None]:
# We are going to pre-process our text

from nltk.tokenize import sent_tokenize
import re
from spacy.lang.en.stop_words import STOP_WORDS
import spacy

# What do these do?
pattern = re.compile("[^a-zA-Z]")
stops = STOP_WORDS
nlp = spacy.load("en", disable=["ner", "parser"])


def clean_text(text):
    new_text = []
    sentences = sent_tokenize(text)
    sentences = [" ".join([pattern.sub("", i) for i in sentence.lower().split() if i not in stops]) for sentence in sentences]
    for sentence in sentences:
        doc = nlp(sentence)
        lemmatized = " ".join([token.lemma_ for token in doc])
        new_text.append(lemmatized)
    new_text = " \n ".join([sentence for sentence in new_text])
    return(new_text)


In [None]:
print(clean_text(texts[1]))

In [8]:
new_texts = []
for text in texts:
    new_texts.append(clean_text(text))

In [9]:
countries["Text"] = new_texts


In [10]:
countries

Unnamed: 0,Country,Text
1,Abkhazia,abkhazia listen selfdeclare sovereign stat...
2,Afghanistan,afghanistan listen pashtodari pashto afnis...
3,Albania,albania listen awlbaynee albanian shqipri sh...
4,Algeria,algeria listen aljeeree arabic romanize al...
5,Andorra,andorra listen uk catalan ndor officially ...
6,Angola,angola listen portuguese l officially republ...
7,Antigua and Barbuda,antigua barbuda listen anteegw barbyood co...
8,Argentina,argentina spanish axentina officially argentin...
9,Armenia,armenia listen armenian romanize hayastan ...
10,Artsakh,republic artsakh armenian artsakhi hanrap...


In [11]:
# We are going to construct our vocab

from random import shuffle

vocab = []
for index, row in countries.iterrows():
    for word in row["Text"].split():
        if word not in vocab:
            vocab.append(word)
shuffle(vocab) #Why shuffle?
print(len(vocab))


10221


In [12]:
#We are going to construct our count vectors

from collections import Counter
import numpy as np

vectors = []
for index, row in countries.iterrows(): #What does iterrows do?
    new_counter = Counter()
    new_counter.update({x:0 for x in vocab})
    for word in row["Text"].split():
        new_counter.update({word:1})
    vector = [new_counter[word] for word in vocab]
    vectors.append(vector)

DataMatrix = np.array(vectors)



In [13]:
countries["Counts"] = [DataMatrix[i] for i in range(len(countries))]

In [14]:
countries

Unnamed: 0,Country,Text,Counts
1,Abkhazia,abkhazia listen selfdeclare sovereign stat...,"[0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 5, 0, 0, 0, 0,..."
2,Afghanistan,afghanistan listen pashtodari pashto afnis...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 2, 0, ..."
3,Albania,albania listen awlbaynee albanian shqipri sh...,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 0, ..."
4,Algeria,algeria listen aljeeree arabic romanize al...,"[0, 0, 0, 4, 0, 0, 1, 1, 0, 3, 2, 0, 0, 1, 0, ..."
5,Andorra,andorra listen uk catalan ndor officially ...,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, ..."
6,Angola,angola listen portuguese l officially republ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, ..."
7,Antigua and Barbuda,antigua barbuda listen anteegw barbyood co...,"[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ..."
8,Argentina,argentina spanish axentina officially argentin...,"[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, ..."
9,Armenia,armenia listen armenian romanize hayastan ...,"[0, 0, 0, 0, 4, 0, 4, 0, 1, 0, 3, 1, 0, 0, 1, ..."
10,Artsakh,republic artsakh armenian artsakhi hanrap...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, ..."
