In [17]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the article and parse it. The context manager closes the HTTP
# response as soon as BeautifulSoup has consumed it, instead of leaving the
# connection open for the lifetime of the kernel.
url = 'http://en.wikipedia.org/wiki/Python_(programming_language)'
with urlopen(url) as html:
    bs = BeautifulSoup(html, 'html.parser')

# Gather the text of every <p> element inside the div with id
# "mw-content-text" and concatenate it into one string.
paragraphs = bs.find('div', {'id':'mw-content-text'}).find_all('p')
content = ''.join(p.get_text() for p in paragraphs)
print(content)


Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[32]
Python is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.[33][34]
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[35] Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.[36]
Python consistently ranks as one of the most popular programming languages, and has gained widespread use in the machine learning community.[37][38][39][40]
Python was co

In [38]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re 
import string 
import unicodedata

# Patterns are raw strings so that regex escapes such as \[ and \( reach the
# regex engine untouched; in a normal string literal they are invalid escape
# sequences and newer Python versions warn about them.

# Bracketed numeric citation markers, e.g. "[32]" (also matches "[]").
CITATION_REGEX = re.compile(r'\[[0-9]*\]')
# Short parenthesised asides made only of letters, spaces and + . , -
PARENS_REGEX = re.compile(r'\([a-z A-Z \+\.,\-]{0,100}\)')
# A newline followed by letters/spaces and a colon, e.g. "\nNote:".
DESCRIPTION_REGEX = re.compile(r'\n[a-z A-Z]*:')

# Each ASCII punctuation character, individually regex-escaped.
puncts = [re.escape(c) for c in string.punctuation]
# A single character class matches the same characters as a 32-way
# alternation, but is simpler for the regex engine.
PUNCTUATION_REGEX = re.compile('[' + ''.join(puncts) + ']')

# Download and parse the article. The context manager closes the HTTP
# response once BeautifulSoup has read it.
url = 'http://en.wikipedia.org/wiki/Python_(programming_language)'
with urlopen(url) as html:
    bs = BeautifulSoup(html, 'html.parser')

# Raw article text: the text of every <p> inside the div with id
# "mw-content-text", concatenated into a single string.
paragraphs = bs.find('div', {'id':'mw-content-text'}).find_all('p')
content = ''.join(p.get_text() for p in paragraphs)

def replace_newlines(text):
    """Return ``text`` with every newline character swapped for one space."""
    lines = text.split('\n')
    return ' '.join(lines)

def make_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def split_sentences(text):
    """Split ``text`` on the literal separator '. ' and trim each piece.

    Returns a list of strings with surrounding whitespace stripped; the
    separator itself is not kept.
    """
    sentences = []
    for piece in text.split('. '):
        sentences.append(piece.strip())
    return sentences

def strip_citations(text):
    """Return ``text`` with every CITATION_REGEX match deleted."""
    without_citations = CITATION_REGEX.sub('', text)
    return without_citations

def remove_parentheses(text):
    """Return ``text`` with every PARENS_REGEX match deleted."""
    without_parens = PARENS_REGEX.sub('', text)
    return without_parens

def remove_descriptions(text):
    """Return ``text`` with every DESCRIPTION_REGEX match deleted."""
    without_descriptions = DESCRIPTION_REGEX.sub('', text)
    return without_descriptions

def remove_punctuation(text):
    """Return ``text`` with every PUNCTUATION_REGEX match deleted."""
    without_punctuation = PUNCTUATION_REGEX.sub('', text)
    return without_punctuation

def normalize(text):
    """Return ``text`` converted to Unicode normalization form NFKD."""
    form = 'NFKD'
    return unicodedata.normalize(form, text)

# Cleaning steps, applied top to bottom. Order matters:
#   * remove_descriptions runs before replace_newlines because its pattern
#     starts with a newline character;
#   * split_sentences turns the single string into a list of sentences, so
#     every step after it is applied to each sentence separately.
text_operations = [
    strip_citations,
    remove_parentheses,
    remove_descriptions,
    replace_newlines,
    split_sentences,
    make_lowercase,
    remove_punctuation,
    normalize
]

cleaned = content
for op in text_operations:
    # isinstance (rather than an exact type comparison) is the idiomatic
    # check and also accepts list subclasses.
    if isinstance(cleaned, list):
        cleaned = [op(c) for c in cleaned]
    else:
        cleaned = op(cleaned)

print(cleaned)



In [39]:
from collections import Counter

def getNgrams(text, n):
    """Return the word n-grams of ``text`` as space-joined strings.

    Args:
        text: Sentence to tokenize. Tokens are separated by any run of
            whitespace, so repeated spaces (e.g. left behind after
            punctuation was stripped) never produce empty tokens.
        n: Number of words per n-gram.

    Returns:
        A list of n-grams in order of appearance. The list is empty when
        ``n`` is smaller than 1 or ``text`` has fewer than ``n`` words.
    """
    if n < 1:
        return []
    words = text.split()
    return [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

def countNgramsFromSentences(sentences, n):
    """Count how often each n-gram occurs across all ``sentences``.

    Every sentence is passed to getNgrams separately and the results are
    accumulated into a single Counter, which is returned.
    """
    tally = Counter()
    tally.update(
        gram
        for sentence in sentences
        for gram in getNgrams(sentence, n)
    )
    return tally

# Tally the 2-grams of every cleaned sentence, then print all of them
# ordered from most to least frequent.
counts = countNgramsFromSentences(cleaned, n=2)
print(counts.most_common(None))



In [40]:
import pandas as pd

# Tiny demo frame built from three [label, number] rows. No column names are
# supplied, so pandas labels the columns 0 and 1.
df = pd.DataFrame(data=[['a', 1], ['b', 2], ['c', 3]])
df.head()

Unnamed: 0,0,1
0,a,1
1,b,2
2,c,3


In [47]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re

# Download and parse the Wikipedia page. The context manager closes the HTTP
# response once BeautifulSoup has read it.
# NOTE(review): `bs` is not referenced again in the visible part of this
# cell (the table is read from countries.csv) — confirm whether this fetch
# is still needed.
with urlopen('https://en.wikipedia.org/wiki/List_of_countries_with_McDonald%27s_restaurants') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Load the saved table, give the columns of interest short names and keep
# only those columns.
# NOTE(review): countries.csv is presumably an export of the Wikipedia table
# fetched earlier in this cell — confirm its provenance.
df = (
    pd.read_csv('countries.csv')
    .rename(columns={
        '#': 'Order',
        'Country/territory': 'Country',
        'Date of first store': 'Date',
        'First outlet location': 'Location',
        'Max. no. ofoperatingoutlets': 'Outlets'
    })
    [['Order', 'Country', 'Date', 'Location', 'Outlets']]
    # Explicit copy: later column assignments then modify an independent
    # frame and cannot trigger pandas' SettingWithCopyWarning.
    .copy()
)

# Matches dates written like "May 15, 1940".
date_regex = re.compile('[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4}')

def _first_date(raw):
    """Return the first 'Month D, YYYY' substring of ``raw``, or None.

    ``findall(raw)[0]`` raises IndexError whenever a cell holds no date in
    that format and TypeError when the cell is not a string (e.g. NaN), so
    the value is coerced to text and a missing match yields None instead.
    """
    found = date_regex.search(str(raw))
    return found.group(0) if found else None

# Cells listing several dates (e.g. "May 15, 1940Franchise: April 15, 1955")
# are reduced to their first date.
df['Date'] = df['Date'].apply(_first_date)

# Possible next step once the extracted strings have been checked:
#df['Date'] = pd.to_datetime(df['Date'])

# A digit followed by any mix of digits and thousands separators. Requiring
# a leading digit keeps a stray comma from being picked up as the "number".
int_regex = re.compile(r'[0-9][0-9,]*')

def str_to_int(s):
    """Extract the first integer found in ``s``.

    Args:
        s: Text such as '1,466' or '13,449[1]'. Non-string values (e.g. a
            cell pandas already parsed as a number) are converted with
            ``str`` first.

    Returns:
        The first run of digits in ``s`` as an int, with thousands
        separators removed.

    Raises:
        ValueError: If ``s`` contains no digits.
    """
    found = int_regex.search(str(s))
    if found is None:
        raise ValueError('no integer found in %r' % (s,))
    return int(found.group(0).replace(',', ''))

# Convert the textual outlet counts into integers.
df['Outlets'] = df['Outlets'].apply(str_to_int)

# Index the rows by their 'Order' value. Rebinding the name (instead of
# inplace=True) leaves any previously displayed frame untouched.
df = df.set_index(['Order'])

# Scratch queries kept for reference:
# df.sort_values(by=['Outlets', 'Date'], ascending=False)
# df.query('Outlets < 100')
# df.query('Date is not None')
# df.query('Date.isnull()')
# df.query('Date.notnull()')
# df.query('Outlets < 100 & Date < "01-06-1990"')
# df.query('Outlets < 100 | Date < "01-06-1990"')

df.head(10)

Unnamed: 0,Order,Country,Date,Location,Outlets
0,1,United States,"May 15, 1940Franchise: April 15, 1955","San Bernardino, CaliforniaDes Plaines, Illinoi...",13449
1,2,Canada (details),"June 3, 1967","Richmond, British Columbia(Reopened June 23, 2...",1466
2,3,Puerto Rico(territory of United States),"November 10, 1967",San Juan,95
3,4,United States Virgin Islands(territory of Unit...,"September 5, 1970",St. Croix,5
4,5,Costa Rica,"December 8, 1970","San José, 4th street, between 1st and Central ...",73
5,6,Australia,"May 30, 1971","Yagoona, New South Wales[13]",1032
6,7,Guam(territory of United States),"June 10, 1971",Dededo,5
7,8,Japan,"July 21, 1971","Ginza Mitsukoshi, Ginza, Chuo City, Tokyo",2982
8,9,Netherlands,"August 21, 1971",Zaandam,263
9,10,Panama,"September 1, 1971",Panama City,81
