In [1]:
import pandas as pd
import numpy as np
import wikipedia
import requests
from bs4 import BeautifulSoup
import time

In [2]:
print(wikipedia.WikipediaPage(title = 'Celine Dion Live 2018').summary)

Celine Dion Live 2018 was a concert tour by Canadian singer Celine Dion to support her greatest hits album, The Best So Far... 2018 Tour Edition (2018). It marked Dion's first concerts in Asia and Australia since the Taking Chances World Tour in 2008. The tour began on 26 June 2018 in Tokyo, Japan and concluded on 14 August 2018 in Auckland, New Zealand, making a total of 22 concerts performed. The tour grossed $56.5 million.


In [3]:
section = wikipedia.WikipediaPage('Celine Dion Live 2018').section('Commercial reception')
section = section.replace('\n','').replace("\'","")

In [4]:
section

'Dion brought her 2018 summer tour to a close with grosses of $56.5 million. Played across 22 shows in Asia, Australia, and New Zealand, she has sold 259,443 tickets. The biggest engagement of the tour was a three-night stay at Taiwans Taipei Arena with over $10.7 million grossed, though her one show at the Tokyo Dome delivered the biggest attendance tally, at 42,748.'

In [5]:
# first pull the HTML from the page that links to all of the pages with the links.
# in this case, this page gives the links list pages of sci-fi films by decade.
# just go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films
# to see what I'm pulling from.
html = requests.get('https://en.wikipedia.org/wiki/Lists_of_science_fiction_films')

#turn the HTML into a beautiful soup text object
b = BeautifulSoup(html.text, 'lxml')
# create an mpty list where those links will go.
links = []

# in this case, all of the links we're in a '<li>' brackets.
for i in b.find_all(name = 'li'):
    # pull the actual link for each one
    for link in i.find_all('a', href=True):
        links.append(link['href'])
# the above code ends up pulling more links than I want,
# so I just use the ones I want
links = links[1:11]
# each link only returns something like 'wiki/List_of_science_fiction_films_of_the_1920s'
# so I add the other part of the URL to each.
decade_links = ['https://en.wikipedia.org' + i for i in links]

# create two new lists, one for the title of the page, 
# and one for the link to the page
film_titles = []
film_links = []
# for loop to pull from each decade page with list of films.
# look at https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
# to follow along as an exampe
for decade in decade_links:
    print(f'Collecting films from {decade}')
    html = requests.get(decade)
    b = BeautifulSoup(html.text, 'lxml')
    # get to the table on the page
    for i in b.find_all(name='table', class_='wikitable'):
        # get to the row of each film
        for j in i.find_all(name='tr'):
            #get just the title cell for each row.
            # contains the title and the URL
            for k in j.find_all(name='i'):
                # get within that cell to just get the words
                for link in k.find_all('a', href=True):
                    # get the title and add to the list
                    film_titles.append(link['title'])
                    # get the link and add to that list
                    film_links.append(link['href'])
    #be a conscientious scraper and pause between scrapes
    time.sleep(1)
print(f'Number of Film Links Collected: {len(film_links)}')
print(f'Number of Film Titles Collected: {len(film_titles)}')
# remove film links that don't have a description page on Wikipedia
new_film_links = [i for i in film_links if 'redlink' not in i]
# same goes for titles
new_film_titles = [i for i in film_titles if '(page does not exist)' not in i]
print(f'Number of Film Links with Wikipedia Pages: {len(new_film_links)}')
print(f'Number of Film Titles with Wikipedia Pages: {len(new_film_titles)}')
#use this list to fetch from the API
title_links = list(zip(new_film_titles, new_film_links))

Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1930s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1940s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1950s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1960s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1970s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1980s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1990s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2000s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2010s
Number of Film Links Collected: 1862
Number of Fil

# CELINE DION

In [6]:
celine_url = requests.get('https://en.wikipedia.org/wiki/Celine_Dion_Live_2018').text

In [7]:
soup = BeautifulSoup(celine_url,'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Celine Dion Live 2018 - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Celine_Dion_Live_2018","wgTitle":"Celine Dion Live 2018","wgCurRevisionId":868933633,"wgRevisionId":868933633,"wgArticleId":56211618,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Use dmy dates from January 2018","2018 concert tours","Celine Dion concert tours"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames

In [8]:
My_table = soup.find('table',{'class':'wikitable'})

In [9]:
My_table

<table class="wikitable" style="text-align:center;">
<caption>List of concerts, showing date, city, country, venue, tickets sold, amount of available tickets and gross revenue
</caption>
<tbody><tr>
<th scope="col" style="width:12em;">Date
</th>
<th scope="col" style="width:10em;">City
</th>
<th scope="col" style="width:10em;">Country
</th>
<th scope="col" style="width:16em;">Venue
</th>
<th scope="col" style="width:10em;">Opening act
</th>
<th scope="col" style="width:10em;">Attendance
</th>
<th scope="col" style="width:10em;">Revenue
</th></tr>
<tr>
<th colspan="7">Asia<sup class="reference" id="cite_ref-dates_6-4"><a href="#cite_note-dates-6">[6]</a></sup><sup class="reference" id="cite_ref-boxscores_17-0"><a href="#cite_note-boxscores-17">[17]</a></sup>
</th></tr>
<tr>
<td>26 June 2018
</td>
<td><a href="/wiki/Tokyo" title="Tokyo">Tokyo</a>
</td>
<td>Japan
</td>
<td><a href="/wiki/Tokyo_Dome" title="Tokyo Dome">Tokyo Dome</a>
</td>
<td rowspan="12"><a href="/wiki/V%C3%A9ronic_DiCai

In [10]:
links = My_table.findAll('a')
links

[<a href="#cite_note-dates-6">[6]</a>,
 <a href="#cite_note-boxscores-17">[17]</a>,
 <a href="/wiki/Tokyo" title="Tokyo">Tokyo</a>,
 <a href="/wiki/Tokyo_Dome" title="Tokyo Dome">Tokyo Dome</a>,
 <a href="/wiki/V%C3%A9ronic_DiCaire" title="Véronic DiCaire">Véronic DiCaire</a>,
 <a href="/wiki/Macau" title="Macau">Macau</a>,
 <a href="/wiki/Cotai_Arena" title="Cotai Arena">Cotai Arena</a>,
 <a href="/wiki/Singapore" title="Singapore">Singapore</a>,
 <a href="/wiki/Marina_Bay_Sands" title="Marina Bay Sands">Marina Bay Sands Grand Ballroom</a>,
 <a href="/wiki/Jakarta" title="Jakarta">Jakarta</a>,
 <a href="/wiki/Sentul_International_Convention_Center" title="Sentul International Convention Center">Sentul International Convention Center</a>,
 <a href="/wiki/Taipei" title="Taipei">Taipei</a>,
 <a href="/wiki/Taipei_Arena" title="Taipei Arena">Taipei Arena</a>,
 <a href="/wiki/Manila" title="Manila">Manila</a>,
 <a href="/wiki/Mall_of_Asia_Arena" title="Mall of Asia Arena">Mall of Asia Aren

In [11]:
Countries = []
for link in links :
    Countries.append(link.get('title'))
print(Countries)

[None, None, 'Tokyo', 'Tokyo Dome', 'Véronic DiCaire', 'Macau', 'Cotai Arena', 'Singapore', 'Marina Bay Sands', 'Jakarta', 'Sentul International Convention Center', 'Taipei', 'Taipei Arena', 'Manila', 'Mall of Asia Arena', 'Bangkok', 'Impact, Muang Thong Thani', None, None, None, 'Sydney', 'Qudos Bank Arena', 'Brisbane', 'Brisbane Entertainment Centre', 'Perth', 'Perth Arena', 'Melbourne', 'Rod Laver Arena', 'Auckland', 'Spark Arena']


In [43]:
df = pd.DataFrame()
df['Country'] = Countries
df

Unnamed: 0,Country
0,
1,
2,Tokyo
3,Tokyo Dome
4,Véronic DiCaire
5,Macau
6,Cotai Arena
7,Singapore
8,Marina Bay Sands
9,Jakarta


In [13]:
links_1 = My_table.findAll('td')
links_1

[<td>26 June 2018
 </td>, <td><a href="/wiki/Tokyo" title="Tokyo">Tokyo</a>
 </td>, <td>Japan
 </td>, <td><a href="/wiki/Tokyo_Dome" title="Tokyo Dome">Tokyo Dome</a>
 </td>, <td rowspan="12"><a href="/wiki/V%C3%A9ronic_DiCaire" title="Véronic DiCaire">Véronic DiCaire</a>
 </td>, <td>42,748 / 42,748
 </td>, <td>$5,784,430
 </td>, <td>29 June 2018
 </td>, <td rowspan="2"><a href="/wiki/Macau" title="Macau">Macau</a>
 </td>, <td rowspan="2">China
 </td>, <td rowspan="2"><a href="/wiki/Cotai_Arena" title="Cotai Arena">Cotai Arena</a>
 </td>, <td rowspan="2">19,809 / 19,809
 </td>, <td rowspan="2">$4,017,395
 </td>, <td>30 June 2018
 </td>, <td>3 July 2018
 </td>, <td rowspan="2"><a href="/wiki/Singapore" title="Singapore">Singapore</a>
 </td>, <td rowspan="2">Singapore
 </td>, <td rowspan="2"><a href="/wiki/Marina_Bay_Sands" title="Marina Bay Sands">Marina Bay Sands Grand Ballroom</a>
 </td>, <td rowspan="2">12,516 / 12,516
 </td>, <td rowspan="2">$4,203,989
 </td>, <td>4 July 2018
 </td>

In [14]:
Date = []
for link_1 in links_1 :
    Countries.append(link_1.get('rowspan'))
print(Date)

[]


In [15]:
import wikipedia, re

def get_hispanic_population_data():
    #"Get the details of hispanic and latino population by state/territory"
    wiki_search_string = "hispanic and latino population"
    wiki_page_title = "List of U.S. states by Hispanic and Latino population"
    wiki_table_caption = "Hispanic and Latino Population by state or territory"
    parsed_table_data = []
 
    search_results = wikipedia.search(wiki_search_string)
    for result in search_results:
        if wiki_page_title in result:
            my_page = wikipedia.page(result)
            #download the HTML source
            soup = BeautifulSoup(my_page.html(),"lxml")
            #Using a simple regex to do 'caption contains string'
            table = soup.find('caption',text=re.compile(r'%s'%wiki_table_caption)).findParent('table')
            rows = table.findAll('tr')
            for row in rows:
                children = row.findChildren(recursive=False)
                row_text = []
                for child in children:
                    clean_text = child.text
                    #This is to discard reference/citation links
                    clean_text = clean_text.split('&#91;')[0]
                    #This is to clean the header row of the sort icons
                    clean_text = clean_text.split('&#160;')[-1]
                    clean_text = clean_text.strip()
                    row_text.append(clean_text)
                parsed_table_data.append(row_text)
    return parsed_table_data
 
#----START OF SCRIPT
if __name__=="__main__":
    print ('Hispanic and Latino population data in the USA looks like this:\n\n')
    hispanic_population_data = get_hispanic_population_data() 
    for row in hispanic_population_data:
        print ('|'.join(row))

Hispanic and Latino population data in the USA looks like this:




AttributeError: 'NoneType' object has no attribute 'findParent'

In [42]:
#Get the html source
html = wikipedia.page("List of U.S. states by Hispanic and Latino population").html().encode("UTF-8")
df_3 = pd.read_html(html)[0]
df_3.to_csv('beautifulsoup_pandas.csv',header=0,index=False)
print (df_3)

                           0         1           2          3           4  \
0            State/Territory  Pop 2000  % pop 2000   Pop 2010  % pop 2010   
1                    Alabama     75830        1.7%     185602        3.9%   
2                     Alaska     25852        4.1%      39250        5.5%   
3                    Arizona   1295617       25.3%    1895149       29.6%   
4                   Arkansas     86866        3.2%     186050        6.4%   
5                 California  10966556       32.4%   14013719       37.6%   
6                   Colorado    735801       17.1%    1038687       20.7%   
7                Connecticut    320323        9.4%     479087       13.4%   
8                   Delaware     37277        4.8%      73221        8.2%   
9       District of Columbia     44953        7.9%      54749        9.1%   
10                   Florida   2682715       16.8%    4223806       22.5%   
11                   Georgia    435227        5.3%     853689        8.8%   

In [22]:
saved_rowspans = []
for row in My_table.findAll("tr"):
    cells = row.findAll(["th", "td"])

    if len(saved_rowspans) == 0:
        saved_rowspans = [None for _ in cells]


In [23]:
saved_rowspans

[None, None, None, None, None, None, None]

In [24]:
cells

[<th colspan="5">Total
 </th>,
 <th scope="row" style="text-align:center;"><b>259,443 / 259,487 (~100%)</b>
 </th>,
 <th scope="row" style="text-align:center;"><b>$56,476,891</b>
 </th>]

In [35]:
from wikitables import import_tables
tables = import_tables('Celine Dion Live 2018')

In [49]:
#print(tables[0].name)
Date =[]
for row in tables[0].rows:
    Date.append('{Date}'.format(**row))
print(Date)

['Asia', '26 June 2018', '29 June 2018', '30 June 2018', '3 July 2018', '4 July 2018', '7 July 2018', '11 July 2018', '13 July 2018', '14 July 2018', '19 July 2018', '20 July 2018', '23 July 2018', 'Oceania', '27 July 2018', '28 July 2018', '30 July 2018', '31 July 2018', '4 August 2018', '7 August 2018', '8 August 2018', '11 August 2018', '12 August 2018', '14 August 2018', 'Total']


In [51]:
df_1 = pd.DataFrame()
df_1['Date'] = Date
df_1

Unnamed: 0,Date
0,Asia
1,26 June 2018
2,29 June 2018
3,30 June 2018
4,3 July 2018
5,4 July 2018
6,7 July 2018
7,11 July 2018
8,13 July 2018
9,14 July 2018


In [52]:
Attendance =[]
for row in tables[0].rows:
    Date.append('{Attendance}'.format(**row))
print(Attendance)

KeyError: 'Attendance'

In [57]:
section_Lauv = wikipedia.WikipediaPage('Lauv').section('Career')
section_Lauv = section_Lauv.replace('\n','').replace("\'","")

In [58]:
section_Lauv

''