The goal of this notebook is to scrape about 500 wikipedia pages with different subcategory labels. 

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import re
import pandas as pd
import time

In [2]:
master_url = "https://en.wikipedia.org/wiki/Category:Physics"

In [3]:
# a function to get the links for all the subcategories.
def wiki_subcategory_links(url):
    # Read the page into python with urlopen. Then close the connection afterwards.
    wiki_master = urlopen(url).read()
    # parse the file as html.
    wiki_master_parsed = soup(wiki_master,"html.parser")
    # find a div with class label CategoryTreeLabel
    wiki_subcategories=wiki_master_parsed.findAll("a",{"class":"CategoryTreeLabel"})
    
    # get the ref part 
    subcategory_links = dict()
    for subcategory in wiki_subcategories:
        # use the text as the key
        key = subcategory.text
        # get the sublink and add the full https link.
        value = "https://en.wikipedia.org"+str(subcategory['href'])
        
        subcategory_links[key] = value

    return subcategory_links

In [4]:
phys_subcategory_links = wiki_subcategory_links(master_url)

In [5]:
def scrape_method_1(html):
    # initialize the dictionary
    all_article_links = dict() 
    # find all the div labels with class mw-category-group
    wiki_articles = html.findAll('div',{"id":"mw-pages"})[0].findAll('a')
    
    for each_article in wiki_articles:
        # use the title as dictionary key
        key = each_article['title']
        # use the full link as dictionary value.
        value = "https://en.wikipedia.org"+str(each_article['href'])
        # assign the key value pair
        all_article_links[key] = value
                    
    return all_article_links  


# a function to get the links of all articles from one subcategory
def get_wiki_article_links(url):
    # Read the page into python with urlopen. Then close the connection afterwards.
    wiki_master = urlopen(url).read()
    # parse the file as html.
    wiki_master_parsed = soup(wiki_master,"html.parser") 
    article_links = dict()
    try:
        # the first structure to get the links and names
        article_links = scrape_method_1(wiki_master_parsed)
    except:
        print("The article pages are empty.")
    return article_links

In [6]:
all_wiki_articles = get_wiki_article_links(phys_subcategory_links['Concepts in physics'])

In [7]:
# convert the dictionaries for each subcategory into a dataframe with an index for subdirectory.
import pandas as pd

wiki_pd = list()

for article_name in phys_subcategory_links:
    
    all_article_in_this_subcategory = get_wiki_article_links(phys_subcategory_links[article_name])
    
    all_article_in_this_subcategory_df = pd.DataFrame.from_dict(all_article_in_this_subcategory,orient="index")
    
    all_article_in_this_subcategory_df["Subcategory"] = article_name
    
    wiki_pd.append(all_article_in_this_subcategory_df)


The article pages are empty.
The article pages are empty.


In [17]:
wiki_physics = pd.concat(wiki_pd,sort = False).reset_index()

In [19]:
wiki_physics.columns = ["Name",'Link',"Subcategory"]

In [21]:
wiki_physics.head()

Unnamed: 0,Name,Link,Subcategory
0,Wikipedia:FAQ/Categorization,https://en.wikipedia.org/wiki/Wikipedia:FAQ/Ca...,Concepts in physics
1,Category:Concepts in physics,https://en.wikipedia.org/w/index.php?title=Cat...,Concepts in physics
2,4D vector,https://en.wikipedia.org/wiki/4D_vector,Concepts in physics
3,Active and passive transformation,https://en.wikipedia.org/wiki/Active_and_passi...,Concepts in physics
4,Ansatz,https://en.wikipedia.org/wiki/Ansatz,Concepts in physics


In [20]:
wiki_physics['Subcategory'].value_counts()

Physics stubs                     202
Physical quantities               202
Concepts in physics               202
Physicists                        156
History of physics                136
Physics awards                     98
Gravitation                        95
Unsolved problems in physics       54
Physics-related lists              53
Equations of physics               50
Physical phenomena                 48
Physics organizations              42
Fringe physics                     40
Wikipedia books on physics         33
Physics in fiction                 31
Subfields of physics               24
Physical systems                   22
Time in physics                    20
Limits of computation              13
Physics events                     10
Interaction                         9
Physics literature                  8
Physical properties                 8
Physics websites                    7
Physics templates                   5
Standardized tests for Physics      2
Name: Subcat

In [22]:
wiki_physics.to_csv("wiki_phys_scrape.csv")