# Scrape a table from a web page with Python

In [1]:
import pandas as pd

### Scraping Table Data From Websites

In [2]:
# Wikipedia page "List of postal codes of Canada: M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list of DataFrames parsed from the page's tables;
# keep the whole list in `tables` and work with the first one.
tables = pd.read_html(io=URL)
df = tables[0]

### The dataframe will consist of three columns: PostalCode, Borough, and Neighbourhood

In [3]:
# Normalise the column labels in place. Only 'Postal Code' actually changes
# (the space is removed); the other two entries map each name to itself.
df.rename(
    columns={
        'Postal Code': 'PostalCode',
        'Borough': 'Borough',
        'Neighbourhood': 'Neighbourhood',
    },
    inplace=True,
)

### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [4]:
# Keep only rows whose borough is assigned, then renumber the rows 0..n-1
# (drop=True discards the old index instead of keeping it as a column).
df = df.loc[df["Borough"] != "Not assigned"].reset_index(drop=True)

### The rows with the same postal code area will be combined into one row with the neighborhoods separated with a comma 

In [5]:
# Collapse rows that share a postal code into one row per code.
#   - Neighbourhood: names are joined with ", " as the heading above requires.
#     (.sum() concatenates strings with no separator, e.g. "MalvernRouge".)
#   - Borough: the first value in each group is kept instead of being
#     concatenated with itself; assumes a postal code maps to a single
#     borough -- TODO confirm against the source table.
# NOTE: the grouped frame is only displayed here; `df` itself is not reassigned.
df.groupby('PostalCode').agg({'Borough': 'first', 'Neighbourhood': ', '.join})

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


### If a cell has a borough but a "Not assigned" neighbourhood, then the neighbourhood will be the same as the borough

In [6]:
# Rows whose neighbourhood is the placeholder "Not assigned".
unassigned = df["Neighbourhood"] == "Not assigned"

# Replace the placeholder with the row's borough. The previous form,
# `df[mask].Neighbourhood = ...`, assigned to a temporary filtered copy and
# therefore never changed `df`. Building the new column with Series.mask and
# assigning the result back to `df` makes the replacement take effect.
df = df.assign(Neighbourhood=df["Neighbourhood"].mask(unassigned, df["Borough"]))
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Use the `.shape` attribute to show the number of rows (and columns) of the dataframe

In [7]:
# .shape is a (rows, columns) tuple; the first element is the row count.
df.shape

(103, 3)