In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Source page: usa.com ranking of California cities by median age
url = (
    'http://www.usa.com/rank/california-state--median-age--city-rank.htm'
)

In [3]:
# pd.read_html downloads the page and returns a list of DataFrames,
# one for each table it manages to parse
tables = pd.read_html(io=url)
tables

[          0             1                                           2
 0      Rank  Median Age ▲                           City / Population
 1        1.         19.10                    Rodriguez Camp, CA / 156
 2        2.         19.60  University Of California Davis, CA / 5,786
 3        3.         20.70                     Isla Vista, CA / 23,096
 4        4.         21.30            Camp Pendleton North, CA / 5,200
 5        5.         21.70           Camp Pendleton South, CA / 10,616
 6        6.         21.80                         Alpaugh, CA / 1,026
 7        7.         21.90                      Linnell Camp, CA / 849
 8        8.         22.10                           Oasis, CA / 6,890
 9        9.         22.40                        Tooleville, CA / 339
 10      10.         22.50                      Vandenberg, CA / 3,338
 11      10.         22.50                 Lemoore Station, CA / 7,438
 12      12.         22.60                      Lost Hills, CA / 2,412
 13   

In [4]:
# Create the working DataFrame from the first scraped table.
# A copy is taken so the raw `tables[0]` is never mutated: later cells add
# columns to `ages_df`, and without the copy re-running this cell would fail
# when assigning three labels to the widened frame.
ages_df = tables[0].copy()
# The scraped frame has integer column labels (0, 1, 2); give them readable names.
# The page's own header is still present as data row 0 at this point.
ages_df.columns = ['Rank', 'Median Age', 'City/Population']
ages_df.head()

Unnamed: 0,Rank,Median Age,City/Population
0,Rank,Median Age ▲,City / Population
1,1.,19.10,"Rodriguez Camp, CA / 156"
2,2.,19.60,"University Of California Davis, CA / 5,786"
3,3.,20.70,"Isla Vista, CA / 23,096"
4,4.,21.30,"Camp Pendleton North, CA / 5,200"


In [5]:
# Split the 'City/Population' column to isolate the city.
# - rsplit with n=1 splits only on the LAST '/', so the result always has at
#   most two parts even if a city name itself contains a slash.
# - The source text is "City, CA / 1,234", so each part is stripped to remove
#   the spaces that surround the separator.
city_population = ages_df['City/Population'].str.rsplit('/', n=1, expand=True)
ages_df['City'] = city_population[0].str.strip()
ages_df['Population'] = city_population[1].str.strip()
ages_df.head()

Unnamed: 0,Rank,Median Age,City/Population,City,Population
0,Rank,Median Age ▲,City / Population,City,Population
1,1.,19.10,"Rodriguez Camp, CA / 156","Rodriguez Camp, CA",156
2,2.,19.60,"University Of California Davis, CA / 5,786","University Of California Davis, CA",5786
3,3.,20.70,"Isla Vista, CA / 23,096","Isla Vista, CA",23096
4,4.,21.30,"Camp Pendleton North, CA / 5,200","Camp Pendleton North, CA",5200


In [6]:
# Drop the columns that are not needed further on, using the `columns`
# parameter of drop: the rank, the combined source column and the population
unwanted_columns = ["Rank", "City/Population", "Population"]
clean_ages_df = ages_df.drop(columns=unwanted_columns)
clean_ages_df.head()

Unnamed: 0,Median Age,City
0,Median Age ▲,City
1,19.10,"Rodriguez Camp, CA"
2,19.60,"University Of California Davis, CA"
3,20.70,"Isla Vista, CA"
4,21.30,"Camp Pendleton North, CA"


In [7]:
# Remove the ', CA' state suffix from the end of each city name.
# - The pattern is anchored with `$` so only a trailing suffix is removed,
#   never a ', CA' that happens to appear inside a name.
# - `\s*` tolerates whitespace left around the suffix by the '/' split.
# - regex=True is explicit because the default of Series.str.replace
#   changed in pandas 2.0.
clean_ages_df['City'] = (
    clean_ages_df['City']
    .str.replace(r',\s*CA\s*$', '', regex=True)
    .str.strip()
)
clean_ages_df.head()

Unnamed: 0,Median Age,City
0,Median Age ▲,City
1,19.10,Rodriguez Camp
2,19.60,University Of California Davis
3,20.70,Isla Vista
4,21.30,Camp Pendleton North


In [8]:
# The first row still holds the page's own column titles (scraped as data);
# look up its index label and drop that row
header_row_label = clean_ages_df.index[0]
clean_ages_final_df = clean_ages_df.drop(index=header_row_label)
clean_ages_final_df.head()

Unnamed: 0,Median Age,City
1,19.1,Rodriguez Camp
2,19.6,University Of California Davis
3,20.7,Isla Vista
4,21.3,Camp Pendleton North
5,21.7,Camp Pendleton South


In [9]:
# Put the city name first, followed by its median age
column_order = ['City', 'Median Age']
clean_ages_final_df = clean_ages_final_df.loc[:, column_order]
clean_ages_final_df.head()

Unnamed: 0,City,Median Age
1,Rodriguez Camp,19.1
2,University Of California Davis,19.6
3,Isla Vista,20.7
4,Camp Pendleton North,21.3
5,Camp Pendleton South,21.7


In [10]:
# Set the index to the 'City' column.
# set_index moves 'City' out of the columns, so running it a second time would
# raise KeyError. The guard makes the cell safe to re-run, and the non-inplace
# form avoids mutating a frame that an earlier cell already displayed.
if clean_ages_final_df.index.name != 'City':
    clean_ages_final_df = clean_ages_final_df.set_index('City')
clean_ages_final_df.head()

Unnamed: 0_level_0,Median Age
City,Unnamed: 1_level_1
Rodriguez Camp,19.1
University Of California Davis,19.6
Isla Vista,20.7
Camp Pendleton North,21.3
Camp Pendleton South,21.7


In [11]:
# Save the table directly to an html file.
# The output folder is created first so the export does not fail when
# 'Resources' does not exist yet (e.g. on a fresh checkout).
Path('Resources').mkdir(parents=True, exist_ok=True)
clean_ages_final_df.to_html('Resources/clean_age_table.html')

In [None]:
# Export dataframe to CSV file (the 'City' index is written as the first column).
# The output folder is created first so the export does not fail when
# 'Resources' does not exist yet (e.g. on a fresh checkout).
Path('Resources').mkdir(parents=True, exist_ok=True)
clean_ages_final_df.to_csv('Resources/clean_age.csv')