# Web Scraping - Wikipedia
### **Overview**: Mengambil data dari tabel **Daftar kota di Indonesia Berdasarkan Kepadatan Penduduk** yang terdapat di halaman Wikipedia.



### 1. Import libraries/packages

In [181]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Define URL of the website to *scrape*

In [182]:
# Indonesian-language Wikipedia article whose table is scraped below.
url = 'https://id.wikipedia.org/wiki/Daftar_kota_di_Indonesia_menurut_kepadatan_penduduk'

### 2. Send an HTTP GET request to the website

In [183]:
# Download the article. The timeout keeps the cell from hanging indefinitely
# if the server never answers.
page = requests.get(url, timeout=30)

# Stop here on a 4xx/5xx response instead of parsing an error page, which
# would only surface later as a confusing IndexError when picking the table.
page.raise_for_status()

# Parse the HTML into a navigable tree.
# NOTE(review): 'html' asks BeautifulSoup to choose whichever HTML parser is
# installed; pin 'html.parser' or 'lxml' if identical parsing across
# environments matters.
soup = BeautifulSoup(page.text, 'html')

### 3. Find the required data

In [184]:
# The table we want to scrape is the second <table> element on the page (index 1).
# NOTE(review): this positional index depends on the current page layout and
# will select the wrong table if Wikipedia adds or removes one before it.

table = soup.find_all('table')[1]

In [185]:
# The column titles of the table live inside <th>...</th> tags,
# so collect every <th> element and display them for inspection.

indonesia_title = table.find_all('th')
indonesia_title

[<th>No.
 </th>,
 <th>Kota
 </th>,
 <th>Provinsi
 </th>,
 <th>Luas wilayah<br/>(Km<sup>2</sup>)
 </th>,
 <th class="unsortable">Ref.
 </th>,
 <th>Jumlah<br/>penduduk<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>
 </th>,
 <th>Kepadatan<br/>per km<sup>2</sup>
 </th>]

In [186]:
# Reduce each <th> element to its visible text, trimmed of surrounding
# whitespace; these strings become the DataFrame column labels.
indonesia_title_table = [
    header_cell.get_text().strip()
    for header_cell in indonesia_title
]
print(indonesia_title_table)

['No.', 'Kota', 'Provinsi', 'Luas wilayah(Km2)', 'Ref.', 'Jumlahpenduduk[1]', 'Kepadatanper km2']


In [187]:
# Create an empty DataFrame whose column labels are the scraped header texts.
# `pd` is imported in the notebook's import cell together with the other libraries.
df = pd.DataFrame(columns=indonesia_title_table)
df

Unnamed: 0,No.,Kota,Provinsi,Luas wilayah(Km2),Ref.,Jumlahpenduduk[1],Kepadatanper km2


In [188]:
# Despite the name, this holds every <tr> (row) element of the table;
# the first entry is skipped when the rows are read in the next cell.
column_data = table.find_all('tr')

In [189]:
# Read the text of every <td> cell in each table row (index 0 is skipped as
# the header row) and build the DataFrame from the collected rows in one step.
#
# Rows are gathered in a plain list first: appending with df.loc[len(df)]
# re-allocates the frame on every insert and, if this cell is run twice,
# appends a second copy of every row. Rebuilding df from the list makes the
# cell give the same result no matter how often it is executed.
scraped_rows = []
for row in column_data[1:]:
  row_data = row.find_all('td')
  if not row_data:
    # A row without any <td> (e.g. a repeated header row) carries no data.
    continue

  individual_row_data = [data.text.strip() for data in row_data]
  if len(individual_row_data) != len(indonesia_title_table):
    # Fail loudly rather than let pandas pad a short row with missing values.
    raise ValueError(
      f"Expected {len(indonesia_title_table)} cells per row, "
      f"got {len(individual_row_data)}: {individual_row_data}"
    )
  scraped_rows.append(individual_row_data)

df = pd.DataFrame(scraped_rows, columns=indonesia_title_table)

In [190]:
# Display the populated frame to check the scraped rows.
df

Unnamed: 0,No.,Kota,Provinsi,Luas wilayah(Km2),Ref.,Jumlahpenduduk[1],Kepadatanper km2
0,1,Jakarta Pusat,Daerah Khusus Ibukota Jakarta,4790,[2],889.448,18.569
1,2,Jakarta Barat,Daerah Khusus Ibukota Jakarta,12615,[2],2.093.013,16.591
2,3,Jakarta Selatan,Daerah Khusus Ibukota Jakarta,14573,[2],2.001.353,13.733
3,4,Bandung,Jawa Barat,16730,[3],2.288.570,13.679
4,5,Cimahi,Jawa Barat,4036,[4],546.879,13.549
...,...,...,...,...,...,...,...
93,94,Sorong,Papua Barat Daya,"1.105,00",[67],146.390,132
94,95,Dumai,Riau,"2.039,00",[48],218.643,107
95,96,Palangka Raya,Kalimantan Tengah,"2.400,00",[68],170.761,71
96,97,Subulussalam,Nanggroe Aceh Darussalam,"1.391,00",,78.801,56


In [191]:
# Rename several columns to identifier-style names and remove the footnote
# reference column.
#
# The result is assigned back instead of using inplace=True, and the drop
# tolerates an already-missing 'Ref.' column, so re-running this cell is a
# no-op instead of a KeyError.
df = (
  df
  .rename(columns={
    'Luas wilayah(Km2)': 'Luas_Wilayah',
    'Jumlahpenduduk[1]': 'Jumlah_Penduduk',
    'Kepadatanper km2': 'Kepadatan_Penduduk',
  })
  .drop(columns=['Ref.'], errors='ignore')
)

In [192]:
# Preview the first rows to confirm the new column names and the removed 'Ref.' column.
df.head()

Unnamed: 0,No.,Kota,Provinsi,Luas_Wilayah,Jumlah_Penduduk,Kepadatan_Penduduk
0,1,Jakarta Pusat,Daerah Khusus Ibukota Jakarta,4790,889.448,18.569
1,2,Jakarta Barat,Daerah Khusus Ibukota Jakarta,12615,2.093.013,16.591
2,3,Jakarta Selatan,Daerah Khusus Ibukota Jakarta,14573,2.001.353,13.733
3,4,Bandung,Jawa Barat,16730,2.288.570,13.679
4,5,Cimahi,Jawa Barat,4036,546.879,13.549


### 4. Save the data to a .csv file

In [194]:
# Write the table to a CSV file in the working directory; index=False leaves
# the DataFrame's row index out of the file.
csv_filename = 'data_kepadatan_penduduk_indonesia.csv'
df.to_csv(csv_filename, index=False)

# On Google Colab, additionally trigger a browser download of the file.
# The import is guarded so the cell still succeeds (the CSV is already on
# disk) when the notebook runs in an environment without google.colab.
try:
  from google.colab import files
except ImportError:
  print(f"Not running on Google Colab; file saved locally as {csv_filename}")
else:
  files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>