# R DATASETS

In [1]:
import pandas 
import requests
import bs4

In [2]:
# Root of the Rdatasets site; the CSV/Doc hrefs scraped below are relative paths.
base_url = 'https://vincentarelbundock.github.io/Rdatasets/'
# Index page that holds the table of all available datasets.
datasets_url = base_url + 'datasets.html'
# Local folder name kept for later use (not referenced in the cells shown here).
folder_path = './RDATASETS/'

In [3]:
# Download the dataset index page.
# - timeout: requests waits forever by default, so a stalled connection would
#   otherwise hang this cell indefinitely.
# - raise_for_status(): surface HTTP errors (4xx/5xx) here instead of as a
#   confusing parsing failure in a later cell.
r = requests.get(datasets_url, timeout=30)
r.raise_for_status()

In [4]:
# Parse the downloaded HTML. The parser is named explicitly: without it
# BeautifulSoup picks whichever parser happens to be installed (and emits a
# warning), so the resulting tree could differ from one machine to another.
soup = bs4.BeautifulSoup(r.content, 'html.parser')

In [5]:
# Grab the first <table> element of the page. find() returns None when there
# is no match, so fail here with a clear message rather than with an
# AttributeError in a later cell.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

### Get Table Headers 

In [6]:
# Collect every header cell (<th>) found inside the table.
ths = table.find_all(name='th')

In [7]:
# Column names: the text of each header cell with surrounding whitespace removed.
headers = [header_cell.get_text().strip() for header_cell in ths]

In [8]:
# Display the parsed column names as a quick sanity check.
headers

['Package',
 'Item',
 'Title',
 'Rows',
 'Cols',
 'has_logical',
 'has_binary',
 'has_numeric',
 'has_character',
 'CSV',
 'Doc']

### Store Rows

In [9]:
# Collect every row (<tr>) of the table, data and non-data rows alike.
trs = table.find_all(name='tr')

In [10]:
# Number of <tr> elements found in the table (before any filtering).
len(trs)

1245

In [11]:
# Inspect one row to see its markup: plain-text cells plus <a href=...> links.
# NOTE(review): index 2 appears to be the first data row — confirm if the page layout changes.
trs[2]

<tr>
<td class="cellinside">boot
</td>
<td class="cellinside">acme
</td>
<td class="cellinside">Monthly Excess Returns
</td>
<td class="cellinside">    60
</td>
<td class="cellinside">   3
</td>
<td class="cellinside">FALSE
</td>
<td class="cellinside">FALSE
</td>
<td class="cellinside"> TRUE
</td>
<td class="cellinside"> TRUE
</td>
<td class="cellinside"><a href="csv/boot/acme.csv"> CSV </a>
</td>
<td class="cellinside"><a href="doc/boot/acme.html"> DOC </a>
</td></tr>

### Create Empty Frame

In [12]:
# Start from an empty DataFrame; the table-processing cell below fills it in.
frame = pandas.DataFrame()

### Keep only the rows that have the expected number of child nodes

In [22]:
# len() of a bs4 tag counts its direct children, whitespace text nodes included.
# NOTE(review): 22 presumably corresponds to 11 <td> cells interleaved with 11
# newline strings, as in the sample row printed above — confirm if the page changes.
DATA_ROW_CHILD_COUNT = 22
trs = [candidate for candidate in trs if len(candidate) == DATA_ROW_CHILD_COUNT]

### Process table data

In [34]:
# Build one plain dict per table row and create the DataFrame in a single call.
# Growing a frame one cell at a time through frame.loc is slow, and re-running
# the cell would write on top of whatever the frame already contained.
FIRST_LINK_COLUMN = 9  # cells from this position onward hold an <a href=...> (CSV, Doc)

records = []
for row in trs:
    record = {}
    # zip() pairs each cell with its column name and ignores surplus cells.
    for index, (column_header, item) in enumerate(zip(headers, row.find_all('td'))):
        if index >= FIRST_LINK_COLUMN:
            # Link column: keep the relative href; None when the anchor is missing
            # (instead of failing with AttributeError on item.find('a') being None).
            anchor = item.find('a')
            record[column_header] = anchor.get('href') if anchor is not None else None
        else:
            # Plain column: keep the cell text without surrounding whitespace.
            record[column_header] = item.text.strip()
    records.append(record)

# columns=headers fixes the column order and keeps the columns even if no row
# was parsed; the default integer index matches the former row_index labels.
frame = pandas.DataFrame(records, columns=headers)

In [43]:
# (rows, columns) of the scraped table; the column count should equal len(headers).
frame.shape

(1243, 11)

In [46]:
# Rows and Cols were scraped as text; convert both count columns to integers.
for count_column in ('Rows', 'Cols'):
    frame[count_column] = frame[count_column].astype(int)

In [47]:
# Preview the first rows only rather than dumping the whole frame into the output.
frame.head(10)

Unnamed: 0,Package,Item,Title,Rows,Cols,has_logical,has_binary,has_numeric,has_character,CSV,Doc
0,boot,acme,Monthly Excess Returns,60,3,FALSE,FALSE,TRUE,TRUE,csv/boot/acme.csv,doc/boot/acme.html
1,boot,aids,Delay in AIDS Reporting in England and Wales,570,6,FALSE,TRUE,TRUE,FALSE,csv/boot/aids.csv,doc/boot/aids.html
2,boot,aircondit,Failures of Air-conditioning Equipment,12,1,FALSE,FALSE,TRUE,FALSE,csv/boot/aircondit.csv,doc/boot/aircondit.html
3,boot,aircondit7,Failures of Air-conditioning Equipment,24,1,FALSE,FALSE,TRUE,FALSE,csv/boot/aircondit7.csv,doc/boot/aircondit7.html
4,boot,amis,Car Speeding and Warning Signs,8437,4,FALSE,TRUE,TRUE,FALSE,csv/boot/amis.csv,doc/boot/amis.html
5,boot,aml,Remission Times for Acute Myelogenous Leukaemia,23,3,FALSE,TRUE,TRUE,FALSE,csv/boot/aml.csv,doc/boot/aml.html
6,boot,beaver,Beaver Body Temperature Data,100,4,FALSE,TRUE,TRUE,FALSE,csv/boot/beaver.csv,doc/boot/beaver.html
7,boot,bigcity,Population of U.S. Cities,49,2,FALSE,FALSE,TRUE,FALSE,csv/boot/bigcity.csv,doc/boot/bigcity.html
8,boot,brambles,Spatial Location of Bramble Canes,823,3,FALSE,FALSE,TRUE,FALSE,csv/boot/brambles.csv,doc/boot/brambles.html
9,boot,breslow,Smoking Deaths Among Doctors,10,5,FALSE,TRUE,TRUE,FALSE,csv/boot/breslow.csv,doc/boot/breslow.html


### Create unique identifier

In [None]:
# Combine package and item name into a single "<Package>_<Item>" key.
# NOTE(review): uniqueness of this key is assumed, not checked here.
frame['Unique_Name'] = frame['Package'].str.cat(frame['Item'], sep='_')

### Export 

In [62]:
# Write the table to an Excel file in the working directory, keeping the
# integer index as a first column labelled 'row_index'.
frame.to_excel(
    './table.xlsx',
    index=True,
    index_label='row_index',
)