In [1]:
import pandas as pd
import numpy as np
pd.set_option('max_columns', 4, 'max_rows', 10, 'max_colwidth', 12)

### How to do it\...

In [2]:
fname = ['Paul', 'John', 'Richard', 'George']
lname = ['McCartney', 'Lennon', 'Starkey', 'Harrison']
birth = [1942, 1940, 1940, 1943]

In [3]:
people = {'first': fname, 'last': lname, 'birth': birth}

In [4]:
beatles = pd.DataFrame(people)
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


### How it works\...

In [5]:
beatles.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
pd.DataFrame(people, index=['a', 'b', 'c', 'd'])

Unnamed: 0,first,last,birth
a,Paul,McCartney,1942
b,John,Lennon,1940
c,Richard,Starkey,1940
d,George,Harrison,1943


### There\'s More

In [7]:
pd.DataFrame(
[{"first":"Paul","last":"McCartney", "birth":1942},
 {"first":"John","last":"Lennon", "birth":1940},
 {"first":"Richard","last":"Starkey", "birth":1940},
 {"first":"George","last":"Harrison", "birth":1943}])

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [8]:
[{"first":"Paul","last":"McCartney", "birth":1942},
 {"first":"John","last":"Lennon", "birth":1940},
 {"first":"Richard","last":"Starkey", "birth":1940},
 {"first":"George","last":"Harrison", "birth":1943}],
 columns=['last', 'first', 'birth'])

IndentationError: unexpected indent (<ipython-input-8-0f8e2cc98582>, line 5)

### How to do it\...

In [9]:
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [10]:
from io import StringIO
fout = StringIO()
beatles.to_csv(fout)  # use a filename instead of fout

In [11]:
print(fout.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



### There\'s More

In [12]:
_ = fout.seek(0)
pd.read_csv(fout)

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [13]:
_ = fout.seek(0)
pd.read_csv(fout, index_col=0)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [14]:
fout = StringIO()
beatles.to_csv(fout, index=False) 
print(fout.getvalue())

first,last,birth
Paul,McCartney,1942
John,Lennon,1940
Richard,Starkey,1940
George,Harrison,1943



### How to do it\...

In [15]:
diamonds = pd.read_csv('data/diamonds.csv', nrows=1000)
diamonds

Unnamed: 0,carat,cut,...,y,z
0,0.23,Ideal,...,3.98,2.43
1,0.21,Premium,...,3.84,2.31
2,0.23,Good,...,4.07,2.31
3,0.29,Premium,...,4.23,2.63
4,0.31,Good,...,4.35,2.75
...,...,...,...,...,...
995,0.54,Ideal,...,5.34,3.26
996,0.72,Ideal,...,5.74,3.57
997,0.72,Good,...,5.89,3.48
998,0.74,Premium,...,5.77,3.58


In [16]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 78.2+ KB


In [17]:
diamonds2 = pd.read_csv('data/diamonds.csv', nrows=1000,
    dtype={'carat': np.float32, 'depth': np.float32,
           'table': np.float32, 'x': np.float32,
           'y': np.float32, 'z': np.float32,
           'price': np.int16})

In [18]:
diamonds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float32
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float32
 5   table    1000 non-null   float32
 6   price    1000 non-null   int16  
 7   x        1000 non-null   float32
 8   y        1000 non-null   float32
 9   z        1000 non-null   float32
dtypes: float32(6), int16(1), object(3)
memory usage: 49.0+ KB


In [19]:
diamonds.describe()

Unnamed: 0,carat,depth,...,y,z
count,1000.0,1000.0,...,1000.0,1000.0
mean,0.68928,61.7228,...,5.59918,3.45753
std,0.195291,1.758879,...,0.611974,0.389819
min,0.2,53.0,...,3.75,2.27
25%,0.7,60.9,...,5.63,3.45
50%,0.71,61.8,...,5.76,3.55
75%,0.79,62.6,...,5.91,3.64
max,1.27,69.5,...,7.05,4.33


In [20]:
diamonds2.describe()

Unnamed: 0,carat,depth,...,y,z
count,1000.0,1000.0,...,1000.0,1000.0
mean,0.689281,61.722824,...,5.59918,3.457533
std,0.195291,1.758878,...,0.611972,0.389819
min,0.2,53.0,...,3.75,2.27
25%,0.7,60.900002,...,5.63,3.45
50%,0.71,61.799999,...,5.76,3.55
75%,0.79,62.599998,...,5.91,3.64
max,1.27,69.5,...,7.05,4.33


In [21]:
diamonds2.cut.value_counts()

Ideal        333
Premium      290
Very Good    226
Good          89
Fair          62
Name: cut, dtype: int64

In [22]:
diamonds2.color.value_counts()

E    240
F    226
G    139
D    129
H    125
I     95
J     46
Name: color, dtype: int64

In [23]:
diamonds2.clarity.value_counts()

SI1     306
VS2     218
VS1     159
SI2     154
VVS2     62
VVS1     58
I1       29
IF       14
Name: clarity, dtype: int64

In [24]:
diamonds3 = pd.read_csv('data/diamonds.csv', nrows=1000,
    dtype={'carat': np.float32, 'depth': np.float32,
           'table': np.float32, 'x': np.float32,
           'y': np.float32, 'z': np.float32,
           'price': np.int16,
           'cut': 'category', 'color': 'category',
           'clarity': 'category'})

In [25]:
diamonds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 29.4 KB


In [26]:
np.iinfo(np.int8)

iinfo(min=-128, max=127, dtype=int8)

In [27]:
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [28]:
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds4 = pd.read_csv('data/diamonds.csv', nrows=1000,
    dtype={'carat': np.float32, 'depth': np.float32,
           'table': np.float32, 'price': np.int16,
           'cut': 'category', 'color': 'category',
           'clarity': 'category'},
    usecols=cols)

In [29]:
diamonds4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
dtypes: category(3), float32(3), int16(1)
memory usage: 17.7 KB


In [30]:
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds_iter = pd.read_csv('data/diamonds.csv', nrows=1000,
    dtype={'carat': np.float32, 'depth': np.float32,
           'table': np.float32, 'price': np.int16,
           'cut': 'category', 'color': 'category',
           'clarity': 'category'},
    usecols=cols,
    chunksize=200)

In [31]:
def process(df):
    return f'processed {df.size} items'

In [32]:
for chunk in diamonds_iter:
    process(chunk)

### How it works\...

### There\'s more \...

In [33]:
diamonds.price.memory_usage()

8128

In [34]:
diamonds.price.memory_usage(index=False)

8000

In [35]:
diamonds.cut.memory_usage()

8128

In [36]:
diamonds.cut.memory_usage(deep=True)

63461

In [37]:
diamonds4.to_feather('/tmp/d.arr')
diamonds5 = pd.read_feather('/tmp/d.arr')

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [None]:
diamonds4.to_parquet('/tmp/d.pqt')

### How to do it\...

In [38]:
beatles.to_excel('/tmp/beat.xls')

In [39]:
beatles.to_excel('/tmp/beat.xlsx')

In [40]:
beat2 = pd.read_excel('/tmp/beat.xls')
beat2

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [41]:
beat2 = pd.read_excel('/tmp/beat.xls', index_col=0)
beat2

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [42]:
beat2.dtypes

first    object
last     object
birth     int64
dtype: object

### How it works\...

### There\'s more\...

In [43]:
xl_writer = pd.ExcelWriter('/tmp/beat.xlsx')
beatles.to_excel(xl_writer, sheet_name='All')
beatles[beatles.birth < 1941].to_excel(xl_writer, sheet_name='1940')
xl_writer.save()

### How to do it\...

In [44]:
autos = pd.read_csv('data/vehicles.csv.zip')
autos

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,barrels08,barrelsA08,...,phevHwy,phevComb
0,15.695714,0.0,...,0,0
1,29.964545,0.0,...,0,0
2,12.207778,0.0,...,0,0
3,29.964545,0.0,...,0,0
4,17.347895,0.0,...,0,0
...,...,...,...,...,...
39096,14.982273,0.0,...,0,0
39097,14.330870,0.0,...,0,0
39098,15.695714,0.0,...,0,0
39099,15.695714,0.0,...,0,0


In [45]:
autos.modifiedOn.dtype

dtype('O')

In [46]:
autos.modifiedOn

0        Tue Jan ...
1        Tue Jan ...
2        Tue Jan ...
3        Tue Jan ...
4        Tue Jan ...
            ...     
39096    Tue Jan ...
39097    Tue Jan ...
39098    Tue Jan ...
39099    Tue Jan ...
39100    Tue Jan ...
Name: modifiedOn, Length: 39101, dtype: object

In [47]:
pd.to_datetime(autos.modifiedOn)  # doctest: +SKIP



0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
39096   2013-01-01
39097   2013-01-01
39098   2013-01-01
39099   2013-01-01
39100   2013-01-01
Name: modifiedOn, Length: 39101, dtype: datetime64[ns]

In [48]:
autos = pd.read_csv('data/vehicles.csv.zip',
    parse_dates=['modifiedOn'])  # doctest: +SKIP
autos.modifiedOn

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
39096   2013-01-01
39097   2013-01-01
39098   2013-01-01
39099   2013-01-01
39100   2013-01-01
Name: modifiedOn, Length: 39101, dtype: datetime64[ns]

In [49]:
import zipfile

In [50]:
with zipfile.ZipFile('data/kaggle-survey-2018.zip') as z:
    print('\n'.join(z.namelist()))
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    kag_questions = kag.iloc[0]
    survey = kag.iloc[1:]

multipleChoiceResponses.csv
freeFormResponses.csv
SurveySchema.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [51]:
print(survey.head(2).T)

                        1          2
Time from...          710        434
Q1                 Female       Male
Q1_OTHER_...           -1         -1
Q2                  45-49      30-34
Q3            United S...  Indonesia
...                   ...        ...
Q50_Part_5            NaN        NaN
Q50_Part_6            NaN        NaN
Q50_Part_7            NaN        NaN
Q50_Part_8            NaN        NaN
Q50_OTHER...           -1         -1

[395 rows x 2 columns]


### How it works\...

### There\'s more\...

### How to do it\...

In [52]:
import sqlite3
con = sqlite3.connect('data/beat.db')
with con:
    cur = con.cursor()
    cur.execute("""DROP TABLE Band""")
    cur.execute("""CREATE TABLE Band(id INTEGER PRIMARY KEY,
        fname TEXT, lname TEXT, birthyear INT)""")
    cur.execute("""INSERT INTO Band VALUES(
        0, 'Paul', 'McCartney', 1942)""")
    cur.execute("""INSERT INTO Band VALUES(
        1, 'John', 'Lennon', 1940)""")
    _ = con.commit()

In [53]:
import sqlalchemy as sa
engine = sa.create_engine(
  'sqlite:///data/beat.db', echo=True)
sa_connection = engine.connect()

2021-04-19 18:56:33,980 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-04-19 18:56:33,981 INFO sqlalchemy.engine.base.Engine ()
2021-04-19 18:56:33,982 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-04-19 18:56:33,983 INFO sqlalchemy.engine.base.Engine ()


In [54]:
beat = pd.read_sql('Band', sa_connection, index_col='id')
beat

2021-04-19 18:56:34,244 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("Band")
2021-04-19 18:56:34,245 INFO sqlalchemy.engine.base.Engine ()
2021-04-19 18:56:34,247 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2021-04-19 18:56:34,248 INFO sqlalchemy.engine.base.Engine ()
2021-04-19 18:56:34,251 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("Band")
2021-04-19 18:56:34,252 INFO sqlalchemy.engine.base.Engine ()
2021-04-19 18:56:34,253 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2021-04-19 18:56:34,254 INFO sqlalchemy.engine.base.Engine ('Band',)
2021-04-19 18:56:34,257 INFO sqlalchemy.engine.base.Engine PRAGMA main.foreign_key_list("Band")
2021-04-19 18:56:34,258 INFO sqlalchemy.engine.base.Engine ()
2021-04-19 18:56:34,259 INFO sqlalchemy.engine.base.Engine PRAGMA temp.foreign_key_list("Band")
20

Unnamed: 0_level_0,fname,lname,birthyear
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Paul,McCartney,1942
1,John,Lennon,1940


In [55]:
sql = '''SELECT fname, birthyear from Band'''
fnames = pd.read_sql(sql, con)
fnames

Unnamed: 0,fname,birthyear
0,Paul,1942
1,John,1940


### How it work\'s\...

In [56]:
import json
encoded = json.dumps(people)
encoded

'{"first": ["Paul", "John", "Richard", "George"], "last": ["McCartney", "Lennon", "Starkey", "Harrison"], "birth": [1942, 1940, 1940, 1943]}'

In [57]:
json.loads(encoded)

{'first': ['Paul', 'John', 'Richard', 'George'],
 'last': ['McCartney', 'Lennon', 'Starkey', 'Harrison'],
 'birth': [1942, 1940, 1940, 1943]}

### How to do it\...

In [58]:
beatles = pd.read_json(encoded)
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [59]:
records = beatles.to_json(orient='records')
records

'[{"first":"Paul","last":"McCartney","birth":1942},{"first":"John","last":"Lennon","birth":1940},{"first":"Richard","last":"Starkey","birth":1940},{"first":"George","last":"Harrison","birth":1943}]'

In [60]:
pd.read_json(records, orient='records')

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [61]:
split = beatles.to_json(orient='split')
split

'{"columns":["first","last","birth"],"index":[0,1,2,3],"data":[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]}'

In [62]:
pd.read_json(split, orient='split')

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [63]:
index = beatles.to_json(orient='index')
index

'{"0":{"first":"Paul","last":"McCartney","birth":1942},"1":{"first":"John","last":"Lennon","birth":1940},"2":{"first":"Richard","last":"Starkey","birth":1940},"3":{"first":"George","last":"Harrison","birth":1943}}'

In [64]:
pd.read_json(index, orient='index')

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [65]:
values = beatles.to_json(orient='values')
values

'[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]'

In [66]:
pd.read_json(values, orient='values')

Unnamed: 0,0,1,2
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [67]:
(pd.read_json(values, orient='values')
   .rename(columns=dict(enumerate(['first', 'last', 'birth'])))
)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [68]:
table = beatles.to_json(orient='table')
table

'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"first","type":"string"},{"name":"last","type":"string"},{"name":"birth","type":"integer"}],"primaryKey":["index"],"pandas_version":"0.20.0"},"data":[{"index":0,"first":"Paul","last":"McCartney","birth":1942},{"index":1,"first":"John","last":"Lennon","birth":1940},{"index":2,"first":"Richard","last":"Starkey","birth":1940},{"index":3,"first":"George","last":"Harrison","birth":1943}]}'

In [69]:
pd.read_json(table, orient='table')

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


### How it works\...

### There\'s more\...

In [70]:
output = beat.to_dict()
output

{'fname': {0: 'Paul', 1: 'John'},
 'lname': {0: 'McCartney', 1: 'Lennon'},
 'birthyear': {0: 1942, 1: 1940}}

In [71]:
output['version'] = '0.4.1'
json.dumps(output)

'{"fname": {"0": "Paul", "1": "John"}, "lname": {"0": "McCartney", "1": "Lennon"}, "birthyear": {"0": 1942, "1": 1940}, "version": "0.4.1"}'

### How to do it\...

In [72]:
url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url)
len(dfs)

54

In [73]:
dfs[0]

Unnamed: 0,The Beatles discography,The Beatles discography.1
0,The Beat...,The Beat...
1,Studio a...,23
2,Live albums,5
3,Compilat...,54
4,Video al...,22
5,Music vi...,68
6,EPs,36
7,Singles,63
8,Mash-ups,2
9,Box sets,17


In [74]:
url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url, match='List of studio albums', na_values='—')
len(dfs)

1

In [75]:
dfs[0].columns

MultiIndex([(               'Title',          'Title'),
            (       'Album details',  'Album details'),
            ('Peak chart positions',       'UK[1][2]'),
            ('Peak chart positions',         'AUS[3]'),
            ('Peak chart positions',         'CAN[4]'),
            ('Peak chart positions',         'FRA[5]'),
            ('Peak chart positions',         'GER[6]'),
            ('Peak chart positions',         'NOR[7]'),
            ('Peak chart positions',       'US[8][9]'),
            (      'Certifications', 'Certifications')],
           )

In [76]:
url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url, match='List of studio albums', na_values='—',
    header=[0,1])
len(dfs)

1

In [77]:
dfs[0]

Unnamed: 0_level_0,Title,Album details,...,Peak chart positions,Certifications
Unnamed: 0_level_1,Title,Album details,...,US[8][9],Certifications
0,Please P...,Released...,...,,BPI: Pla...
1,With the...,Released...,...,,BPI: Gol...
2,Introduc...,Released...,...,2,RIAA: Pl...
3,Meet the...,Released...,...,1,MC: Plat...
4,Twist an...,Released...,...,,MC: 3× P...
...,...,...,...,...,...
22,The Beat...,Released...,...,1,BPI: 2× ...
23,Yellow S...,Released...,...,2,BPI: Gol...
24,Abbey Road,Released...,...,1,BPI: 3× ...
25,Let It Be,Released...,...,1,BPI: Pla...


In [78]:
dfs[0].columns

MultiIndex([(               'Title',          'Title'),
            (       'Album details',  'Album details'),
            ('Peak chart positions',       'UK[1][2]'),
            ('Peak chart positions',         'AUS[3]'),
            ('Peak chart positions',         'CAN[4]'),
            ('Peak chart positions',         'FRA[5]'),
            ('Peak chart positions',         'GER[6]'),
            ('Peak chart positions',         'NOR[7]'),
            ('Peak chart positions',       'US[8][9]'),
            (      'Certifications', 'Certifications')],
           )

In [79]:
df = dfs[0]
df.columns = ['Title', 'Release', 'UK', 'AUS', 'CAN', 'FRA', 'GER',
    'NOR', 'US', 'Certifications']
df

Unnamed: 0,Title,Release,...,US,Certifications
0,Please P...,Released...,...,,BPI: Pla...
1,With the...,Released...,...,,BPI: Gol...
2,Introduc...,Released...,...,2,RIAA: Pl...
3,Meet the...,Released...,...,1,MC: Plat...
4,Twist an...,Released...,...,,MC: 3× P...
...,...,...,...,...,...
22,The Beat...,Released...,...,1,BPI: 2× ...
23,Yellow S...,Released...,...,2,BPI: Gol...
24,Abbey Road,Released...,...,1,BPI: 3× ...
25,Let It Be,Released...,...,1,BPI: Pla...


In [80]:
res = (df
  .pipe(lambda df_: df_[~df_.Title.str.startswith('Released')])
  .iloc[:-1]
  .assign(release_date=lambda df_: pd.to_datetime(
             df_.Release.str.extract(r'Released: (.*) Label')
               [0]
               .str.replace(r'\[E\]', '')
          ),
          label=lambda df_:df_.Release.str.extract(r'Label: (.*)')
         )
   .loc[:, ['Title', 'UK', 'AUS', 'CAN', 'FRA', 'GER', 'NOR',
            'US', 'release_date', 'label']]
)
res

Unnamed: 0,Title,UK,...,release_date,label
0,Please P...,1,...,1963-03-22,Parlopho...
1,With the...,1,...,1963-11-22,Parlopho...
2,Introduc...,,...,1964-01-10,Vee-Jay ...
3,Meet the...,,...,1964-01-20,Capitol ...
4,Twist an...,,...,1964-02-03,Capitol ...
...,...,...,...,...,...
21,Magical ...,31,...,1967-11-27,Parlopho...
22,The Beat...,1,...,1968-11-22,Apple
23,Yellow S...,3,...,1969-01-13,Apple (U...
24,Abbey Road,1,...,1969-09-26,Apple


### How it works\...

### There is more\...

In [81]:
url = 'https://github.com/mattharrison/datasets/blob/master/data/anscombes.csv'
dfs = pd.read_html(url, attrs={'class': 'csv-data'})
len(dfs)

1

In [82]:
dfs[0]

Unnamed: 0.1,Unnamed: 0,quadrant,x,y
0,,I,10.0,8.04
1,,I,14.0,9.96
2,,I,6.0,7.24
3,,I,9.0,8.81
4,,I,4.0,4.26
...,...,...,...,...
39,,IV,8.0,6.58
40,,IV,8.0,7.91
41,,IV,8.0,8.47
42,,IV,8.0,5.25
