# DataFrame and Series

In [1]:
import math
import collections

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as pp

%matplotlib inline

In [2]:
# Cap the number of rows pandas prints before truncating a display.
pd.set_option('display.max_rows', 20)

In [3]:
# Column labels are supplied explicitly, so every line of the file is read as data.
nobels = pd.read_csv(
    'nobels.csv',
    header=None,
    names=['year', 'discipline', 'nobelist'],
)

In [4]:
# Print a summary of the frame: index range, columns, non-null counts, dtypes, memory usage.
nobels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   year        950 non-null    int64 
 1   discipline  950 non-null    object
 2   nobelist    950 non-null    object
dtypes: int64(1), object(2)
memory usage: 22.4+ KB


In [5]:
# Peek at the first five rows.
nobels.head(n=5)

Unnamed: 0,year,discipline,nobelist
0,1901,Chemistry,Jacobus Henricus van 't Hoff
1,1901,Literature,Sully Prudhomme
2,1901,Medicine,Emil Adolf von Behring
3,1901,Peace,Frédéric Passy
4,1901,Peace,Henry Dunant


In [6]:
# Peek at the last five rows.
nobels.tail(n=5)

Unnamed: 0,year,discipline,nobelist
945,2019,Medicine,William Kaelin Jr.
946,2019,Peace,Abiy Ahmed
947,2019,Physics,Didier Queloz
948,2019,Physics,James Peebles
949,2019,Physics,Michel Mayor


In [7]:
# len() of a DataFrame is its number of rows.
len(nobels)

950

In [8]:
# The column labels, held in a pandas Index.
nobels.columns

Index(['year', 'discipline', 'nobelist'], dtype='object')

In [9]:
# The dtype of each column, returned as a Series keyed by column name.
nobels.dtypes

year           int64
discipline    object
nobelist      object
dtype: object

In [10]:
# The row labels; no index column was specified when loading, so this is a RangeIndex.
nobels.index

RangeIndex(start=0, stop=950, step=1)

In [11]:
# Bracket access with a column label returns that column as a Series.
nobels['discipline']

0       Chemistry
1      Literature
2        Medicine
3           Peace
4           Peace
          ...    
945      Medicine
946         Peace
947       Physics
948       Physics
949       Physics
Name: discipline, Length: 950, dtype: object

In [12]:
# Attribute access is a shorthand for the same thing: it also returns a Series.
nobels.nobelist

0      Jacobus Henricus van 't Hoff
1                   Sully Prudhomme
2            Emil Adolf von Behring
3                    Frédéric Passy
4                      Henry Dunant
                   ...             
945              William Kaelin Jr.
946                      Abiy Ahmed
947                   Didier Queloz
948                   James Peebles
949                    Michel Mayor
Name: nobelist, Length: 950, dtype: object

In [13]:
# Series.to_numpy() is the recommended way to get the underlying NumPy array
# (preferred over the older .values attribute); show only the first 50 entries.
nobels['discipline'].to_numpy()[:50]

array(['Chemistry', 'Literature', 'Medicine', 'Peace', 'Peace', 'Physics',
       'Chemistry', 'Literature', 'Medicine', 'Peace', 'Peace', 'Physics',
       'Physics', 'Chemistry', 'Literature', 'Medicine', 'Peace',
       'Physics', 'Physics', 'Physics', 'Chemistry', 'Literature',
       'Literature', 'Medicine', 'Peace', 'Physics', 'Chemistry',
       'Literature', 'Medicine', 'Peace', 'Physics', 'Chemistry',
       'Literature', 'Medicine', 'Medicine', 'Peace', 'Physics',
       'Chemistry', 'Literature', 'Medicine', 'Peace', 'Peace', 'Physics',
       'Chemistry', 'Literature', 'Medicine', 'Medicine', 'Peace',
       'Peace', 'Physics'], dtype=object)

In [14]:
# Distinct disciplines; the result is a NumPy array as well.
nobels['discipline'].unique()

array(['Chemistry', 'Literature', 'Medicine', 'Peace', 'Physics',
       'Economics'], dtype=object)

In [15]:
# How many times each laureate appears, most frequent first.
nobels['nobelist'].value_counts()

International Committee of the Red Cross         3
John Bardeen                                     2
United Nations High Commissioner for Refugees    2
Marie Curie                                      2
Frederick Sanger                                 2
                                                ..
Derek Walcott                                    1
Fritz Albert Lipmann                             1
Albert Abraham Michelson                         1
Hans von Euler-Chelpin                           1
Christian de Duve                                1
Name: nobelist, Length: 943, dtype: int64

In [16]:
# Boolean-mask selection: keep only the rows whose discipline is Physics.
nobels.loc[nobels['discipline'] == 'Physics']

Unnamed: 0,year,discipline,nobelist
5,1901,Physics,Wilhelm Röntgen
11,1902,Physics,Hendrik Lorentz
12,1902,Physics,Pieter Zeeman
17,1903,Physics,Henri Becquerel
18,1903,Physics,Marie Curie
...,...,...,...
934,2018,Physics,Donna Strickland
935,2018,Physics,Gérard Mourou
947,2019,Physics,Didier Queloz
948,2019,Physics,James Peebles


In [17]:
# The same kind of row filter, written as a query string instead of a boolean mask.
nobels.query('discipline == "Chemistry"')

Unnamed: 0,year,discipline,nobelist
0,1901,Chemistry,Jacobus Henricus van 't Hoff
6,1902,Chemistry,Hermann Emil Fischer
13,1903,Chemistry,Svante Arrhenius
20,1904,Chemistry,William Ramsay
26,1905,Chemistry,Adolf von Baeyer
...,...,...,...
924,2018,Chemistry,George P. Smith
925,2018,Chemistry,Greg Winter
936,2019,Chemistry,Akira Yoshino
937,2019,Chemistry,John B. Goodenough


In [18]:
# Pitfall: `in` on a Series tests membership among the index labels, not the
# values, so the expression below is just `nobels[False]` and raises KeyError.
# The error is caught and printed so the demonstration does not stop "Run All".
try:
    nobels['Curie' in nobels.nobelist]
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: False

In [23]:
# Correct approach: the .str accessor tests each value and yields a boolean mask.
nobels.loc[nobels['nobelist'].str.contains('Curie')]

Unnamed: 0,year,discipline,nobelist
18,1903,Physics,Marie Curie
19,1903,Physics,Pierre Curie
62,1911,Chemistry,Marie Curie
178,1935,Chemistry,Frédéric Joliot-Curie
179,1935,Chemistry,Irène Joliot-Curie


In [24]:
# Load a saved NumPy array from disk (a structured array with named fields, as displayed in the next cell).
disco = np.load('discography.npy')

In [25]:
# Show the raw records together with the array's field names and dtypes.
disco

array([('David Bowie', '1969-11-14', 17),
       ('The Man Who Sold the World', '1970-11-04',  3),
       ('Hunky Dory', '1971-12-17',  5),
       ('Ziggy Stardust', '1972-06-16',  1),
       ('Aladdin Sane', '1973-04-13',  1), ('Pin Ups', '1973-10-19',  1),
       ('Diamond Dogs', '1974-05-24',  1),
       ('Young Americans', '1975-03-07',  2),
       ('Station To Station', '1976-01-23',  5),
       ('Low', '1977-01-14',  2), ('Heroes', '1977-10-14',  3),
       ('Lodger', '1979-05-18',  4)],
      dtype=[('title', '<U32'), ('release', '<M8[D]'), ('toprank', '<i8')])

In [26]:
# Build a DataFrame directly from the structured array; its field names become the column labels.
disco_df = pd.DataFrame(disco)

In [27]:
# Display the resulting frame.
disco_df

Unnamed: 0,title,release,toprank
0,David Bowie,1969-11-14,17
1,The Man Who Sold the World,1970-11-04,3
2,Hunky Dory,1971-12-17,5
3,Ziggy Stardust,1972-06-16,1
4,Aladdin Sane,1973-04-13,1
5,Pin Ups,1973-10-19,1
6,Diamond Dogs,1974-05-24,1
7,Young Americans,1975-03-07,2
8,Station To Station,1976-01-23,5
9,Low,1977-01-14,2


In [28]:
# Check how the NumPy field dtypes were mapped to pandas column dtypes
# (e.g. the day-resolution dates are stored as a pandas datetime column).
disco_df.dtypes

title              object
release    datetime64[ns]
toprank             int64
dtype: object

In [29]:
# A list of dicts yields one row per dict; the keys become the column labels.
pd.DataFrame(data=[
    dict(title='David Bowie', year=1969),
    dict(title='The Man Who Sold the World', year=1970),
    dict(title='Hunky Dory', year=1971),
])

Unnamed: 0,title,year
0,David Bowie,1969
1,The Man Who Sold the World,1970
2,Hunky Dory,1971


In [30]:
# A list of tuples yields one row per tuple; the column labels are passed separately.
pd.DataFrame(
    data=[('Ziggy Stardust', 1), ('Aladdin Sane', 1), ('Pin Ups', 1)],
    columns=['title', 'toprank'],
)

Unnamed: 0,title,toprank
0,Ziggy Stardust,1
1,Aladdin Sane,1
2,Pin Ups,1


In [32]:
# Indexing a structured array by field name returns that field as a plain array.
disco['title'], disco['release']

(array(['David Bowie', 'The Man Who Sold the World', 'Hunky Dory',
        'Ziggy Stardust', 'Aladdin Sane', 'Pin Ups', 'Diamond Dogs',
        'Young Americans', 'Station To Station', 'Low', 'Heroes', 'Lodger'],
       dtype='<U32'),
 array(['1969-11-14', '1970-11-04', '1971-12-17', '1972-06-16',
        '1973-04-13', '1973-10-19', '1974-05-24', '1975-03-07',
        '1976-01-23', '1977-01-14', '1977-10-14', '1979-05-18'],
       dtype='datetime64[D]'))

In [31]:
# A dict of equal-length lists yields one column per key.
pd.DataFrame(dict(
    title=[
        'David Bowie', 'The Man Who Sold the World', 'Hunky Dory',
        'Ziggy Stardust', 'Aladdin Sane', 'Pin Ups', 'Diamond Dogs',
        'Young Americans', 'Station To Station', 'Low', 'Heroes', 'Lodger',
    ],
    release=[
        '1969-11-14', '1970-11-04', '1971-12-17', '1972-06-16',
        '1973-04-13', '1973-10-19', '1974-05-24', '1975-03-07',
        '1976-01-23', '1977-01-14', '1977-10-14', '1979-05-18',
    ],
))

Unnamed: 0,title,release
0,David Bowie,1969-11-14
1,The Man Who Sold the World,1970-11-04
2,Hunky Dory,1971-12-17
3,Ziggy Stardust,1972-06-16
4,Aladdin Sane,1973-04-13
5,Pin Ups,1973-10-19
6,Diamond Dogs,1974-05-24
7,Young Americans,1975-03-07
8,Station To Station,1976-01-23
9,Low,1977-01-14
