Numpy - 
  np.array
  np.ndarray
  
Pandas -
  pd.Series
  pd.DataFrame
  
 

# Pandas data structure
* Pandas is built around two collection objects
  * DataFrame
    * Tabular data structure
    * Consists of 
       * columns (each of which is a series)
       * index: name(s) attached to rows
       * Can be accessed as collection of rows and/or columns
  * Series  
    * Similar to a Numpy array
    * Consists of a name, values and *index* 
      * Numpy arrays do not have an index
    * The values and indices are themselves Numpy arrays
      
      

In [1]:
import pandas as pd

# Implicit index: no labels are given, so pandas assigns 0, 1, 2.
series = pd.Series([10, 20, 30])
print(series)

# Explicit index: labels are supplied through the `index` argument.
series = pd.Series([0, 1, 2, 3], index=["zero", "one", "two", "three"])
print(series)

# Index taken from a dict: the keys become the labels, the values the data.
capitals = pd.Series({
    'France': 'Paris',
    'Japan': 'Tokyo',
    'Germany': 'Berlin',
})
print(capitals)

0    10
1    20
2    30
dtype: int64
zero     0
one      1
two      2
three    3
dtype: int64
France      Paris
Japan       Tokyo
Germany    Berlin
dtype: object


In [0]:
import numpy as np
# Draw 3 random floats (uniform on [0, 1)) into a 1-D NumPy array.
numpy_arr = np.random.rand(3)

import pandas as pd

# Wrap the NumPy array in a pandas Series. No index is given, so pandas
# assigns the default integer labels 0..2.
pdseries = pd.Series(numpy_arr)
print(pdseries)
# The last expression of the cell is echoed as its output: the Series class.
type(pdseries)

0    0.534935
1    0.959273
2    0.721365
dtype: float64


pandas.core.series.Series

In [0]:
pd_df = pd.DataFrame(np.random.rand(3,2))
type(pd_df)

pandas.core.frame.DataFrame

In [0]:
series = pd.Series(numpy_arr)
series

0    0.534935
1    0.959273
2    0.721365
dtype: float64

In [0]:
print(numpy_arr)

[0.53493538 0.95927306 0.72136454]


In [0]:
numpy_arr[0]

0.5349353785999251

In [0]:
series[0]

0.5349353785999251

In [0]:
series = pd.Series(numpy_arr, index=['First','Second','Third'])

In [0]:
series['First']

0.5349353785999251

In [0]:
# Attribute-style access to a label. Avoid it: it only works when the label is a
# valid Python identifier and does not clash with an existing Series attribute or
# method. Prefer series['First'].
series.First

0.5349353785999251

In [0]:
# The index now holds string labels, so fetch the first element by position with
# .iloc. Plain series[0] relies on an integer-as-position fallback that is
# deprecated in recent pandas releases.
series.iloc[0]

0.5349353785999251

In [0]:
series

First     0.534935
Second    0.959273
Third     0.721365
dtype: float64

In [0]:
# NOTE: this raises KeyError. Plain [] on a DataFrame selects columns by label,
# so (0, 1) is looked up as a single column label, which does not exist.
# Positional cell access is spelled pd_df.iloc[0, 1].
pd_df[0,1]

KeyError: ignored

In [0]:
pd_df.columns

RangeIndex(start=0, stop=2, step=1)

In [0]:
pd_df

Unnamed: 0,0,1
0,0.305365,0.625116
1,0.752536,0.572952
2,0.363822,0.356349


In [0]:
pd_df.columns = ['First','Second']

In [0]:
pd_df

Unnamed: 0,First,Second
0,0.305365,0.625116
1,0.752536,0.572952
2,0.363822,0.356349


In [0]:
pd_df.Second

0    0.625116
1    0.572952
2    0.356349
Name: Second, dtype: float64

In [0]:
pd_df['Second']

0    0.625116
1    0.572952
2    0.356349
Name: Second, dtype: float64

Since only one column is selected here, the result is a Series, not a DataFrame.

Pandas compatible data sources -


*   Text File
    *   CSV
    *   JSON
    *   HTML
*   Binary Files
*   Relational Databases


In [0]:
df = pd.read_csv('artwork_data.csv', nrows=5)

In [0]:
df.head()

Unnamed: 0,id,accession_number,artist,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url
0,1035,A00001,"Blake, Robert",artist,38,A Figure Bowing before a Seated Old Man with h...,date not known,"Watercolour, ink, chalk and graphite on paper....",Presented by Mrs John Richmond 1922,,1922,support: 394 x 419 mm,394,419,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-a-fi...
1,1036,A00002,"Blake, Robert",artist,38,"Two Drawings of Frightened Figures, Probably f...",date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922,support: 311 x 213 mm,311,213,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-two-...
2,1037,A00003,"Blake, Robert",artist,38,The Preaching of Warning. Verso: An Old Man En...,?c.1785,Graphite on paper. Verso: graphite on paper,Presented by Mrs John Richmond 1922,1785.0,1922,support: 343 x 467 mm,343,467,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...
3,1038,A00004,"Blake, Robert",artist,38,Six Drawings of Figures with Outstretched Arms,date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922,support: 318 x 394 mm,318,394,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-six-...
4,1039,A00005,"Blake, William",artist,39,The Circle of the Lustful: Francesca da Rimini...,"1826–7, reprinted 1892",Line engraving on paper,Purchased with the assistance of a special gra...,1826.0,1919,image: 243 x 335 mm,243,335,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...


In [0]:
df = pd.read_csv('artwork_data.csv', nrows=5, index_col='id', usecols=['id','artist'])

In [0]:
df

Unnamed: 0_level_0,artist
id,Unnamed: 1_level_1
1035,"Blake, Robert"
1036,"Blake, Robert"
1037,"Blake, Robert"
1038,"Blake, Robert"
1039,"Blake, William"


In [0]:
# Subset of CSV columns to load; 'id' is included because it is later used as the index.
cols_to_use = [
    'id', 'artist', 'title', 'medium', 'year',
    'acquisitionYear', 'width', 'height', 'units',
]

In [0]:
df = pd.read_csv('artwork_data.csv', usecols=cols_to_use, index_col='id', low_memory=False)
# usecols loads only the columns listed in cols_to_use; index_col='id' makes the
# id field the row index.
# low_memory=False makes pandas parse the file in one pass instead of in internal
# chunks, so each column's dtype is inferred once over the whole column (this
# avoids mixed-type inference warnings on large files, at the cost of more memory).

In [0]:
df.shape

(69201, 8)

In [0]:
df.head()

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1035,"Blake, Robert",A Figure Bowing before a Seated Old Man with h...,"Watercolour, ink, chalk and graphite on paper....",,1922.0,394,419,mm
1036,"Blake, Robert","Two Drawings of Frightened Figures, Probably f...",Graphite on paper,,1922.0,311,213,mm
1037,"Blake, Robert",The Preaching of Warning. Verso: An Old Man En...,Graphite on paper. Verso: graphite on paper,1785.0,1922.0,343,467,mm
1038,"Blake, Robert",Six Drawings of Figures with Outstretched Arms,Graphite on paper,,1922.0,318,394,mm
1039,"Blake, William",The Circle of the Lustful: Francesca da Rimini...,Line engraving on paper,1826.0,1919.0,243,335,mm


In [0]:
len(set(df['artist']))

3336

How many distinct artists are there in the dataset?

How many artworks by Francis Bacon are there?

What is the artwork with the biggest dimensions? 

In [0]:
df['artist']
#this will give us a series

id
1035                                          Blake, Robert
1036                                          Blake, Robert
1037                                          Blake, Robert
1038                                          Blake, Robert
1039                                         Blake, William
1040                                         Blake, William
1041                                         Blake, William
1042                                         Blake, William
1043                                         Blake, William
1044                                         Blake, William
1045                                         Blake, William
1046                                         Blake, William
1047                                         Blake, William
1048                                         Blake, William
1049                                         Blake, William
1050                                         Blake, William
1051                                 

In [0]:
df[['artist','title']]
# Indexing with a list of column labels returns a DataFrame
# (a single label, as in df['artist'], returns a Series).

Unnamed: 0_level_0,artist,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1035,"Blake, Robert",A Figure Bowing before a Seated Old Man with h...
1036,"Blake, Robert","Two Drawings of Frightened Figures, Probably f..."
1037,"Blake, Robert",The Preaching of Warning. Verso: An Old Man En...
1038,"Blake, Robert",Six Drawings of Figures with Outstretched Arms
1039,"Blake, William",The Circle of the Lustful: Francesca da Rimini...
1040,"Blake, William",Ciampolo the Barrator Tormented by the Devils
1041,"Blake, William",The Baffled Devils Fighting
1042,"Blake, William",The Six-Footed Serpent Attacking Agnolo Brunel...
1043,"Blake, William",The Serpent Attacking Buoso Donati
1044,"Blake, William",The Pit of Disease: The Falsifiers


In [0]:
#Count the number of artists in our dataset
len(set(df['artist']))

3336

In [0]:
#pandas way -
uni = pd.unique(df['artist'])
print(uni)

['Blake, Robert' 'Blake, William' 'Richmond, George' ... 'Sterne, Hedda'
 'P-Orridge, Genesis' 'Brunias, Agostino']


In [0]:
len(uni)

3336

In [0]:
# Boolean mask: True for every row whose artist is 'Bacon, Francis'.
s = df['artist'] == 'Bacon, Francis'
# value_counts() tallies the mask, so the count reported for True is the number
# of artworks by Francis Bacon.
s.value_counts()

False    69151
True        50
Name: artist, dtype: int64

loc by label. 

iloc by position.

df.loc[row indexer , column indexer ]

df.loc[1035, 'artist']

df.loc[df['artist'] == 'Bacon, Francis' , : ]

In the statement above, the row indexer `df['artist'] == 'Bacon, Francis'` is a boolean pandas Series (True/False for each row), so only the rows marked True are returned; the `:` column indexer selects all columns.

df.iloc[ 100:300 , [0,1,4] ]
df.iloc[ : , [1,3,4] ]

With iloc we cannot use labels; we have to provide integer positions instead.

In [0]:
df.loc[1035,'artist']

'Blake, Robert'

In [0]:
df.iloc[0,0]

'Blake, Robert'

In [0]:
df.iloc[0:2,0:2]

Unnamed: 0_level_0,artist,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1035,"Blake, Robert",A Figure Bowing before a Seated Old Man with h...
1036,"Blake, Robert","Two Drawings of Frightened Figures, Probably f..."


In [0]:
# find the artwork with the biggest area/dimensions

In [0]:
df.head()

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1035,"Blake, Robert",A Figure Bowing before a Seated Old Man with h...,"Watercolour, ink, chalk and graphite on paper....",,1922.0,394,419,mm
1036,"Blake, Robert","Two Drawings of Frightened Figures, Probably f...",Graphite on paper,,1922.0,311,213,mm
1037,"Blake, Robert",The Preaching of Warning. Verso: An Old Man En...,Graphite on paper. Verso: graphite on paper,1785.0,1922.0,343,467,mm
1038,"Blake, Robert",Six Drawings of Figures with Outstretched Arms,Graphite on paper,,1922.0,318,394,mm
1039,"Blake, William",The Circle of the Lustful: Francesca da Rimini...,Line engraving on paper,1826.0,1919.0,243,335,mm


In [0]:
df.columns

Index(['artist', 'title', 'medium', 'year', 'acquisitionYear', 'width',
       'height', 'units'],
      dtype='object')

In [0]:
df['height'] * df['width']
# This raises TypeError: both columns were loaded with dtype object (they hold
# strings), so they cannot be multiplied until they are converted to numbers.

TypeError: ignored

In [0]:
type(df['height'])

pandas.core.series.Series

In [0]:
type(df['width'])

pandas.core.series.Series

In [0]:
df['width']

id
1035       394
1036       311
1037       343
1038       318
1039       243
1040       240
1041       242
1042       246
1043       241
1044       243
1045       236
1046       184
1047       197
1048       197
1049       200
1050       198
1051       198
1052       198
1053       199
1054       198
1055       198
1056       197
1057       200
1058       198
1059       191
1060       200
1061       186
1062       200
1063       198
1064       197
          ... 
121196     558
121197     380
121198     560
115637     NaN
115638     NaN
122545    2093
120104     NaN
121565     NaN
116157     NaN
117761     NaN
115217     NaN
122537     NaN
120527     NaN
127128     368
127129     705
127130     408
127131     410
121283     NaN
129068      57
117863     NaN
120549     NaN
123620     810
122900     NaN
122958     305
122959     305
122960     305
122961     305
121181      45
112306     NaN
127035     508
Name: width, Length: 69201, dtype: object

In [0]:
# The multiplication failed because the input data is dirty. Sorting the column
# brings some of the offending entries to the top.
df['width'].sort_values().head()
# Some entries are non-numeric strings such as '(each):' or '(image):'.

id
20822            (1):
105337    (diameter):
98671         (each):
76420         (each):
91391        (image):
Name: width, dtype: object

In [0]:
df['width'].sort_values().tail()
#NaN stands for Not a Number. These are originally missing values.

id
121283    NaN
117863    NaN
120549    NaN
122900    NaN
112306    NaN
Name: width, dtype: object

In [0]:
# Convert the column to a numeric dtype. errors='coerce' turns every value that
# cannot be parsed as a number into NaN instead of raising.
# The whole column is replaced with df['width'] = ... rather than
# df.loc[:, 'width'] = ...: since pandas 2.0 the .loc form writes into the existing
# object-dtype column in place, so the dtype would stay object instead of float64.
df['width'] = pd.to_numeric(df['width'], errors='coerce')

In [0]:
df['width'].sort_values().head()

id
9453      3.0
21771     5.0
21770     9.0
8976     10.0
509      10.0
Name: width, dtype: float64

In [0]:
# Same conversion for the height column: unparseable values become NaN.
# Whole-column assignment (not df.loc[:, 'height'] = ...) is used so the column's
# dtype actually changes from object to float64 on pandas >= 2.0 as well.
df['height'] = pd.to_numeric(df['height'], errors='coerce')

In [0]:
df['height'] * df['width']

id
1035       165086.0
1036        66243.0
1037       160181.0
1038       125292.0
1039        81405.0
1040        81120.0
1041        80828.0
1042        83640.0
1043        80735.0
1044        82620.0
1045        80240.0
1046        27600.0
1047        29747.0
1048        30141.0
1049        30400.0
1050        30096.0
1051        30294.0
1052        30294.0
1053        29850.0
1054        30096.0
1055        30096.0
1056        29944.0
1057        30200.0
1058        29898.0
1059        28650.0
1060        30200.0
1061        27900.0
1062        30200.0
1063        29700.0
1064        29550.0
            ...    
121196     424080.0
121197     215080.0
121198     426720.0
115637          NaN
115638          NaN
122545    3514147.0
120104          NaN
121565          NaN
116157          NaN
117761          NaN
115217          NaN
122537          NaN
120527          NaN
127128      98256.0
127129     331350.0
127130      83640.0
127131     129970.0
121283          NaN
129068       3249

In [0]:
# assign() returns a new DataFrame with the extra 'area' column (height * width);
# rebinding df makes that column available to the following cells.
df = df.assign(area = df['height'] * df['width'])

In [0]:
df.head()

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1035,"Blake, Robert",A Figure Bowing before a Seated Old Man with h...,"Watercolour, ink, chalk and graphite on paper....",,1922.0,394.0,419.0,mm,165086.0
1036,"Blake, Robert","Two Drawings of Frightened Figures, Probably f...",Graphite on paper,,1922.0,311.0,213.0,mm,66243.0
1037,"Blake, Robert",The Preaching of Warning. Verso: An Old Man En...,Graphite on paper. Verso: graphite on paper,1785.0,1922.0,343.0,467.0,mm,160181.0
1038,"Blake, Robert",Six Drawings of Figures with Outstretched Arms,Graphite on paper,,1922.0,318.0,394.0,mm,125292.0
1039,"Blake, William",The Circle of the Lustful: Francesca da Rimini...,Line engraving on paper,1826.0,1919.0,243.0,335.0,mm,81405.0


In [0]:
df['area'].max()

132462000.0

In [0]:
df['area'].idxmax()

98367

In [0]:
type(df.loc[df['area'].idxmax(), : ])

pandas.core.series.Series

## Groups

Aggregation : Example -
What was the first acquired artwork for each artist?

Transformation : Example - handling missing values (NaN):
remove the row, or
replace the value with the mean/median/mode.

Filtering : Select only those rows that matter to us. Example - only the United States rows of a countries.csv dataset.


In [0]:
df.groupby('artist')

<pandas.core.groupby.DataFrameGroupBy object at 0x7f7c62f628d0>

In [0]:
# Take a small positional slice to experiment on: rows 49980 up to (but not
# including) 50019, all columns. .copy() makes small_df an independent DataFrame
# rather than a slice of df.
small_df = df.iloc[49980 : 50019,  : ].copy()

In [0]:
small_df.shape

(39, 8)

In [0]:
small_df.head()

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16412,Wols,[no title],Etching on paper,1937,1983.0,130,80,mm
16413,Wols,[no title],Etching and drypoint on paper,1937,1983.0,187,102,mm
16414,Wols,[no title],Etching and drypoint on paper,1937,1983.0,248,168,mm
16415,Wols,[no title],Etching and drypoint on paper,1937,1983.0,149,102,mm
16416,Wols,[no title],Etching and drypoint on paper,1937,1983.0,203,120,mm


In [0]:
grouped = small_df.groupby('artist')

In [0]:
# Iterating over a GroupBy object yields (group key, sub-DataFrame) pairs.
# Print only the first group, then stop.
for name, group_df in grouped:
  print(name)
  print(group_df)
  break

Frost, Sir Terry
                artist            title               medium  year  \
id                                                                   
4704  Frost, Sir Terry        Blue Moon  Lithograph on paper  1952   
4705  Frost, Sir Terry      Boat Shapes     Linocut on paper  1952   
4706  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954   
4707  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954   
4708  Frost, Sir Terry            Leeds    Drypoint on paper  1956   
4709  Frost, Sir Terry  Camping, Anduze     Etching on paper  1979   
4710  Frost, Sir Terry     Umea, Sweden     Etching on paper  1979   
4711  Frost, Sir Terry    Self-Portrait     Etching on paper  1980   

      acquisitionYear  width  height units     area  
id                                                   
4704           1983.0  355.0   273.0    mm  96915.0  
4705           1983.0  132.0   143.0    mm  18876.0  
4706           1983.0  131.0   155.0    mm  20305.0  
4707        

In [0]:
# Aggregation: earliest acquisition year for each artist.
# The per-group minimum is computed on the grouped column, then each
# (artist, year) pair is printed.
earliest_by_artist = small_df.groupby('artist')['acquisitionYear'].min()
for name, min_year in earliest_by_artist.items():
  print("{}: {}".format(name, min_year))
  


Frost, Sir Terry: 1983.0
Phillips, Esq Tom: 1983.0
Wols: 1983.0


In [0]:
# Per-artist minimum of the title (alphabetical order) and of the acquisition year.
# The columns are selected explicitly: a bare df.groupby('artist').min() relied on
# pandas silently dropping columns it could not aggregate, which pandas >= 2.0 no
# longer does (it raises instead).
df.groupby('artist')[['title', 'acquisitionYear']].min()

Unnamed: 0_level_0,title,acquisitionYear
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
?British School,"Portrait of a Gentleman, probably of the West ...",1927.0
"Abakanowicz, Magdalena",Abakan Orange,2009.0
"Abbey, Edwin Austin",Illustration to ‘Judith Shakespeare’,1924.0
"Abbott, Berenice",Dinty Moore Antiques,2010.0
"Abbott, Lemuel Francis","Henry Byne, of Carshalton",1885.0
"Abrahams, Ivor",A Dream Within a Dream,1975.0
Absalon,Assassinations,1997.0
"Abts, Tomma",Noeme,2006.0
"Acconci, Vito",3 Flags for 1 Space and 6 Regions,1982.0
"Ackling, Roger",Five Sunsets in One Hour,1983.0


In [0]:
# Transformation
# Simulate missing data, as if the cells had been blanked out by hand:
# set 'medium' to NaN for the rows labelled 11838 and 16441, so there are
# values that have to be inferred later.
small_df.loc[[11838,16441], 'medium'] = np.nan


Filtering :

Keep the artworks whose title appears at least 2 times in the dataset.

In [0]:
# Group the artworks by title and count how many rows share each title
# (largest groups first).
grouped_titles = df.groupby('title')
title_counts = grouped_titles.size().sort_values(ascending=False)


def condition(x):
    """Return True for a title group that contains more than one row."""
    return len(x.index) > 1


# filter() keeps only the rows that belong to groups satisfying `condition`.
# The sort is chained without inplace=True: sorting the filter() result in place
# triggers pandas' SettingWithCopyWarning, while sort_values() without inplace
# simply returns a new, sorted DataFrame.
dup_titles_df = grouped_titles.filter(condition).sort_values('title')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [0]:
dup_titles_df.head()

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20571,"Scully, Sean",#1,Aquatint on paper,1992,1992.0,117,174,mm
21345,"Dunham, Carroll",#1,Drypoint on paper,1989,2002.0,392,579,mm
21346,"Dunham, Carroll",#2,Drypoint on paper,1989,2002.0,392,579,mm
20572,"Scully, Sean",#2,Aquatint on paper,1992,1992.0,323,278,mm
21347,"Dunham, Carroll",#3,Drypoint on paper,1989,2002.0,392,579,mm
