In [133]:
import pandas as pd
import numpy as np
# Load the BL Flickr book metadata CSV into a DataFrame, then show the type
# of the loaded object as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="BL-Flickr-Images-Book.csv")
type(df)


pandas.core.frame.DataFrame

In [134]:
# Re-display the type of the loaded object (a pandas DataFrame per the recorded output).
type(df)

pandas.core.frame.DataFrame

In [125]:
# Number of rows and columns in the DataFrame, as a (rows, columns) tuple.
df.shape


(8287, 15)

In [144]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Identifier,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
3,472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
4,480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [136]:
# Percentage of missing values in each column, highest first.
# The fraction is rounded to 4 decimals before scaling to a percentage.
missing_fraction = df.isna().mean().round(4)
(missing_fraction * 100).sort_values(ascending=False)

Engraver                  100.00
Corporate Contributors    100.00
Corporate Author          100.00
Former owner               99.99
Edition Statement          90.67
Publisher                  50.62
Author                     21.46
Date of Publication         2.18
Shelfmarks                  0.00
Flickr URL                  0.00
Issuance type               0.00
Contributors                0.00
Title                       0.00
Place of Publication        0.00
Identifier                  0.00
dtype: float64

In [142]:
# Drop the columns that are more than 90% missing, plus the other columns
# that are not helpful for the analysis (as per business).
to_drop = [
    'Engraver',
    'Corporate Contributors',
    'Corporate Author',
    'Former owner',
    'Edition Statement',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]
# Rebind `df` instead of mutating it in place, and tolerate columns that are
# already gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=to_drop, errors='ignore')

In [140]:
# Inspect a slice of 'Date of Publication' (index labels 1905 onwards):
# some entries hold more than one value (e.g. a range), whereas each book
# should carry exactly one publication year.
df['Date of Publication'].loc[1905:].head(10)

1905         1879
1906         1886
1907         1844
1908    1861-1909
1909         1884
1910         1885
1911         1834
1912         1837
1913         1867
1914         1830
Name: Date of Publication, dtype: object

In [141]:
# Cleaning plan for 'Date of Publication':
#
# * Remove the extra dates in square brackets, wherever present: 1879 [1878]
# * Convert date ranges to their "start date", wherever present: 1860-63; 1839, 38-54
# * Completely remove the dates we are not certain about and replace them
#   with NumPy's NaN: [1897?]
# * Convert the string nan to NumPy's NaN value
#
# One anchored regex covers all four points: keep the four digits found at
# the very start of the string. Values that do not start with four digits
# (e.g. '[1897?]') and missing values come back as NaN.
extractDate = df['Date of Publication'].str.extract(r'^(\d{4})', expand=False)
extractDate.head()

0    1879
1    1868
2    1869
3    1851
4    1857
Name: Date of Publication, dtype: object

In [42]:
# Replace the raw column with the extracted four-digit years.
df['Date of Publication'] = extractDate

# Fraction of rows left without a usable year after the extraction.
# NOTE(review): the recorded result is ~0.117 (about 11.7% of rows), which is
# not negligible; confirm that ignoring these rows is acceptable.
df['Date of Publication'].isna().mean()


0.11717147339205986

In [50]:
# Next column to clean: 'Place of Publication'. It holds free-form strings,
# so np.where will be used to normalise it.
# First, look at what the column currently contains:
df.loc[:, 'Place of Publication'].head(10)



Identifier
206                                  London
216                London; Virtue & Yorston
218                                  London
472                                  London
480                                  London
481                                  London
519                                  London
667     pp. 40. G. Bryan & Co: Oxford, 1898
874                                 London]
1143                                 London
Name: Place of Publication, dtype: object

In [47]:
# Keep a handle on the column, then build a boolean mask that is True for
# every row whose place of publication mentions 'London'.
pub = df['Place of Publication']
london = pub.str.contains('London')
# Show the first five mask values.
london.head(5)


Identifier
206    True
216    True
218    True
472    True
480    True
Name: Place of Publication, dtype: bool

In [51]:
# Compare two books published in the same place: this record spells it
# 'Newcastle-upon-Tyne' (hyphenated), the other one 'Newcastle upon Tyne'.
#
# Look the record up by its Identifier value. A bare df.loc[4157862] only
# works when Identifier is the index; with the default RangeIndex produced by
# read_csv it raises KeyError. Index by Identifier here when it is still a
# column, without modifying `df` itself.
by_identifier = df if df.index.name == 'Identifier' else df.set_index('Identifier')
by_identifier.loc[4157862]

Edition Statement                                                     NaN
Place of Publication                                  Newcastle-upon-Tyne
Date of Publication                                                  1867
Publisher                                                      T. Fordyce
Title                   Local Records; or, Historical Register of rema...
Author                      FORDYCE, T. - Printer, of Newcastle-upon-Tyne
Contributors             SYKES, John - Bookseller, of Newcastle-upon-Tyne
Former owner                                                          NaN
Issuance type                                                 monographic
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Shelfmarks              British Library HMNTS|British Library HMNTS 01...
Name: 4157862, dtype: object

In [52]:
# Show the record whose Identifier is 4159587.
# A bare df.loc[4159587] only works when Identifier is the index; with the
# default RangeIndex produced by read_csv it raises KeyError. Index by
# Identifier here when it is still a column, without modifying `df` itself.
by_identifier = df if df.index.name == 'Identifier' else df.set_index('Identifier')
by_identifier.loc[4159587]

Edition Statement                                                     NaN
Place of Publication                                  Newcastle upon Tyne
Date of Publication                                                  1834
Publisher                                                Mackenzie & Dent
Title                   An historical, topographical and descriptive v...
Author                                              Mackenzie, E. (Eneas)
Contributors                                         ROSS, M. - of Durham
Former owner                                                          NaN
Issuance type                                                 monographic
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Shelfmarks              British Library HMNTS|British Library HMNTS 10...
Name: 4159587, dtype: object

In [58]:
# Normalise 'Place of Publication' with nested np.where:
#   * any value mentioning London -> 'London'
#   * any value mentioning Oxford -> 'Oxford'
#   * everything else             -> original text with hyphens replaced by spaces
#
# `oxford` was never defined in the notebook (NameError on a fresh kernel),
# so build the mask here from the same `pub` Series that `london` came from.
oxford = pub.str.contains('Oxford')
df['Place of Publication'] = np.where(
    london, 'London',
    np.where(oxford, 'Oxford',
             # regex=False: '-' is a literal hyphen, not a pattern.
             pub.str.replace('-', ' ', regex=False)))

In [63]:
# Check the first 30 values of the cleaned 'Place of Publication' column.
df.loc[:, 'Place of Publication'].head(30)

0          London
1          London
2          London
3          London
4          London
5          London
6          London
7          Oxford
8          London
9          London
10       Coventry
11    Christiania
12        Firenze
13      Amsterdam
14         Savona
15         London
16          Paris
17          Paris
18    Puerto Rico
19       New York
20           Hull
21         London
22         Oxonii
23         London
24         London
25         London
26         Milano
27         London
28       Aberdeen
29           Wien
Name: Place of Publication, dtype: object

In [143]:
# Preview the first five rows of the frame again.
df.head(n=5)

Unnamed: 0,Identifier,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
3,472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
4,480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
