In [1]:
import pandas as pd

In [2]:
library_df = pd.read_csv('./datasets/library-collection-inventory/library-collection-inventory.csv')

In [7]:
collection_df = pd.read_excel('./datasets/library-collection-inventory/CollectionInventory_Codes_EXCLUDED_INCLUDED.xlsx')

In [8]:
type_df = pd.read_excel('./datasets/library-collection-inventory/CollectionInventory_Codes_EXCLUDED_INCLUDED.xlsx', sheet_name=1)

In [9]:
## Master Data Cleaning

In [10]:
# Only Keep Collection of Books
collection_df = collection_df[collection_df.FormatSubgroup == 'Book']

In [11]:
# Only Keep Books Types
type_df = type_df[type_df.FormatSubgroup == 'Book']

## Data Cleansing

In [3]:
# Create dataframe for unique books only (dorpping date and item count)
books_df = library_df.drop(columns=['ReportDate', 'ItemCount']).drop_duplicates().copy()

In [4]:
# Drop item location
books_df.drop(columns=['ItemLocation'], inplace=True)
books_df.drop_duplicates(inplace=True)

In [6]:
# Drop NaN ISBN, Author, Title
books_df.dropna(subset=['ISBN'], inplace=True)

In [24]:
# Drop Floating Item (since it's not relevant)
books_df.drop(columns=['FloatingItem'], inplace=True)
books_df.drop_duplicates(inplace=True)

In [33]:
# impute NaN values as empty string
books_df.fillna('', inplace=True)

# Drop Item collection (since it is changing)
books_df.drop(columns=['ItemCollection', 'FloatingItem'], inplace=True)
books_df.drop_duplicates(inplace=True)

In [13]:
# Only Keep books that have a valid collection id (based on the book types)
books_df = books_df[books_df.ItemCollection.isin(collection_df.Code)]

In [14]:
# Only keep books that have a valid type id (based on book types)
books_df = books_df[books_df.ItemType.isin(type_df.Code)]

# Drop books with weird publication year (does not contain 4 digits number)
library_df = library_df[library_df.PublicationYear.str.match(r'.*\d{4}.*')]

# Drop books with more than 4 ISBN values
library_df = library_df[library_df.ISBN.str.split(',').str.len() <= 4].copy()

# Split the ISBN values into four columns (ISBN#)
library_df[['ISBN1', 'ISBN2', 'ISBN3', 'ISBN4']] = library_df.ISBN.str.split(',', n=3, expand=True)

# Extract the first 4 digit number from publication year, and convert it to integer (ignore multiple years and characters)
library_df['PublicationYear'] = library_df.PublicationYear.str.replace(r'^.*?(\d{4}).*$', r'\1', regex=True).astype(int)

library_df = library_df[library_df.PublicationYear >= 1000]

In [17]:
# Checkpoint
books_df.to_pickle('./books.pkl')

# Removing duplicated records
library_df = library_df.drop_duplicates(subset=['BibNum'], keep='last')

In [None]:
# Remove duplicates based on BibNum
# - 

In [19]:
books_df.shape

(1050667, 10)

In [20]:
books_df.BibNum.nunique()

428624

In [26]:
books_df.BibNum.value_counts().head(10)

BibNum
493956     23
3297643    17
2647086    17
3272389    17
1959170    17
2749423    16
3263224    15
3346750    15
2932160    15
3228858    15
Name: count, dtype: int64

In [27]:
books_df[books_df.BibNum == 493956]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,Publisher,Subjects,ItemType,ItemCollection
163591,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,c1988.,"University of Oklahoma Press,","Washington State Historical geography, Washing...",arbk,naatlr
211589,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,c1988.,"University of Oklahoma Press,","Washington State Historical geography, Washing...",acbk,naatlas
256861,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,c1988.,"University of Oklahoma Press,","Washington State Historical geography, Washing...",arbk,naref
364443,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,c1988.,"University of Oklahoma Press,","Washington State Historical geography, Washing...",acbk,canf
598237,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,c1988.,"University of Oklahoma Press,","Washington State Historical geography, Washing...",acbk,naatlr
1036230,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,c1988.,"University of Oklahoma Press,","Washington State Historical geography, Washing...",acbk,naover
1165747,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,c1988.,"University of Oklahoma Press,","Washington State Historical geography, Washing...",arbk,camapr
4177623,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,[1988],"University of Oklahoma Press,","Washington State Historical geography, Washing...",arbk,naatlr
4213260,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,[1988],"University of Oklahoma Press,","Washington State Historical geography, Washing...",acbk,naover
4306571,493956,Historical atlas of Washington / by James W. S...,"Scott, James William, 1925-",806121084,[1988],"University of Oklahoma Press,","Washington State Historical geography, Washing...",acbk,naatlas


In [48]:
books_df[books_df.Author.str.contains('^Martin, George R. R.', case=False)].Author.value_counts()

Author
Martin, George R. R.    90
Name: count, dtype: int64

In [54]:
isbn_df = books_df[['BibNum', 'ISBN']].drop_duplicates().copy()

In [66]:
isbn_df.ISBN = isbn_df.ISBN.str.split(',')

In [69]:
isbn_df = isbn_df.explode(column='ISBN', ignore_index=True)

In [78]:
isbn_df.ISBN = isbn_df.ISBN.str.strip()

In [80]:
isbn_df.drop_duplicates(inplace=True)

In [87]:
(isbn_df.ISBN.value_counts()>2).sum()

np.int64(192)

In [85]:
books_df[books_df.BibNum.isin(list(isbn_df[isbn_df.ISBN == '9780195311808'].BibNum))]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,Publisher,Subjects,ItemType,ItemCollection
97812,2412114,John Brown / W.E.B. Du Bois ; introduction by ...,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325745, 9780195311808, 9780195...",c2007.,"Oxford University Press,","Brown John 1800 1859, Abolitionists United Sta...",acbk,naaab
140423,2412180,The ordeal of Mansart / W.E.B. Du Bois ; intro...,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325869, 9780195311808, 9780195...",c2007.,"Oxford University Press,","African Americans Southern States Fiction, Afr...",acbk,naaafic
153209,2411609,"Africa, its geography, people, and products ; ...","Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 019532580X, 9780195311808, 9780195...",c2007.,"Oxford University Press,","Africa, Africa Geography",acbk,naaanf
227651,2412119,Mansart builds a school / W.E.B. Du Bois ; int...,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325877, 9780195311808, 9780195...",c2007.,"Oxford University Press,",African Americans Fiction,acbk,naaafic
238407,2412115,The Philadelphia Negro : a social study / W.E....,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325729, 9780195311808, 9780195...",c2007.,"Oxford University Press,","African Americans Social conditions, Africa Am...",acbk,naaanf
246437,2411608,Dusk of dawn : an essay toward an autobiograph...,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325834, 9780195311808, 9780195...",c2007.,"Oxford University Press,",Du Bois W E B William Edward Burghardt 1868 19...,acbk,naaanf
269625,2412116,The suppression of the African slave-trade to ...,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325710, 9780195311808, 9780195...",c2007.,"Oxford University Press,",Slave trade United States History,acbk,naaanf
290349,2411611,Darkwater : voices from within the veil / W.E....,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 019532577X, 9780195311808, 9780195...",c2007.,"Oxford University Press,","African Americans, United States Race relations",acbk,naaanf
494817,2411612,The negro / W.E.B. Du Bois ; introduction by J...,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325761, 9780195311808, 9780195...",c2007.,"Oxford University Press,",Black race,acbk,naaanf
558604,2412122,Black folk then and now : an essay in the hist...,"Du Bois, W. E. B. (William Edward Burghardt), ...","0195311809, 0195325826, 9780195311808, 9780195...",c2007.,"Oxford University Press,","Black race, Blacks",acbk,naaanf


In [93]:
isbn_df.ISBN.str.len().value_counts()

ISBN
10    489352
13    322123
9        115
8         18
11         7
1          6
14         6
4          5
5          5
7          5
0          2
17         1
16         1
12         1
27         1
18         1
6          1
15         1
Name: count, dtype: int64

In [104]:
isbn_df = isbn_df[isbn_df.ISBN != '']

In [106]:
isbn_df.to_pickle('./isbn.pkl')

In [109]:
gr_isbns_df = pd.read_pickle('./datasets/goodreads/gr_isbns.pkl')

In [110]:
(isbn_df.ISBN.isin(gr_isbns_df.isbn) | isbn_df.ISBN.isin(gr_isbns_df.isbn13)).sum()

np.int64(355119)

In [111]:
(gr_isbns_df.isbn.isin(isbn_df.ISBN) | gr_isbns_df.isbn13.isin(isbn_df.ISBN)).sum()

np.int64(194123)

In [115]:
(gr_isbns_df.isbn13.isin(gr_isbns_df.isbn)).sum()

np.int64(263)

In [117]:
((gr_isbns_df.isbn != '') & (gr_isbns_df.isbn.isin(gr_isbns_df.isbn13))).sum()

np.int64(20)

In [119]:
list(gr_isbns_df[((gr_isbns_df.isbn != '') & (gr_isbns_df.isbn.isin(gr_isbns_df.isbn13)))].isbn)

['1622664051',
 '1937007375',
 '1439134006',
 '0671578855',
 '0803735006',
 '1496400801',
 '0374149208',
 '1400033888',
 '0062084267',
 '1622662687',
 '0385737947',
 '0802130798',
 '081013358X',
 '1481469827',
 '2253004219',
 '0758280343',
 '0316036234',
 '0062245414',
 '1482609509',
 '0345391802']

In [120]:
library_df.shape

(35531308, 13)

In [122]:
books_df.shape

(1043224, 9)

In [124]:
isbn_df.BibNum.value_counts()

BibNum
3021717    88
66056      86
2403978    73
75511      62
3139646    52
           ..
1847638     1
1837941     1
2361159     1
2057325     1
2164601     1
Name: count, Length: 428624, dtype: int64

In [127]:
list(isbn_df[isbn_df.BibNum == 66056].ISBN)

['0691015856',
 '069104533X',
 '0691045348',
 '0691045356',
 '0691045364',
 '0691045372',
 '0691045380',
 '0691045399',
 '0691045402',
 '0691045410',
 '0691045429',
 '0691045437',
 '0691045445',
 '0691045453',
 '0691045461',
 '069104547X',
 '0691045488',
 '0691045496',
 '0691045828',
 '0691045836',
 '0691046182',
 '0691046867',
 '0691046875',
 '0691047286',
 '0691047391',
 '0691047766',
 '0691047774',
 '0691047782',
 '0691047804',
 '0691090432',
 '0691094985',
 '0691118957',
 '0691124892',
 '069112910X',
 '0691135576',
 '0691137730',
 '0691137749',
 '069115001X',
 '069115323X',
 '0691156719',
 '0691160376',
 '0691164207',
 '0691170460',
 '9780691015859',
 '9780691045337',
 '9780691045344',
 '9780691045351',
 '9780691045368',
 '9780691045375',
 '9780691045382',
 '9780691045399',
 '9780691045405',
 '9780691045412',
 '9780691045429',
 '9780691045436',
 '9780691045443',
 '9780691045450',
 '9780691045467',
 '9780691045474',
 '9780691045481',
 '9780691045498',
 '9780691045825',
 '97806910458