# Prepare notebook

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load data set, clean, & scrape web for additional information

In [2]:
publishers = pd.read_csv('./goodreads_library_export_cleaned.csv',  index_col = 0)[['Publisher']].fillna('')


In [3]:
publishers['normalized'] = publishers['Publisher'].apply(lambda s: s.split('/')[0].strip())



In [4]:
unique_publishers = publishers['normalized'].drop_duplicates()
unique_publishers  = unique_publishers.reset_index(drop = True)

In [5]:
publisher_counts = publishers['normalized'].value_counts()

In [6]:
vectorizer = CountVectorizer(analyzer = 'word', stop_words = ['press', 'publisher', 'publishing', 'publishers',
                                                               'university', 'group', 'book', 'books'])#analyzer = 'char', ngram_range = (5,5), )
publisher_ngrams = vectorizer.fit_transform(unique_publishers)
#print(vectorizer.get_feature_names())

In [7]:
similarities = np.triu(cosine_similarity(publisher_ngrams))

In [8]:
matches = np.where((similarities > 0.5) & (similarities < 0.9999))

In [9]:
d = {}

for i, m1 in enumerate(matches[0]):
    if m1 not in d.keys():
        d[m1] = [unique_publishers[m1], unique_publishers[matches[1][i]]]    
    else:
        d[m1] += [unique_publishers[matches[1][i]]] 
                

In [10]:
publisher_map = {}
for v in d.values():
    standard_name = v[np.argmax([publisher_counts[p] for p in v])]
    v.remove(standard_name) 
    for p in v:
        publisher_map[p]=standard_name

In [11]:
for k,v in publisher_map.items():
    if v in publisher_map.keys():
        publisher_map[k] = publisher_map[v]

In [12]:
publisher_map = pd.DataFrame.from_dict(publisher_map, orient = 'index')
publisher_map.index.name = 'old_name'
publisher_map = publisher_map.rename(columns = {0:'new_name'})


In [13]:
publishers['index'] = publishers.index
publishers = pd.DataFrame(publishers).merge(publisher_map, left_on = 'normalized', 
                                            right_on = 'old_name', how = 'left')
publishers = publishers.set_index('index')

In [14]:
idx = publishers.new_name.notnull()
publishers.loc[idx, 'normalized'] = publishers.loc[idx, 'new_name']

In [15]:
publishers['stripped'] =publishers['normalized'].apply(lambda s: re.sub("([a-z])([A-Z])","\g<1> \g<2>", s)
                                                       .lower().replace('press', '')
                                                       .replace('publishers', '')
                                                       .replace('publishing', '')
                                                       .replace('publisher', '')
                                                       .replace('publications', '')
                                                       .replace('university', '')
                                                       .replace('square', '')
                                                       .replace('group', '')
                                                       .replace('books', '')
                                                       .replace('book', '')
                                                       .replace('editions', '')
                                                       .replace('audio','')
                                                       .strip()
                                                      )

In [16]:
unique_publishers  = publishers[['normalized','stripped']].reset_index(drop = True)

In [17]:
publisher_counts = publishers['normalized'].value_counts()

In [18]:
vectorizer = CountVectorizer(analyzer = 'char', ngram_range = (4,4), max_df = 0.7)
publisher_ngrams = vectorizer.fit_transform(unique_publishers['stripped'])
#print(vectorizer.get_feature_names())

In [19]:
similarities = np.triu(cosine_similarity(publisher_ngrams))

In [20]:
matches = np.where((similarities > 0.6) & (similarities < 0.9999))

In [21]:
d = {}

for i, m1 in enumerate(matches[0]):
    if m1 not in d.keys():
        d[m1] = [unique_publishers.loc[m1, 'normalized'], unique_publishers.loc[matches[1][i],'normalized']]    
    else:
        d[m1] += [unique_publishers.loc[matches[1][i],'normalized']] 
                

In [22]:
publisher_map = {}
for v in d.values():
    standard_name = v[np.argmax([publisher_counts[p] for p in v])]
    v.remove(standard_name) 
    for p in v:
        publisher_map[p]=standard_name

In [23]:
for k,v in publisher_map.items():
    if v in publisher_map.keys():
        publisher_map[k] = publisher_map[v]

In [24]:
publisher_map = pd.DataFrame.from_dict(publisher_map, orient = 'index')
publisher_map.index.name = 'old_name'
publisher_map = publisher_map.rename(columns = {0:'new_name_2'})


In [25]:
publishers['index'] = publishers.index
publishers = pd.DataFrame(publishers).merge(publisher_map, left_on = 'normalized', 
                                            right_on = 'old_name', how = 'left')
publishers = publishers.set_index('index')

In [26]:
idx = publishers.new_name_2.notnull()
publishers.loc[idx, 'normalized'] = publishers.loc[idx, 'new_name_2']

In [27]:
len(publishers.normalized.unique())

209

In [28]:
len(publishers.Publisher.unique())

274

In [29]:
publishers.normalized.value_counts()

Oxford University Press, USA    48
Penguin Books                   26
W. W. Norton  Company           21
Harper                          19
                                16
                                ..
Weidenfeld & Nicolson            1
NYBG                             1
Hachette                         1
Bolinda                          1
Bedford Books                    1
Name: normalized, Length: 209, dtype: int64

In [30]:
output = publishers[['normalized']].rename(columns = {'normalized': 'publisher'})
output.to_pickle('20191128_normalized_publisher.pkl')

In [31]:
publishers

Unnamed: 0_level_0,Publisher,normalized,new_name,stripped,new_name_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Basic Books,Basic Books,,basic,
1,Knopf Publishing Group,Knopf,Knopf,knopf,
4,HarperAudio,Harper,,harper,Harper
5,Doubleday Books,Doubleday Books,,doubleday,
7,Hanover Square Press,Hanover Square Press,,hanover,
...,...,...,...,...,...
766,NYBG,NYBG,,nybg,
767,Basic Books,Basic Books,,basic,
768,Viking,Viking,,viking,
769,"Oxford University Press, USA","Oxford University Press, USA",,"oxford , usa",
