# Progetto Architetture Dati

In [2]:
import pandas as pd
import numpy as np
import math

In [3]:
with open('book.txt') as f:
    lines = f.readlines()

In [4]:
lines[1]

'Indoo.com\t0201853949\tArt of Computer Programming, Volume 4, Fascicle 3, The: Generating All Combinations and Partitions\tKnuth, Donald E.\n'

In [5]:
book_dataset = pd.read_csv('book.txt', sep='\t', names=['Website', 'ISBN', 'Title', 'Author'], header=None, na_values="NaN", dtype=str)

In [6]:
book_dataset

Unnamed: 0,Website,ISBN,Title,Author
0,eCampus.com,0201853949,"The art Of Computer Programming, Fascicle 3: G...",Not Available
1,Indoo.com,0201853949,"Art of Computer Programming, Volume 4, Fascicl...","Knuth, Donald E."
2,textbookxdotcom,0201853949,"The 'art Of Computer Programming, Fascicle 3 G...",
3,A1Books,0201853949,"The Art of Computer Programming, Volume 4, Fas...","Knuth, Donald E."
4,textbooksNow,0201853949,Art of Computer Programming,Knuth
...,...,...,...,...
33966,www.textbooksrus.com,0138569231,Multimedia Communications: Protocols and Appli...,Franklin F. Kuo|Wolfgang Effelsberg|J. J. Garc...
33967,OPOE-ABE Books,0138569231,MULTIMEDIA COMMUNICATIONS,KUO
33968,textbookxdotcom,0138569231,Multimedia Communications Protocols and Applic...,Wolfgang Effelsberg
33969,paperbackworld.de,0138569231,Multimedia Communications: Protocols and Appli...,"Frank Kuo, J. J. Luna"


In [7]:
book_dataset.dtypes

Website    object
ISBN       object
Title      object
Author     object
dtype: object

In [8]:
book_dataset = book_dataset.astype(str)
book_dataset.dtypes

Website    object
ISBN       object
Title      object
Author     object
dtype: object

In [9]:
book_dataset.replace('Not Available', np.NaN, inplace=True)

In [10]:
type(book_dataset.iloc[0,3])

float

In [11]:
book_dataset.isna()

Unnamed: 0,Website,ISBN,Title,Author
0,False,False,False,True
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
33966,False,False,False,False
33967,False,False,False,False
33968,False,False,False,False
33969,False,False,False,False


In [12]:
na_rows = book_dataset[book_dataset.isna().any(axis=1)]
na_rows

Unnamed: 0,Website,ISBN,Title,Author
0,eCampus.com,201853949,"The art Of Computer Programming, Fascicle 3: G...",
106,eCampus.com,201853930,"The Art Of Computer Programming, Fascicle 2: G...",
1442,brandnewtextbooks,321268458,Database Systems: An Application Oriented Appr...,
1443,brandnewtextbooks,321268458,Database Systems: An Application Oriented Appr...,
3786,eCampus.com,131498622,Spring Into PHP 5,
3787,eCampus.com,131498622,Spring Into PHP 5,
3797,Books2Anywhere.com,131498622,Spring Into Php 5,
3800,Books2Anywhere.com,131498622,Spring Into Php 5,
3802,Paperbackshop-US,131498622,Spring Into Php 5,
3803,DVD Legacy,131498622,Spring Into PHP 5,


## Preprocessing dati

In [13]:
book_dataset

Unnamed: 0,Website,ISBN,Title,Author
0,eCampus.com,0201853949,"The art Of Computer Programming, Fascicle 3: G...",
1,Indoo.com,0201853949,"Art of Computer Programming, Volume 4, Fascicl...","Knuth, Donald E."
2,textbookxdotcom,0201853949,"The 'art Of Computer Programming, Fascicle 3 G...",
3,A1Books,0201853949,"The Art of Computer Programming, Volume 4, Fas...","Knuth, Donald E."
4,textbooksNow,0201853949,Art of Computer Programming,Knuth
...,...,...,...,...
33966,www.textbooksrus.com,0138569231,Multimedia Communications: Protocols and Appli...,Franklin F. Kuo|Wolfgang Effelsberg|J. J. Garc...
33967,OPOE-ABE Books,0138569231,MULTIMEDIA COMMUNICATIONS,KUO
33968,textbookxdotcom,0138569231,Multimedia Communications Protocols and Applic...,Wolfgang Effelsberg
33969,paperbackworld.de,0138569231,Multimedia Communications: Protocols and Appli...,"Frank Kuo, J. J. Luna"


In [14]:
from recordlinkage.preprocessing import clean
for x in book_dataset.columns:
    book_dataset[x] = clean(book_dataset[x])

In [15]:
book_dataset

Unnamed: 0,Website,ISBN,Title,Author
0,ecampuscom,0201853949,the art of computer programming fascicle 3 gen...,
1,indoocom,0201853949,art of computer programming volume 4 fascicle ...,knuth donald e
2,textbookxdotcom,0201853949,the art of computer programming fascicle 3 gen...,
3,a1books,0201853949,the art of computer programming volume 4 fasci...,knuth donald e
4,textbooksnow,0201853949,art of computer programming,knuth
...,...,...,...,...
33966,wwwtextbooksruscom,0138569231,multimedia communications protocols and applic...,franklin f kuowolfgang effelsbergj j garcia lu...
33967,opoe abe books,0138569231,multimedia communications,kuo
33968,textbookxdotcom,0138569231,multimedia communications protocols and applic...,wolfgang effelsberg
33969,paperbackworldde,0138569231,multimedia communications protocols and applic...,frank kuo j j luna


## Indexing

In [16]:
import recordlinkage as rl
from recordlinkage.index import Block

print('Build index...')
indexer_isbn = rl.Index()
indexer_isbn.add(Block('ISBN'))
candidate_links_isbn = indexer_isbn.index(book_dataset)

Build index...


In [17]:
print (len(book_dataset),'records in dataset', len(candidate_links_isbn), 'candidate links in dataset')

33971 records in dataset 628120 candidate links in dataset


In [18]:
book_dataset.loc[candidate_links_isbn[0], "ISBN"]

1    0201853949
0    0201853949
Name: ISBN, dtype: object

In [19]:
book_dataset.loc[candidate_links_isbn[60000], "ISBN"]

2859    0321119045
2854    0321119045
Name: ISBN, dtype: object

## Comparison

In [20]:
# This cell can take some time to compute.
print('Start comparing...')
compare_cl = rl.Compare()

compare_cl.string('Title', 'Title', method='jarowinkler', threshold=0.85, label='Title')
compare_cl.string('Author', 'Author', method='jarowinkler', threshold=0.85, label='Author')

features = compare_cl.compute(candidate_links_isbn, book_dataset)
print('feature shape', features.shape)

Start comparing...
feature shape (628120, 2)


In [21]:
# Sum the comparison results.
features.sum(axis=1).value_counts().sort_index(ascending=False)

2.0    278796
1.0    302016
0.0     47308
dtype: int64

In [22]:
matches = features[features.sum(axis=1) == 2]

print(len(matches))
matches.head(10)

278796


Unnamed: 0,Unnamed: 1,Title,Author
3,1,1.0,1.0
4,1,1.0,1.0
6,1,1.0,1.0
6,4,1.0,1.0
7,1,1.0,1.0
7,3,1.0,1.0
9,8,1.0,1.0
10,1,1.0,1.0
10,3,1.0,1.0
10,7,1.0,1.0


In [23]:
book_dataset.iloc[3,:]

Website                                              a1books
ISBN                                              0201853949
Title      the art of computer programming volume 4 fasci...
Author                                        knuth donald e
Name: 3, dtype: object

In [24]:
book_dataset.iloc[1,:]

Website                                             indoocom
ISBN                                              0201853949
Title      art of computer programming volume 4 fascicle ...
Author                                        knuth donald e
Name: 1, dtype: object

## Classification

In [25]:
# Initialise the Expectation-Conditional Maximisation classifier.
cl = rl.ECMClassifier()
cl.fit(features)

In [26]:
# Print the parameters that are trained (m, u and p).
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
print("log weights of features:", cl.log_weights)
print("weights of features:", cl.weights)

p probability P(Match): 0.40892784747616334
m probabilities P(x_i=1|Match): {'Title': {0.0: 0.10885126155617526, 1.0: 0.891148738443825}, 'Author': {0.0: 0.1266087458120186, 1.0: 0.8733912541879818}}
u probabilities P(x_i=1|Non-Match): {'Title': {0.0: 0.15776344757671365, 1.0: 0.8422365524232858}, 'Author': {0.0: 0.7476637504773161, 1.0: 0.25233624952268324}}
log m probabilities P(x_i=1|Match): {'Title': {0.0: -2.21777290150953, 1.0: -0.11524393116291698}, 'Author': {0.0: -2.0666536894146965, 1.0: -0.13537165150499142}}
log u probabilities P(x_i=1|Non-Match): {'Title': {0.0: -1.846658535065922, 1.0: -0.1716943630760852}, 'Author': {0.0: -0.29080193352464967, 1.0: -1.376992757386514}}
log weights of features: {'Title': {0.0: -0.3711143664436083, 1.0: 0.05645043191316823}, 'Author': {0.0: -1.7758517558900468, 1.0: 1.2416211058815225}}
weights of features: {'Title': {0.0: 0.6899650281998656, 1.0: 1.0580741667881892}, 'Author': {0.0: 0.16933915243475467, 1.0: 3.46121992317822}}


In [27]:
# evaluate the model
links = cl.predict(features)
print("Predicted number of links:", len(links))

Predicted number of links: 318019


In [28]:
features

Unnamed: 0,Unnamed: 1,Title,Author
1,0,0.0,0.0
2,0,1.0,0.0
2,1,0.0,0.0
3,0,1.0,0.0
3,1,1.0,1.0
...,...,...,...
33970,33965,1.0,1.0
33970,33966,1.0,1.0
33970,33967,1.0,0.0
33970,33968,1.0,0.0


In [29]:
with open('book_silver.txt') as f:
    lines = f.readlines()

In [30]:
lines[1]

'9781558608597\tjohn c strassner\n'

In [31]:
book_silver = pd.read_csv('book_silver.txt', sep='\t', names=['ISBN', 'Author'], header=None, na_values="NaN", dtype=str)

In [32]:
book_silver

Unnamed: 0,ISBN,Author
0,9780321356413,andrew conry-murray; vincent weafer
1,9781558608597,john c strassner
2,9780201325812,alan dickman
3,9781558608306,ashley friedlein
4,9781555582906,michael erbschloe
...,...,...
1260,9780131452541,steve cunningham
1261,9780130166388,kevin chu; eric brower
1262,9781576761359,felke
1263,9780072847154,john hirschbuhl


In [36]:
s11 = pd.Series(list(map(tuple, book_dataset[['ISBN', 'Author']].values.tolist())))
s12 = pd.Series(list(map(tuple, book_silver[['ISBN', 'Author']].values.tolist())))
book_dataset.loc[~s11.isin(s12), 'Author'] = 'Mismatch'
print(book_silver)

               ISBN           Author
0     9780321356413         Mismatch
1     9781558608597         Mismatch
2     9780201325812         Mismatch
3     9781558608306         Mismatch
4     9781555582906         Mismatch
...             ...              ...
1260  9780131452541         Mismatch
1261  9780130166388         Mismatch
1262  9781576761359         Mismatch
1263  9780072847154  john hirschbuhl
1264  9781558603547         Mismatch

[1265 rows x 2 columns]


In [37]:
book_silver.groupby('Author').count()

Unnamed: 0_level_0,ISBN
Author,Unnamed: 1_level_1
Mismatch,1187
alan simpson,1
anil desai,2
barrie sosinsky,1
behrouz a forouzan,1
brian culp,1
carey holzman,1
carol yacht,1
carole tobias,2
colin smith,1
