# Progetto Architetture Dati

In [217]:
import pandas as pd
import numpy as np
import math

In [218]:
# Keep the raw text lines around so individual records can be inspected by hand.
with open('book.txt') as f:
    lines = list(f)

In [219]:
# Peek at the second raw line to see the tab-separated field layout.
lines[1]

'Indoo.com\t0201853949\tArt of Computer Programming, Volume 4, Fascicle 3, The: Generating All Combinations and Partitions\tKnuth, Donald E.\n'

In [220]:
# Parse the tab-separated file into a DataFrame. The column names are supplied
# here (header=None), and every field is read as text (dtype=str), which keeps
# ISBNs such as 0201853949 with their leading zero.
book_dataset = pd.read_csv(
    'book.txt',
    sep='\t',
    header=None,
    names=['Website', 'ISBN', 'Title', 'Author'],
    dtype=str,
    na_values="NaN",
)

In [221]:
# Display the parsed frame (pandas truncates the middle rows).
book_dataset

Unnamed: 0,Website,ISBN,Title,Author
0,eCampus.com,0201853949,"The art Of Computer Programming, Fascicle 3: G...",Not Available
1,Indoo.com,0201853949,"Art of Computer Programming, Volume 4, Fascicl...","Knuth, Donald E."
2,textbookxdotcom,0201853949,"The 'art Of Computer Programming, Fascicle 3 G...",
3,A1Books,0201853949,"The Art of Computer Programming, Volume 4, Fas...","Knuth, Donald E."
4,textbooksNow,0201853949,Art of Computer Programming,Knuth
...,...,...,...,...
33966,www.textbooksrus.com,0138569231,Multimedia Communications: Protocols and Appli...,Franklin F. Kuo|Wolfgang Effelsberg|J. J. Garc...
33967,OPOE-ABE Books,0138569231,MULTIMEDIA COMMUNICATIONS,KUO
33968,textbookxdotcom,0138569231,Multimedia Communications Protocols and Applic...,Wolfgang Effelsberg
33969,paperbackworld.de,0138569231,Multimedia Communications: Protocols and Appli...,"Frank Kuo, J. J. Luna"


In [222]:
# Check the column dtypes right after loading.
book_dataset.dtypes

Website    object
ISBN       object
Title      object
Author     object
dtype: object

In [223]:
# No cast is needed here: every column was already read as text (dtype=str in
# read_csv). Calling .astype(str) would turn each missing value into the literal
# string 'nan', which isna() no longer detects and which would later be compared
# as if it were a real author name.
book_dataset.dtypes

Website    object
ISBN       object
Title      object
Author     object
dtype: object

In [224]:
# Treat the placeholder text 'Not Available' as a real missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0; the result is
# reassigned rather than mutated in place so the cell is safe to re-run.
book_dataset = book_dataset.replace('Not Available', np.nan)

In [225]:
# Row 0's Author was 'Not Available' in the raw data; check what type that cell
# holds after the replacement.
type(book_dataset.iloc[0,3])

float

In [226]:
# Boolean mask marking which cells are missing.
book_dataset.isna()

Unnamed: 0,Website,ISBN,Title,Author
0,False,False,False,True
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
33966,False,False,False,False
33967,False,False,False,False
33968,False,False,False,False
33969,False,False,False,False


In [227]:
# Collect every record that has at least one missing field.
na_rows = book_dataset.loc[book_dataset.isna().any(axis="columns")]
na_rows

Unnamed: 0,Website,ISBN,Title,Author
0,eCampus.com,201853949,"The art Of Computer Programming, Fascicle 3: G...",
106,eCampus.com,201853930,"The Art Of Computer Programming, Fascicle 2: G...",
1442,brandnewtextbooks,321268458,Database Systems: An Application Oriented Appr...,
1443,brandnewtextbooks,321268458,Database Systems: An Application Oriented Appr...,
3786,eCampus.com,131498622,Spring Into PHP 5,
3787,eCampus.com,131498622,Spring Into PHP 5,
3797,Books2Anywhere.com,131498622,Spring Into Php 5,
3800,Books2Anywhere.com,131498622,Spring Into Php 5,
3802,Paperbackshop-US,131498622,Spring Into Php 5,
3803,DVD Legacy,131498622,Spring Into PHP 5,


## Preprocessing dati

In [228]:
# Show the data as it stands at the start of preprocessing.
book_dataset

Unnamed: 0,Website,ISBN,Title,Author
0,eCampus.com,0201853949,"The art Of Computer Programming, Fascicle 3: G...",
1,Indoo.com,0201853949,"Art of Computer Programming, Volume 4, Fascicl...","Knuth, Donald E."
2,textbookxdotcom,0201853949,"The 'art Of Computer Programming, Fascicle 3 G...",
3,A1Books,0201853949,"The Art of Computer Programming, Volume 4, Fas...","Knuth, Donald E."
4,textbooksNow,0201853949,Art of Computer Programming,Knuth
...,...,...,...,...
33966,www.textbooksrus.com,0138569231,Multimedia Communications: Protocols and Appli...,Franklin F. Kuo|Wolfgang Effelsberg|J. J. Garc...
33967,OPOE-ABE Books,0138569231,MULTIMEDIA COMMUNICATIONS,KUO
33968,textbookxdotcom,0138569231,Multimedia Communications Protocols and Applic...,Wolfgang Effelsberg
33969,paperbackworld.de,0138569231,Multimedia Communications: Protocols and Appli...,"Frank Kuo, J. J. Luna"


In [229]:
from recordlinkage.preprocessing import clean

# Normalise the text of every column with recordlinkage's clean(), applied
# column by column.
# NOTE(review): with the default settings, '|' separators between author names are
# removed without inserting a space (e.g. "kuowolfgang" in the output) — confirm
# whether that is acceptable for the Author comparison.
book_dataset = book_dataset.apply(clean)

In [230]:
# Inspect the frame after the cleaning step.
book_dataset

Unnamed: 0,Website,ISBN,Title,Author
0,ecampuscom,0201853949,the art of computer programming fascicle 3 gen...,
1,indoocom,0201853949,art of computer programming volume 4 fascicle ...,knuth donald e
2,textbookxdotcom,0201853949,the art of computer programming fascicle 3 gen...,
3,a1books,0201853949,the art of computer programming volume 4 fasci...,knuth donald e
4,textbooksnow,0201853949,art of computer programming,knuth
...,...,...,...,...
33966,wwwtextbooksruscom,0138569231,multimedia communications protocols and applic...,franklin f kuowolfgang effelsbergj j garcia lu...
33967,opoe abe books,0138569231,multimedia communications,kuo
33968,textbookxdotcom,0138569231,multimedia communications protocols and applic...,wolfgang effelsberg
33969,paperbackworldde,0138569231,multimedia communications protocols and applic...,frank kuo j j luna


## Indexing

In [237]:
import recordlinkage as rl
from recordlinkage.index import Block

# Blocking on ISBN: only records that share the same ISBN become candidate pairs.
indexer_isbn = rl.Index()
indexer_isbn.add(Block(left_on='ISBN'))

# Passing a single frame builds pairs within book_dataset itself.
candidate_links_isbn = indexer_isbn.index(book_dataset)

In [238]:
# Number of records, then number of candidate pairs.
print(f"{len(book_dataset)} {len(candidate_links_isbn)}")

33971 628120


In [264]:
# Spot-check the first candidate pair: both records should carry the same ISBN.
book_dataset.loc[list(candidate_links_isbn[0]), "ISBN"]

1    0201853949
0    0201853949
Name: ISBN, dtype: object

In [245]:
# Spot-check another candidate pair further into the index.
book_dataset.loc[list(candidate_links_isbn[60000]), "ISBN"]

2859    0321119045
2854    0321119045
Name: ISBN, dtype: object

## Comparison

In [269]:
# Slow cell: every candidate pair is compared on both text fields.
compare_cl = rl.Compare()

# Jaro-Winkler similarity, binarised at 0.85, for each field; the feature column
# is labelled with the field name.
for field in ('Title', 'Author'):
    compare_cl.string(field, field, method='jarowinkler', threshold=0.85, label=field)

features = compare_cl.compute(candidate_links_isbn, book_dataset)

In [272]:
# Expect one row per candidate pair and one column per compared field.
features.shape

(628120, 2)

In [273]:
# How many candidate pairs agree on 2, 1 or 0 fields (highest score first).
(
    features
    .sum(axis="columns")
    .value_counts()
    .sort_index(ascending=False)
)

2.0    275389
1.0    305050
0.0     47681
dtype: int64

In [274]:
# Keep only the candidate pairs whose summed score exceeds 1, i.e. both Title and
# Author agreed.
matches = features.loc[features.sum(axis="columns") > 1]

print(matches.shape[0])
matches.head(10)

275389


Unnamed: 0,Unnamed: 1,Title,Author
3,1,1.0,1.0
4,1,1.0,1.0
6,1,1.0,1.0
6,4,1.0,1.0
7,1,1.0,1.0
7,3,1.0,1.0
9,8,1.0,1.0
10,1,1.0,1.0
10,3,1.0,1.0
10,7,1.0,1.0


In [276]:
# Look at record 3, one side of the matched pair (3, 1).
book_dataset.iloc[3]

Website                                              a1books
ISBN                                              0201853949
Title      the art of computer programming volume 4 fasci...
Author                                        knuth donald e
Name: 3, dtype: object

In [277]:
# Look at record 1, the other side of the matched pair (3, 1).
book_dataset.iloc[1]

Website                                             indoocom
ISBN                                              0201853949
Title      art of computer programming volume 4 fascicle ...
Author                                        knuth donald e
Name: 1, dtype: object

## Classification

In [278]:
# Initialise the K-means classifier and fit it on the comparison vectors
# (no labels are passed to fit()).
cl = rl.KMeansClassifier()
cl.fit(features)

In [279]:
# Predict which candidate pairs are links and report how many were found.
links = cl.predict(features)
print("Predicted number of links:", len(links))

Predicted number of links: 314232


In [296]:
# Display the comparison vectors, indexed by candidate record pair.
features

Unnamed: 0,Unnamed: 1,Title,Author
1,0,0.0,0.0
2,0,1.0,0.0
2,1,0.0,0.0
3,0,1.0,0.0
3,1,1.0,1.0
...,...,...,...
33970,33965,1.0,0.0
33970,33966,1.0,1.0
33970,33967,1.0,0.0
33970,33968,1.0,0.0


In [304]:
# Classify a single candidate pair. predict() takes a DataFrame of comparison
# vectors, so the row is selected with a list indexer (iloc[[2]]), which keeps it
# as a one-row DataFrame together with its pair index. The previous
# array.reshape(...) call failed because the `array` module has no reshape().
cl.predict(features.iloc[[2]])