# Get Makers from Database

In [24]:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['local']

subjects_collection = db['collectrium_stokstad_models_subjects']
makers = []
ids = []
n = 0

for artist in subjects_collection.find({'_type': 'Collectrium::Stokstad::Models::Artist', 'terms.0': {'$exists': True}}):
    
    if artist['approved']:
        for x in artist['terms']:
            makers.append(x['text'])
            ids.append(x['_id'])   

In [112]:
#np.save('makers.npy', makers)
makers = np.load('makers.npy')

In [113]:
print len(makers)

clean = []
for x in makers:
    clean.append(x.replace("'", '').replace("\t", '').replace(',', '').lstrip().rstrip().lower())

clean = sorted(list(set(clean)))

print len(clean)

834227
815534


# Build TireTree to Search Names

In [114]:
class TireNode:
    
    def __init__ (self):
        self.val = None
        self.mark = False
        self.pointers = {}
        
class Tire:
    
    def __init__ (self):
        self.root = TireNode()
        
    def insert(self, name):
        self.re_insert(name, self.root)
        return
    
    def re_insert(self, name, root):
        if name[0] not in root.pointers:
            newNode = TireNode()
            newNode.val = name[0]
            root.pointers[name[0]] = newNode
            self.re_insert(name, root)
        else:
            nextNode = root.pointers[name[0]]
            if(len(name[1:]) == 0):
                nextNode.mark = True
                return
            return self.re_insert(name[1:], nextNode)
        
    def search(self, name):
        if len(name) == 0:
            return False
        else:
            return self.re_search(name, self.root)
        
    def re_search(self, name, root):
        if name[0] not in root.pointers:
            return False
        else:
            nextNode = root.pointers[name[0]]
            if len(name[1:]) == 0:
                if nextNode.mark == True:
                    return True
                else:
                    return False
            else:
                return self.re_search(name[1:], nextNode)

In [115]:
def buildTree(names):
    
    tree = Tire()
    for x in names:
        tree.insert(x.split(' '))
    
    return tree

# Read Sotheby Data

In [116]:
import pandas as pd
import numpy as np
import collections
from collections import OrderedDict

newData = pd.read_csv("listings_1461025592.csv", warn_bad_lines=True, error_bad_lines=False)
oldData = pd.read_csv("sothebys_historical_data.csv", warn_bad_lines=True, error_bad_lines=False)

data = newData.append(oldData)
nullMaker = data[pd.isnull(data['Maker'])]

Skipping line 65972: expected 18 fields, saw 19

Skipping line 225573: expected 18 fields, saw 27



In [117]:
def searchTerms(sentence, tree):
    
    sentence = sentence.replace("'", '').replace("\t", '').replace(',', '').lstrip().rstrip().lower()
    terms = sentence.split(' ')
    
    for x in range(len(terms)):  
        for y in range(x, len(terms)):
            tomatch = terms[x:y+1]
            if tree.search(tomatch):
                return ' '.join(terms[x:y+1])
        
    return ''

In [118]:
tree = buildTree(clean)
makers = []

for index, row in nullMaker.iterrows():
    terms = row['Lot Title']
    try:
        maker = searchTerms(terms, tree)
        makers.append(maker)
    except:
        makers.append('')

In [119]:
nullMaker['Maker'] = makers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [120]:
noMaker = nullMaker[nullMaker['Maker'] == '']

In [121]:
len(noMaker)/float(len(data))

0.1997585931469012

# Predicate Department By Sale Name

In [299]:
import pandas as pd
import numpy as np

data = pd.read_csv("sothebys_historical_data.csv", warn_bad_lines=True, error_bad_lines=False)

category = pd.read_csv('categorized sales by sale name.csv - sale_name.csv')

category.loc[category['Category'] == 'F', 'Category'] = 'Fine Arts'
category.loc[category['Category'] == 'D', 'Category'] = 'Decorative Arts'
category.loc[category['Category'] == 'WIne', 'Category'] = 'Wine'

cates = list(set(category['Category']))
print cates

Skipping line 225573: expected 18 fields, saw 27



['Books & Manuscripts', 'Jewelry', 'Fine Arts', 'Cars', 'Decorative Arts', 'Watches', 'Stamps', 'Musical instruments', 'Wine']


In [302]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.classify.naivebayes import NaiveBayesClassifier
from sklearn import cross_validation
from nltk import classify

def crossValidation(train_features):
    cv = cross_validation.KFold(len(train_features), n_folds=10, shuffle=False, random_state=None)

    for traincv, evalcv in cv:
        classifier = NaiveBayesClassifier.train(train_features[traincv[0]:traincv[len(traincv)-1]])
        print 'accuracy: %.3f' % classify.util.accuracy(classifier, train_features[evalcv[0]:evalcv[len(evalcv)-1]])

def getFeatureDict(features, featureArray):
    featureDict = {}
    
    for x in range(len(features)):
        featureDict[features[x]] = bool(featureArray[x])
        
    return featureDict

def extractFeatures(corpus):
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 4), binary = True, decode_error='replace')
    X = vectorizer.fit_transform(corpus)
    
    return vectorizer.get_feature_names(), X.toarray()

def getTrainingset(features, featureMatrix, labels):
    featuresets = []
    
    for x in range(len(labels)):
        featuresets.append((getFeatureDict(features, featureMatrix[x]), labels[x]))
            
    return featuresets

features, featureMatrix = extractFeatures(category['Sale Name'])
labels = category['Category']
featuresets = getTrainingset(features, featureMatrix, labels)
nb_classifier = NaiveBayesClassifier.train(featuresets)
crossValidation(featuresets)

accuracy: 0.682
accuracy: 0.835
accuracy: 0.812
accuracy: 0.894
accuracy: 0.906
accuracy: 0.694
accuracy: 0.765
accuracy: 0.894
accuracy: 0.881
accuracy: 0.429


In [303]:
def getTestset(data):
    test = []
    
    features, featureMatrix = extractFeatures(data)
    for x in range(len(featureMatrix)):
        test.append((getFeatureDict(features, featureMatrix[x])))
    
    return test

test = getTestset(category['Sale Name'][:10])
# for pdist in nb_classifier.prob_classify_many(test):
#     print('%.4f %.4f' % (pdist.prob('Fine Arts'), pdist.prob('Wine')))
nb_classifier.classify_many(test)

['Musical instruments',
 'Cars',
 'Watches',
 'Watches',
 'Watches',
 'Watches',
 'Watches',
 'Watches',
 'Musical instruments',
 'Fine Arts']

In [304]:
saleNames = list(set(noMaker['Sale Name']))

In [345]:
noMaker = noMaker[pd.notnull(noMaker['Sale Name'])]
counter = collections.Counter(noMaker['Sale Name'])

In [347]:
dataDict = OrderedDict()
for x in counter.most_common():
    dataDict[x[0]] = noMaker[noMaker['Sale Name'] == x[0]]

In [352]:
name = []

for x in dataDict:
    name.append(x)

test = getTestset(name) 
result = nb_classifier.classify_many(test)

In [357]:
print counter.most_common(50)

[('Finest and Rarest Wines', 9667), ('Magnificent Jewels', 5230), ('Jewels', 3570), ('Important Jewels', 3466), ('Fine and Rare Wines, Spirits and Vintage Port', 3225), ('Magnificent Jewels and Jadeite', 2298), ('Fine Jewels', 2092), ('Fine Chinese Ceramics and Works of Art', 1913), ('Finest and Rarest Wines and Spirits: The 30th Anniversary Sale', 1681), ('Fine Chinese Ceramics & Works of Art', 1596), ('Books, Prints and Maps', 1436), ('Musical Instruments', 1360), ('Fine Jewelry', 1332), ('Arcade Jewels', 1170), ('Salvatore e Francesco Romano. Antiquari a Firenze.  A Century as Antique Dealers at Palazzo Magnani Feroni', 1086), ('The Estate of Ambassador Pamela Harriman', 1083), ('Important Watches, Wristwatches and Clocks', 1013), ('Centuries of StyleFurniture, Decorations, and Fine Art:Including the Stock-in-Trade of John WM. Martin Bookseller', 953), ('The Viennese Master Bronzes', 941), ('Centuries of Style', 933), ('Silver, Jewelry & Watches, Ceramics & Glass, Furniture, Works o

# Search Top Maker Terms

In [122]:
import pandas as pd
import numpy as np
import collections
from collections import OrderedDict

topMakers = pd.read_csv("vacabulary.csv", warn_bad_lines=True, error_bad_lines=False)

In [123]:
print len(topMakers['Term'])
makers = topMakers['Term']

clean = []
for x in makers:
    clean.append(x.replace("'", '').replace("\t", '').replace(',', '').lstrip().rstrip().lower())

clean = sorted(list(set(clean)))

print len(clean)

1950
1944


In [125]:
tree = buildTree(clean)
makers = []

for index, row in noMaker.iterrows():
    terms = row['Lot Title']
    try:
        maker = searchTerms(terms, tree)
        makers.append(maker)
    except:
        makers.append('')

In [126]:
noMaker['Maker'] = makers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [129]:
print (len(data) -len(noMaker[noMaker['Maker'] == '']))/float(len(data))

0.821570884816


In [128]:
counter = collections.Counter(noMaker[noMaker['Maker'] == '']['Lot Title'])
print counter.most_common(100)

[(nan, 41062), ('MIXED LOT', 1891), ('Diamond ring', 1662), ('DIAMOND RING', 1207), ('Sapphire and diamond ring', 465), ('Platinum and Diamond Ring', 389), ('Diamond necklace', 354), ('SAPPHIRE AND DIAMOND RING', 317), ('DIAMOND BRACELET', 317), ('DIAMOND NECKLACE', 296), ('Diamond bracelet', 290), ('Emerald and diamond ring', 278), ('Pair of diamond earrings', 202), ('EMERALD AND DIAMOND RING', 198), ('MIXED LOT CB', 182), ('A DIAMOND RING', 166), ('PLATINUM AND DIAMOND RING', 158), ('A DIAMOND SOLITAIRE RING', 149), ('PAIR OF DIAMOND PENDENT EARRINGS', 134), ('PAIR OF DIAMOND EARRINGS', 124), ('A COLLECTION OF JEWELLERY', 120), ('Napol\xc3\xa9on 1er', 115), ('Platinum and Diamond Bracelet', 115), ('JADEITE AND DIAMOND RING', 114), ('Platinum, Sapphire and Diamond Ring', 113), ('Diamond brooch', 106), ('Wilde, Oscar.', 106), ('Unsigned', 102), ('DIAMOND PENDANT', 97), ('Diamond Ring', 94), ('Mixed Lot', 92), ('DIAMOND BROOCH', 83), ('No Author', 75), ('A DIAMOND BROOCH', 74), ('Kathar

In [133]:
allData = data[pd.notnull(data['Maker'])]
allData = allData.append(noMaker)
allData = allData.append(nullMaker[nullMaker['Maker'] != ''])

print len(allData), len(data)
allData.to_csv('Sotheby_Historical_Data.csv', index = False)

896412 896412


# Look into Lot Description

In [134]:
noneMaker = noMaker[noMaker['Maker'] == '']
print len(noneMaker)

159946


In [135]:
counter = collections.Counter(noMaker[noMaker['Maker'] == '']['Lot Description'])
print counter.most_common(100)

[(nan, 78379), ('N/A\nQuantity: 1', 72), ('This lot contains 1 item(s).\nQuantity: 1', 58), ('Ch\xc3\xa2teau Duhart Milon 2009\nPauillac, 4\xc3\xa8me Cru Class\xc3\xa9\nLot 1615: Banded owc, Lot 1616: Banded owc, Lot 1617: Banded owc, Lot 1618: Banded owc, Lot 1619: Banded owc, Lot 1620: Banded owc, Lot 1621: Banded owc, Lot 1622: Banded owc, Lot 1623: Banded owc, Lot 1624: Banded owc, Lot 1625: Banded owc, Lot 1626: Banded owc, Lot 1627: Banded owc, Lot 1628: Banded owc, Lot 1629: Banded owc, Lot 1630: Banded owc, Lot 1631: Banded owc, Lot 1632: Banded owc, Lot 1633: Banded owc, Lot 1634: Banded prior to inspection\n\n63% Cabernet Sauvignon, 37% Merlot. In 2013, a total blackcurrant nose. Intense black fruit depth and class on the palate. One can see the hand of Lafite here. A wine of real gravitas. Serena Sutcliffe, MW\n\n1615:12 bts (owc)\n\n1616:12 bts (owc)\n\n1617:12 bts (owc)\n\n1618:12 bts (owc)\n\n1619:12 bts (owc)\n\n1620:12 bts (owc)\n\n1621:12 bts (owc)\n\n1622:12 bts (owc)

# Return the Exactly Matching Terms

In [None]:
class TireNode:
    
    def __init__ (self):
        self.val = None
        self.mark = False
        self.index = None
        self.pointers = {}
        
class Tire:
    
    def __init__ (self):
        self.root = TireNode()
        
    def insert(self, name, index):
        self.re_insert(name, index, self.root)
        return
    
    def re_insert(self, name, index, root):
        if name[0] not in root.pointers:
            newNode = TireNode()
            newNode.val = name[0]
            root.pointers[name[0]] = newNode
            self.re_insert(name, index, root)
        else:
            nextNode = root.pointers[name[0]]
            if(len(name[1:]) == 0):
                nextNode.mark = True
                nextNode.index = index
                return
            return self.re_insert(name[1:], index, nextNode)
        
    def search(self, name):
        if len(name) == 0:
            return False
        else:
            return self.re_search(name, self.root)
        
    def re_search(self, name, root):
        if name[0] not in root.pointers:
            return False
        else:
            nextNode = root.pointers[name[0]]
            if len(name[1:]) == 0:
                if nextNode.mark == True:
                    return True, nextNode.index
                else:
                    return False, None
            else:
                return self.re_search(name[1:], nextNode)

In [None]:
def buildTree(names):
    
    tree = Tire()
    for x in range(len(names)):
        tree.insert(names[x].split(' '), x)
    
    return tree

In [None]:
def searchTerms(sentence, tree, vacabulary):
    
    sentence = sentence.replace("'", '').replace("\t", '').replace(',', '').lstrip().rstrip().lower()
    terms = sentence.split(' ')
    
    for x in range(len(terms)):  
        for y in range(x, len(terms)):
            tomatch = terms[x:y+1]
            response, index = tree.search(tomatch)
            if response:
                return vacabulary[index]
        
    return ''

In [None]:
tree = buildTree(clean)
makers = []

for index, row in noMaker.iterrows():
    terms = row['Lot Title']
    try:
        maker = searchTerms(terms, tree, record)
        makers.append(maker)
    except:
        makers.append('')