In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors.nearest_centroid import NearestCentroid
import time
from sys import getsizeof

%load_ext sql

import pandas.io.sql as sqlio
import psycopg2
from nltk.corpus import stopwords

from sklearn.model_selection import KFold

conn = psycopg2.connect(host="localhost", port=5432, dbname="thegoldtree", user="postgres", password="postgres")

In [3]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_array, check_X_y, check_is_fitted
from sklearn.utils.sparsefuncs import csc_median_axis_0
from sklearn.utils.multiclass import check_classification_targets

# This is a modified version of NearestCentroid from sklearn lib
class NearestCentroid(BaseEstimator, ClassifierMixin):

    def __init__(self, metric='euclidean', shrink_threshold=None):
        self.metric = metric
        self.shrink_threshold = shrink_threshold

    def fit(self, X, y):

        if self.metric == 'precomputed':
            raise ValueError("Precomputed is not supported.")
        # If X is sparse and the metric is "manhattan", store it in a csc
        # format is easier to calculate the median.
        if self.metric == 'manhattan':
            X, y = check_X_y(X, y, ['csc'])
        else:
            X, y = check_X_y(X, y, ['csr', 'csc'])
        is_X_sparse = sp.issparse(X)
        if is_X_sparse and self.shrink_threshold:
            raise ValueError("threshold shrinking not supported"
                             " for sparse input")
        check_classification_targets(y)

        n_samples, n_features = X.shape
        le = LabelEncoder()
        y_ind = le.fit_transform(y)
        self.classes_ = classes = le.classes_
        n_classes = classes.size
        if n_classes < 2:
            raise ValueError('The number of classes has to be greater than'
                             ' one; got %d class' % (n_classes))

        # Mask mapping each class to its members.
        self.centroids_ = sp.lil_matrix((n_classes, n_features), dtype=np.float64)
        # Number of clusters in each class.
        nk = np.zeros(n_classes)

        for cur_class in range(n_classes):
            center_mask = y_ind == cur_class
            nk[cur_class] = np.sum(center_mask)
            if is_X_sparse:
                center_mask = np.where(center_mask)[0]

            # XXX: Update other averaging methods according to the metrics.
            if self.metric == "manhattan":
                # NumPy does not calculate median of sparse matrices.
                if not is_X_sparse:
                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
                else:
                    self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
            else:
                if self.metric != 'euclidean':
                    warnings.warn("Averaging for metrics other than "
                                  "euclidean and manhattan not supported. "
                                  "The average is set to be the mean."
                                  )
                self.centroids_[cur_class] = X[center_mask].mean(axis=0)

        if self.shrink_threshold:
            dataset_centroid_ = np.mean(X, axis=0)

            # m parameter for determining deviation
            m = np.sqrt((1. / nk) - (1. / n_samples))
            # Calculate deviation using the standard deviation of centroids.
            variance = (X - self.centroids_[y_ind]) ** 2
            variance = variance.sum(axis=0)
            s = np.sqrt(variance / (n_samples - n_classes))
            s += np.median(s)  # To deter outliers from affecting the results.
            mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
            ms = mm * s
            deviation = ((self.centroids_ - dataset_centroid_) / ms)
            # Soft thresholding: if the deviation crosses 0 during shrinking,
            # it becomes zero.
            signs = np.sign(deviation)
            deviation = (np.abs(deviation) - self.shrink_threshold)
            np.clip(deviation, 0, None, out=deviation)
            deviation *= signs
            # Now adjust the centroids using the deviation
            msd = ms * deviation
            self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
        return self

    def predict(self, X):

        check_is_fitted(self, 'centroids_')

        X = check_array(X, accept_sparse='csr')
        
        return np.argsort(pairwise_distances(X, self.centroids_, metric=self.metric))[:10]


In [4]:
sql = "select title, abstract, id_advisor, id, id_author from relationship;"
docs = sqlio.read_sql_query(sql, conn)
docs['title'] = docs['title'].apply(lambda x: x if isinstance(x, str) else '')
docs['abstract'] = docs['abstract'].apply(lambda x: x if isinstance(x, str) else '')
print("table relationship (just title and abstract) in pandas:")
print(getsizeof(docs['title'] + ' ' + docs['abstract']))

table relationship (just title and abstract) in pandas:
1229067454


In [3]:
sql = "select * from researcher;"
researcher = sqlio.read_sql_query(sql, conn)
print("table researcher in pandas:")
print(getsizeof(researcher))

table researcher in pandas:
115337586


In [5]:
vectorizer = pickle.load(open('vectorizer', 'rb'))
print("Vectorizer:")
print(getsizeof(vectorizer))
X_doc_vect_representation = pickle.load(open('X_doc_vect_representation', 'rb'))
print("X_doc_vect_representation:")
print(getsizeof(X_doc_vect_representation))
y_doc_vect_representation = pickle.load(open('y_doc_vect_representation', 'rb'))
print("y_doc_vect_representation:")
print(getsizeof(y_doc_vect_representation))
print(y_doc_vect_representation[1][400])

Vectorizer:
56
X_doc_vect_representation:
56
y_doc_vect_representation:
104
[580937]


In [5]:
train_y_doc_vect_representation = pickle.load(open('folds/fold1_train_y_doc_vect_representation', 'rb'))
print("train_y_doc_vect_representation:")
print(getsizeof(train_y_doc_vect_representation))

train_X_doc_vect_representation = pickle.load(open('folds/fold1_train_X_doc_vect_representation', 'rb'))
print("train_X_doc_vect_representation:")
print(getsizeof(train_X_doc_vect_representation))

test_y_doc_vect_representation = pickle.load(open('folds/fold1_test_y_doc_vect_representation', 'rb'))
print("test_y_doc_vect_representation:")
print(getsizeof(test_y_doc_vect_representation))

test_X_doc_vect_representation = pickle.load(open('folds/fold1_test_X_doc_vect_representation', 'rb'))
print("test_X_doc_vect_representation:")
print(getsizeof(test_X_doc_vect_representation))

fold1_clf = pickle.load(open('clfs/fold1_clf', 'rb'))
print("fold1_clf:")
print(getsizeof(fold1_clf))

train_y_doc_vect_representation:
96
train_X_doc_vect_representation:
56
test_y_doc_vect_representation:
104
test_X_doc_vect_representation:
56
fold1_clf:
56


In [4]:
clf_all_data1 = pickle.load(open('clfs/clf_all_data1', 'rb'))
print(clf_all_data1.centroids_.shape)
print("clf_all_data:")
print(getsizeof(clf_all_data1))

(79653, 883149)
clf_all_data:
56


In [5]:
# print(getsizeof(clf_all_data1.centroids_[0][0]))
aux = clf_all_data1.centroids_.data
aux = aux.tolist()

centroids = sp.csr_matrix(clf_all_data1.centroids_)
classes = clf_all_data1.classes_

n = 0
s = 0
for x in aux:
    s += getsizeof(x)
    n += 1
print(s)

print(clf_all_data1.classes_[70000])

388358688
573671


In [6]:
pickle.dump(centroids, open("centroids", "wb"))
pickle.dump(classes, open("classes", "wb"))

In [2]:
centroids = pickle.load(open('centroids', 'rb'))
classes = pickle.load(open('classes', 'rb'))

In [5]:
centroids.asfptype

<bound method spmatrix.asfptype of <79653x883149 sparse matrix of type '<class 'numpy.float64'>'
	with 42507558 stored elements in LInked List format>>

In [3]:
vectorizer = pickle.load(open('vectorizer', 'rb'))

data_vect = vectorizer.transform(["Um método para deduplicação de metadados bibliográficos baseado no empilhamento de classificadores"])

from sklearn.metrics.pairwise import pairwise_distances
predict = np.argsort(pairwise_distances(data_vect, centroids, metric='euclidean'))[:10]

In [24]:
sql = "select * from researcher;"
researcher = sqlio.read_sql_query(sql, conn)
pickle.dump(researcher, open("researcher", "wb"))

In [42]:
for i in classes[predict[0]][0:15]:
    print(researcher['name'][researcher['id'] == i].values[0])

renata de matos galante
george d. c. cavalcanti
plácida leopoldina ventura amorim da costa santos
lourenildo williame barbosa leite
jorge luís machado do amaral
george darmiton da cunha cavalcanti
alberto henrique frade laender
peter hubral
cibele cecilio de faria rozenfeld
carlos eduardo santos. pires
jacob scharcanski
anne magaly de paula canuto
jose palazzo moreira de oliveira
nina sumiko tomita hirata
paulo jorge leitão adeodato


In [2]:
vectorizer = pickle.load(open('vectorizer', 'rb'))

In [54]:
centroids = pickle.load(open('centroids', 'rb'))

In [56]:
print(centroids.shape)
print((centroids[7]*100000).astype(np.uint16))
centroids_min = (centroids*100000).astype(np.uint16)

(79653, 883149)
  (0, 1)	232
  (0, 382)	117
  (0, 997)	33
  (0, 1094)	190
  (0, 1140)	51
  (0, 2046)	33
  (0, 3103)	36
  (0, 3637)	29
  (0, 4434)	41
  (0, 6323)	123
  (0, 7363)	44
  (0, 7546)	38
  (0, 8459)	149
  (0, 8758)	39
  (0, 9303)	22
  (0, 9540)	49
  (0, 9677)	62
  (0, 9701)	54
  (0, 10134)	46
  (0, 10662)	40
  (0, 11120)	75
  (0, 11890)	49
  (0, 12506)	71
  (0, 13171)	101
  (0, 13791)	114
  :	:
  (0, 854579)	48
  (0, 855112)	23
  (0, 856661)	278
  (0, 857854)	38
  (0, 857989)	50
  (0, 859497)	129
  (0, 859822)	47
  (0, 860118)	129
  (0, 860158)	40
  (0, 860161)	36
  (0, 867276)	58
  (0, 867300)	67
  (0, 876759)	22
  (0, 876775)	25
  (0, 876810)	59
  (0, 879038)	77
  (0, 879047)	37
  (0, 879378)	397
  (0, 879379)	1306
  (0, 879841)	30
  (0, 879845)	24
  (0, 879848)	34
  (0, 879876)	28
  (0, 879883)	68
  (0, 879917)	29


In [55]:

for i in range(0,80000):
    centroid = ((centroids[i]*100000).astype(np.uint32)).data
#     print(centroid)
    for j in centroid:
        if j > 65535:
            print(str(j)+" é maior")


65940 é maior
82086 é maior
77984 é maior
77984 é maior
77984 é maior


IndexError: row index (79653) out of range

In [49]:
x = np.int(77984)
print(x)
x = np.uint16(65535)
print(x)

77984
65535


In [57]:
pickle.dump(centroids_min, open("centroids_min_uint16", "wb"))

In [18]:
centroids_min.dtype

dtype('int64')

In [4]:
from sklearn.metrics.pairwise import pairwise_distances

centroids_min = pickle.load(open('centroids_min_uint16', 'rb'))

vectorizer = pickle.load(open('vectorizer', 'rb'))

data_vect = vectorizer.transform(["Um método para deduplicação de metadados bibliográficos baseado no empilhamento de classificadores"])


from sklearn.metrics.pairwise import pairwise_distances
predict = np.argsort(pairwise_distances(((data_vect*100000).astype(np.uint16)), centroids_min, metric='euclidean'))[:10]

In [5]:
classes = pickle.load(open('classes', 'rb'))
researcher = pickle.load(open('researcher', 'rb'))

for i in classes[predict[0]][0:15]:
    print(researcher['name'][researcher['id'] == i].values[0])

renata de matos galante
george d. c. cavalcanti
plácida leopoldina ventura amorim da costa santos
lourenildo williame barbosa leite
jorge luís machado do amaral
george darmiton da cunha cavalcanti
alberto henrique frade laender
peter hubral
cibele cecilio de faria rozenfeld
carlos eduardo santos. pires
jacob scharcanski
anne magaly de paula canuto
jose palazzo moreira de oliveira
nina sumiko tomita hirata
paulo jorge leitão adeodato


In [6]:
centroids_min_csc = sp.csc_matrix(centroids_min)
pickle.dump(centroids_min_csc, open("centroids_min_uint16_csc", "wb"))

In [7]:
centroids_min_csc.dtype

dtype('uint16')