In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import database as db
import request_category as rc



In [2]:
## Original
def get_data(*categories, all = False):
    queries = []
    for category in categories:
        cat_query = db.query_pages_by_category( category)
        
        queries.append( cat_query)
    if len( categories) > 1:
        pages_query = """SELECT b.category, b.subcategory, b.title, b.pageid, b.article
                 FROM (({}) UNION ({}) ) as b;""".format( queries[0], queries[1])
    else: 
        pages_query = cat_query + ";"
        
    #ml_query = db.query_pages_by_category( categories[0])  # 'machine learning'
    #bs_query = db.query_pages_by_category( categories[1])   # 'business software'
    #pages_query = """SELECT b.category, b.subcategory, b.title, b.pageid, b.article
    #             FROM (({}) UNION ({}) ) as b;""".format( ml_query, bs_query)

    pages_df = db.query_to_dataframe( pages_query)
    
    empty_mask =  pages_df.article == ''
    nonempty_pages_df = pages_df[~empty_mask].reset_index(drop = True).copy()
    nonempty_pages_df.index = nonempty_pages_df.pageid
    nonempty_pages_df.drop( ['pageid'], axis = 1, inplace = True)
    

    return nonempty_pages_df

In [53]:
## New
## Set unique = True for Recommendation, False for Classification
def get_data(*categories, unique = False):  
    queries = []
    for category in categories:
        cat_query = db.query_pages_by_category( category)
        
        queries.append( cat_query)
    if len( categories) > 1:  ## Only works for n_categories = 2 
        pages_query = """SELECT b.category, b.subcategory, b.title, b.pageid, b.article
                 FROM (({}) UNION ({}) ) as b;""".format( queries[0], queries[1])
    else: 
        pages_query = cat_query + ";"
        
    #ml_query = db.query_pages_by_category( categories[0])  # 'machine learning'
    #bs_query = db.query_pages_by_category( categories[1])   # 'business software'
    #pages_query = """SELECT b.category, b.subcategory, b.title, b.pageid, b.article
    #             FROM (({}) UNION ({}) ) as b;""".format( ml_query, bs_query)

    pages_df = db.query_to_dataframe( pages_query)
    
    empty_mask =  pages_df.article == ''
    nonempty_pages_df = pages_df[~empty_mask].reset_index(drop = True).copy()
    nonempty_pages_df.index = nonempty_pages_df.pageid
    
    if unique:
        nonempty_pages_df.drop_duplicates(subset = ['pageid', 'title'],inplace = True)
        nonempty_pages_df.reset_index( drop = True, inplace= True) 
    
    nonempty_pages_df.drop( ['pageid'], axis = 1, inplace = True)
    
    
    

    return nonempty_pages_df

In [55]:
## New Trial, using fit for initial pipe
## Success

class Recommend1():
    def __init__(self, min_df=.05, max_df=.90, ngram_range=(1,3), n_components=100):
        self.min_df = min_df
        self.max_df = max_df
        self.ngram_range = ngram_range
        self.n_components = n_components
        self.component_names = ["component_" + str(i+1) for i in range(self.n_components)]
        self.pipe = None
        self.lsa_df = None #pd.DataFrame()
        
    def fit(self, X, Y=None):
        self.pipe = Pipeline([
            ('encoder', TfidfVectorizer( min_df = self.min_df, max_df = self.max_df, 
                                        ngram_range = self.ngram_range, stop_words = 'english')),
            ('truncator',TruncatedSVD( n_components=self.n_components) ),
        ])
        #component_names = ["component_" + str(i+1) for i in range(self.n_components)]
        self.lsa_df = pd.DataFrame( self.pipe.fit_transform(X.article), index=X.index, columns=self.component_names)
        self.lsa_df = X[['category','subcategory', 'title','article']].merge( \
                        self.lsa_df, how = 'outer', copy = True,
                        left_index = True, right_index = True, suffixes = ('', ''))
        return self

    def transform(self, X, Y=None): 
        #component_names = ["component_" + str(i+1) for i in range(self.n_components)]
        if Y is not None:
            search_doc, pageid = rc.get_article(Y)  ## Y = 'Saffron Technology'

        else:
            pageid = X.index[1]
            search_doc = X.article[pageid]
            
        article_lsa_df = pd.DataFrame( self.pipe.transform( [search_doc]), index=[int(pageid)],columns=self.component_names )
        search_cosine_df = pd.DataFrame( cosine_similarity( self.lsa_df.drop(['category','subcategory','title','article'], axis = 1), article_lsa_df ), columns = ['cosine'], index = self.lsa_df.index)
        search_cosine_df = X[['category', 'subcategory', 'title']] \
            .merge( search_cosine_df, how = 'outer', 
                   left_index = True, right_index = True, copy = True, suffixes = ('', '') )
        search_cosine_df.index = search_cosine_df.title
        search_cosine_df.drop( ['title'], axis = 1, inplace = True)
        return search_cosine_df.sort_values(by = 'cosine', ascending=False)[0:5]
            
    def fit_transform(self, X, Y=None):
        self.fit(X, Y)
        return self.transform(X, Y)

In [54]:
nonempty_pages_df = get_data('machine learning', 'business software', unique = True) 
nonempty_pages_df.shape

(3791, 4)

In [56]:
rec1 = Recommend1(n_components = 700)

In [41]:
article = 'Saffron Technology'
start = datetime.now()
rec1.fit_transform(nonempty_pages_df, article)
timeduration = round((datetime.now()-start).seconds/60,2) ## minutes
print( timeduration)

0.38


In [57]:
start = datetime.now()
rec1.fit(nonempty_pages_df)
timeduration = round((datetime.now()-start).seconds/60,2) ## minutes
print( timeduration)

0.35


In [58]:
article = 'Saffron Technology'
rec1.transform( nonempty_pages_df, article)

Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
autoassociative memory,machine learning,Artificial neural networks,0.509337
sisense,business software,business software,0.435828
semantic research,business software,Business software companies,0.404608
datawatch corporation,business software,business software,0.401898
genetic memory computer science,machine learning,Genetic algorithms,0.379453


In [59]:
article = 'Machine learning'
rec1.transform( nonempty_pages_df, article)

Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
machine learning,machine learning,machine learning,0.999944
quantum machine learning,machine learning,machine learning,0.872749
outline of machine learning,machine learning,machine learning,0.865648
adversarial machine learning,machine learning,machine learning,0.809982
meta learning computer science,machine learning,machine learning,0.766387


In [60]:
article = 'Neuron'
rec1.transform( nonempty_pages_df, article)

Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
neural backpropagation,machine learning,Artificial neural networks,0.515898
nervous system network models,machine learning,Artificial neural networks,0.414309
gene expression programming,machine learning,Gene expression programming,0.32971
artificial neuron,machine learning,Artificial neural networks,0.31688
julia programming language,machine learning,Data mining and machine learning software,0.303413


In [62]:
article = 'Microsoft'
rec1.transform( nonempty_pages_df, article)

Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
microsoft,business software,Business software companies,0.999983
microsoft dynamics crm,business software,Customer relationship management software,0.789586
microsoft dynamics erp,business software,business software,0.767598
microsoft dynamics,business software,business software,0.767431
microsoft small business financials,business software,Accounting software,0.68174


In [63]:
article = 'brain'
rec1.transform( nonempty_pages_df, article)

Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hierarchical temporal memory,machine learning,Deep learning,0.42763
nervous system network models,machine learning,Artificial neural networks,0.411222
activity recognition,machine learning,Applied machine learning,0.372448
types of artificial neural networks,machine learning,Classification algorithms,0.365481
convolutional neural network,machine learning,Artificial neural networks,0.361711


In [26]:
article = 'brain'

start = datetime.now()
article_recs = rec.transform( nonempty_pages_df, article)

timeduration = round((datetime.now()-start).seconds/60,2) ## minutes
print( timeduration)

article_recs

0.0


Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hierarchical temporal memory,machine learning,Semisupervised learning,0.422231
hierarchical temporal memory,machine learning,Semisupervised learning,0.422231
hierarchical temporal memory,machine learning,Unsupervised learning,0.422231
hierarchical temporal memory,machine learning,Semisupervised learning,0.422231
hierarchical temporal memory,machine learning,Unsupervised learning,0.422231


In [None]:
start = datetime.now()
rand_recs = rec.fit_transform(nonempty_pages_df)

timeduration = round((datetime.now()-start).seconds/60,2) ## minutes
print( timeduration)

rand_recs#.head()

In [None]:
article = 'Saffron Technology'

start = datetime.now()
article_recs = rec.transform( nonempty_pages_df, article)

timeduration = round((datetime.now()-start).seconds/60,2) ## minutes
print( timeduration)

article_recs

In [3]:
## My Attempt

class Recommend():
    
    def __init__(self, min_df=.05, max_df=.90, ngram_range=(1,3), n_components=100):
        self.min_df = min_df
        self.max_df = max_df
        self.ngram_range = ngram_range
        self.n_components = n_components
    
    def fit(self, X, Y):
        
        
        return self

    def transform(self, X, Y=None):
        pipe = Pipeline([
            ('encoder', TfidfVectorizer( min_df = self.min_df, max_df = self.max_df, 
                                        ngram_range = self.ngram_range, stop_words = 'english')),
            ('truncator',TruncatedSVD( n_components=self.n_components) ),
        ])
        
        component_names = ["component_" + str(i+1) for i in range(self.n_components)]
        
        lsa_df = pd.DataFrame( pipe.fit_transform(X.article), index=X.index, columns=component_names)

        LSA_df = X[['category','subcategory', 'title','article']].merge( \
                        lsa_df, how = 'outer', copy = True,
                        left_index = True, right_index = True, suffixes = ('', ''))
        
        if Y is not None:
            search_doc, pageid = rc.get_article(Y)  ## Y = 'Saffron Technology'

        else:
            pageid = X.index[1]
            search_doc = X.article[pageid]
            
        article_lsa_df = pd.DataFrame( pipe.transform( [search_doc]), index=[int(pageid)],columns=component_names )
        search_cosine_df = pd.DataFrame( cosine_similarity( LSA_df.drop(['category','subcategory','title','article'], axis = 1), article_lsa_df ), columns = ['cosine'], index = LSA_df.index)
        search_cosine_df = X[['category', 'subcategory', 'title']] \
            .merge( search_cosine_df, how = 'outer', 
                   left_index = True, right_index = True, copy = True, suffixes = ('', '') )
        search_cosine_df.index = search_cosine_df.title
        search_cosine_df.drop( ['title'], axis = 1, inplace = True)
        return search_cosine_df.sort_values(by = 'cosine', ascending=False)[0:5]
            
    def fit_transform(self, X, Y=None):
        self.fit(X, Y)
        return self.transform(X, Y)

In [4]:
nonempty_pages_df = get_data('machine learning', 'business software') 
nonempty_pages_df.shape

(4582, 4)

In [5]:
#nonempty_pages_df.article[ nonempty_pages_df.index[1]]
nonempty_pages_df.title[ nonempty_pages_df.index[1]]

'constant contact'

In [6]:
rec = Recommend(n_components = 700)

## Fit_Transform

In [7]:
start = datetime.now()
rand_recs = rec.fit_transform(nonempty_pages_df)

timeduration = round((datetime.now()-start).seconds/60,2) ## minutes
print( timeduration)

rand_recs#.head()

0.38


Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
constant contact,business software,Customer relationship management software,0.999947
lithium technologies,business software,business software,0.552961
conversocial,business software,business software,0.529392
buckaroocom,business software,business software,0.482412
hubspot,business software,Marketing software,0.469031


## Transform

In [8]:
article = 'Saffron Technology'

start = datetime.now()
article_recs = rec.transform( nonempty_pages_df, article)

timeduration = round((datetime.now()-start).seconds/60,2) ## minutes
print( timeduration)

article_recs

0.38


Unnamed: 0_level_0,category,subcategory,cosine
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
autoassociative memory,machine learning,Artificial neural networks,0.50825
sisense,business software,business software,0.434608
semantic research,business software,Business software companies,0.407951
datawatch corporation,business software,business software,0.398055
genetic memory computer science,machine learning,Genetic algorithms,0.390902


In [None]:
class TemplateClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, *categories, min_df=.05, max_df=.90, ngram_range=(1,3), n_components=100):
        #self.categories = categories
        self.min_df = min_df
        self.max_df = max_df
        self.ngram_range = ngram_range
        self.n_components = n_components

    def fit(self, X, y):
        
        # Check that X and y have correct shape
        
        
        
        X, y = check_X_y(X, y)
        
        
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
        # Return the classifier
        return self

    def predict(self, X):

        # Check is fit had been called
        check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        X = check_array(X)

        closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
        return self.y_[closest]

In [None]:
## Original - https://stackoverflow.com/questions/43232506/using-pipeline-with-custom-classes-in-sklearn

class MyFeatureSelector():
    def __init__(self, features=5, method='pca'):
        self.features = features
        self.method = method
        self.selector = None
        self.init_selector()

    def init_selector():
        if self.method == 'pca':
            self.selector = PCA(n_components=self.features)
        elif self.method == 'rfe':
        self.selector = RFE(estimator=LinearRegression(n_jobs=-1),
                               n_features_to_select=self.features,
                               step=1)
    def fit(self, X, Y):
        return self

    def transform(self, X, Y=None):
        try:
            if self.features < X.shape[1]:
                if Y is not None:
                    self.selector.fit(X, Y)
                return selector.transform(X)
        except Exception as err:
            print('MyFeatureSelector.transform(): {}'.format(err))
        return X

    def fit_transform(self, X, Y=None):
        self.fit(X, Y)
        return self.transform(X, Y)

In [None]:
## Original http://scikit-learn.org/dev/developers/contributing.html#rolling-your-own-estimator

class TemplateClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, demo_param='demo'):
        self.demo_param = demo_param

    def fit(self, X, y):

        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
        # Return the classifier
        return self

    def predict(self, X):

        # Check is fit had been called
        check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        X = check_array(X)

        closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
        return self.y_[closest]