In [None]:
import os
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

try:
    import joblib
except ImportError:  # old scikit-learn releases only shipped joblib vendored
    from sklearn.externals import joblib

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

import database as db
import request_category as rc



In [None]:
def get_data(*categories, all = False):
    '''Load every non-empty article for the given categories into a DataFrame.

    One SQL query per category is obtained from ``db.query_pages_by_category``.
    When several categories are given, all of them are combined with UNION
    (previously only the first two were used and the rest silently ignored).

    Parameters
    ----------
    *categories : str
        One or more category names.
    all : bool, keyword-only
        Currently unused; kept only so existing callers do not break.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``article`` is not the empty string, indexed by ``pageid``
        (the ``pageid`` column itself is removed).

    Raises
    ------
    ValueError
        If no category is supplied.
    '''
    if not categories:
        raise ValueError("get_data() requires at least one category name")

    queries = [db.query_pages_by_category(category) for category in categories]

    if len(queries) > 1:
        # Parenthesise each sub-query so it can be combined with UNION.
        unioned = " UNION ".join("({})".format(query) for query in queries)
        pages_query = """SELECT b.category, b.subcategory, b.title, b.pageid, b.article
                 FROM ({}) as b;""".format(unioned)
    else:
        pages_query = queries[0] + ";"

    pages_df = db.query_to_dataframe(pages_query)

    # Boolean indexing followed by set_index yields a fresh frame, so the
    # frame returned by the database helper is never mutated.
    nonempty_pages_df = pages_df[pages_df.article != ''].set_index('pageid')

    return nonempty_pages_df

In [None]:
# Load the non-empty articles of both categories, then show (rows, columns).
pages_df = get_data(
    'machine learning',
    'business software',
)
pages_df.shape

In [None]:
# Targets: subcategory names encoded as integer class codes.
encoder = LabelEncoder()
y = encoder.fit_transform(pages_df['subcategory'])

# Features: the raw article text (copied so pages_df is never touched).
X = pages_df['article'].copy()

# Seeded split so the held-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# TF-IDF over uni- and bi-grams -> 700-component truncated SVD ->
# one-vs-rest logistic regression fitted with the 'sag' solver.
lr_pipe = Pipeline(steps=[
    ('encoder', TfidfVectorizer(stop_words='english',
                                ngram_range=(1, 2),
                                min_df=3,
                                max_df=.9)),
    ('truncator', TruncatedSVD(n_components=700, random_state=42)),
    ('model', LogisticRegression(solver='sag',
                                 multi_class='ovr',  # alternative: 'multinomial'
                                 n_jobs=-1,
                                 random_state=42)),
])

# Only the regularisation strength C is searched: 7 log-spaced values, 1e-3 .. 1e3.
lr_params = {'model__C': np.logspace(-3, 3, num=7)}

# 5-fold cross-validated search (alternative splitter: StratifiedShuffleSplit(n_splits=5)).
gs_lr_pipe = GridSearchCV(estimator=lr_pipe, param_grid=lr_params, cv=5, n_jobs=-1)

gs_lr_pipe.fit(X_train, y_train)

In [None]:
# Fetch the text and page id of one article to classify.
# Other titles tried: 'Palantir Technologies', 'Saffron Technology'.
article = 'Statistics'
search_doc, pageid = rc.get_article(article)

In [None]:
article = 'Statistics'  # other titles tried: 'Palantir Technologies', 'Saffron Technology'
search_doc, pageid = rc.get_article(article)

# predict_proba gives one row per document; reshape the single row into a
# column so each class probability becomes one row of the frame.
predicted_probs = gs_lr_pipe.predict_proba([search_doc])
predicted_probs = predicted_probs.reshape(-1, 1)

probs_df = pd.DataFrame(predicted_probs, columns=['P'])

# Probability i belongs to the encoded class gs_lr_pipe.classes_[i]; decode
# those codes in one vectorised call instead of a per-row scalar
# inverse_transform (which scikit-learn rejects for non-array input).
probs_df['Category'] = encoder.inverse_transform(gs_lr_pipe.classes_)

# Most likely subcategory for this article.
encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

In [None]:
article = 'Kernel'  # other titles tried: 'Palantir Technologies', 'Saffron Technology'
search_doc, pageid = rc.get_article(article)

# predict_proba gives one row per document; reshape the single row into a
# column so each class probability becomes one row of the frame.
predicted_probs = gs_lr_pipe.predict_proba([search_doc])
predicted_probs = predicted_probs.reshape(-1, 1)

probs_df = pd.DataFrame(predicted_probs, columns=['P'])

# Probability i belongs to the encoded class gs_lr_pipe.classes_[i]; decode
# those codes in one vectorised call instead of a per-row scalar
# inverse_transform (which scikit-learn rejects for non-array input).
probs_df['Category'] = encoder.inverse_transform(gs_lr_pipe.classes_)

# Most likely subcategory for this article.
encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

In [None]:
article = 'Palantir Technologies'  # other title tried: 'Saffron Technology'
search_doc, pageid = rc.get_article(article)

# predict_proba gives one row per document; reshape the single row into a
# column so each class probability becomes one row of the frame.
predicted_probs = gs_lr_pipe.predict_proba([search_doc])
predicted_probs = predicted_probs.reshape(-1, 1)

probs_df = pd.DataFrame(predicted_probs, columns=['P'])

# Probability i belongs to the encoded class gs_lr_pipe.classes_[i]; decode
# those codes in one vectorised call instead of a per-row scalar
# inverse_transform (which scikit-learn rejects for non-array input).
probs_df['Category'] = encoder.inverse_transform(gs_lr_pipe.classes_)

# Most likely subcategory for this article.
encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

In [None]:
article = 'Saffron Technology'
search_doc, pageid = rc.get_article(article)

# predict_proba gives one row per document; reshape the single row into a
# column so each class probability becomes one row of the frame.
predicted_probs = gs_lr_pipe.predict_proba([search_doc])
predicted_probs = predicted_probs.reshape(-1, 1)

probs_df = pd.DataFrame(predicted_probs, columns=['P'])

# Probability i belongs to the encoded class gs_lr_pipe.classes_[i]; decode
# those codes in one vectorised call instead of a per-row scalar
# inverse_transform (which scikit-learn rejects for non-array input).
probs_df['Category'] = encoder.inverse_transform(gs_lr_pipe.classes_)

# Most likely subcategory for this article.
encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

In [None]:
# Decoded subcategory predicted for the article currently held in search_doc.
encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

In [None]:
# Class probabilities, most likely first.
probs_df.sort_values('P', ascending=False)

In [None]:
# Wrap the encoded targets in a one-column frame so pandas helpers can be used on them.
test_df = pd.DataFrame({'y': y})

In [None]:
# Number of articles per encoded class. The call parentheses were missing,
# so the cell displayed the bound method instead of the counts.
test_df['y'].value_counts()

In [None]:
# How many distinct subcategories appear in the data.
pages_df['subcategory'].nunique()

In [None]:
# Article count per subcategory, largest first.
pages_df.loc[:, 'subcategory'].value_counts()

In [None]:
# Distinct subcategory names as a plain list, then check whether the
# top-level category name itself shows up as a subcategory.
subcats = pages_df.loc[:, 'subcategory'].unique().tolist()
'business software' in subcats

In [None]:
# Size of the distinct-subcategory list.
len(subcats)

In [None]:
# Sanity check: encoding the distinct names yields one code per name.
# A throwaway LabelEncoder is used so this exploratory cell does not re-fit
# (and thereby silently mutate) the shared `encoder` used for predictions.
len(LabelEncoder().fit_transform(subcats))

In [None]:
# Decode one class code. inverse_transform expects a 1-D array-like, so wrap
# the scalar in a list and unwrap the single result.
encoder.inverse_transform([22])[0]

In [None]:
# `n_closest` used to be displayed here before any cell had defined it, which
# fails on a fresh kernel. Build it from the current probs_df so the notebook
# runs top to bottom.
n_closest = probs_df.sort_values(by='P', ascending=False).head(5)
n_closest

In [None]:
# Raw (still encoded) prediction for the current search_doc.
gs_lr_pipe.predict([search_doc])

In [None]:
# Same prediction, decoded back to its subcategory name.
encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

In [None]:
# Number of top-ranked classes to keep.
n = 5

predicted = encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]
n_closest = probs_df.sort_values(by='P', ascending=False).head(n)

predicted, n_closest

In [None]:
# Class probabilities for whatever article is currently held in search_doc.
predicted_probs = gs_lr_pipe.predict_proba([search_doc])
predicted_probs = predicted_probs.reshape(-1, 1)

# Decode exactly the classes the fitted model knows about instead of assuming
# there are 63 of them; this keeps labels and probabilities the same length.
cats = encoder.inverse_transform(gs_lr_pipe.classes_)

# Label each probability with its subcategory name.
probs_df = pd.DataFrame(predicted_probs, columns=['P'], index=cats)

n_closest = probs_df.sort_values(by=['P'], ascending=False)[0:n]
predicted = encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

predicted, n_closest

In [None]:
def train():
    '''Fit the TF-IDF -> SVD -> logistic-regression grid search and pickle it.

    Loads the 'machine learning' and 'business software' articles through
    ``get_data``, label-encodes the subcategories, fits a 5-fold
    ``GridSearchCV`` over ``model__C`` on the training portion of a seeded
    train/test split, and writes the fitted search object to
    ``./pickles/LogitModel1.p``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    '''
    # `search` was never imported in this notebook; get_data is defined here.
    pages_df = get_data('machine learning', 'business software')

    X = pages_df['article'].copy()

    encoder = LabelEncoder()
    y = encoder.fit_transform(pages_df['subcategory'])

    # The held-out part is not used here; the split is kept so the model is
    # trained on the same subset as before.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=42)

    lr_pipe = Pipeline([
        ('encoder', TfidfVectorizer(ngram_range=(1, 2),
                                    min_df=3, max_df=.9,
                                    stop_words='english')),
        # Seeded so repeated training runs give the same decomposition.
        ('truncator', TruncatedSVD(n_components=700, random_state=42)),
        ('model', LogisticRegression())
    ])

    lr_params = {
        'model__C': np.logspace(-3, 3, 7)
    }

    gs_lr_pipe = GridSearchCV(lr_pipe, param_grid=lr_params, cv=5)  # StratifiedShuffleSplit(n_splits=5)

    gs_lr_pipe.fit(X_train, y_train)

    # Make sure the target directory exists before dumping the model.
    os.makedirs('./pickles', exist_ok=True)
    joblib.dump(gs_lr_pipe, './pickles/LogitModel1.p')

    return gs_lr_pipe

In [None]:
def predict(article, pckle = False, n = 5 ):
    '''Predict the subcategory of an article and return the n most likely ones.

    Parameters
    ----------
    article : str
        Article identifier handed to ``rc.get_article``. If it contains
        'wiki/', only the part after the first 'wiki/' is used.
    pckle : bool, default False
        If True, load the fitted model from './pickles/LogitModel1.p';
        otherwise train a new one with ``train()``.
    n : int, default 5
        Number of top-probability categories to return.

    Returns
    -------
    predicted : str
        Decoded subcategory predicted by the model.
    n_closest : pandas.DataFrame
        The n highest-probability subcategories (index) with column 'P'.

    TRY: Slack (software), Saffron Technology, Brain, Statistics,
    TensorFlow, Tableau Software
    '''
    # `search` was never imported in this notebook; get_data is defined here.
    pages_df = get_data('machine learning', 'business software')

    # The encoder is re-fitted only to map class codes back to names.
    # NOTE(review): with pckle=True this assumes the data has the same
    # subcategories as when the pickled model was trained - confirm.
    encoder = LabelEncoder()
    encoder.fit(pages_df['subcategory'])

    if pckle:
        # Unpickling can execute arbitrary code: only load a file that this
        # project wrote itself.
        gs_lr_pipe = joblib.load('./pickles/LogitModel1.p')
    else:
        print( 'Training Model, please wait...')
        gs_lr_pipe = train()

    if 'wiki/' in article:
        article = article.split('wiki/')[1]

    search_doc, pageid = rc.get_article(article)
    search_doc = rc.cleaner(search_doc)

    # One document in, so take the single row of class probabilities.
    predicted_probs = gs_lr_pipe.predict_proba([search_doc])[0]

    # Probability i belongs to gs_lr_pipe.classes_[i]; decoding those codes
    # replaces the former hard-coded range(63) and always matches in length.
    cats = encoder.inverse_transform(gs_lr_pipe.classes_)

    probs_df = pd.DataFrame({'P': predicted_probs}, index=cats)

    n_closest = probs_df.sort_values(by=['P'], ascending=False)[0:n]
    predicted = encoder.inverse_transform(gs_lr_pipe.predict([search_doc]))[0]

    return predicted, n_closest
    
        
        
        
        
        
        
        
        

In [None]:
class TemplateClassifier(BaseEstimator, ClassifierMixin):
    '''Classifier that memorises the training set and predicts the label of
    the closest training sample (Euclidean distance).

    The validation helpers it relies on (check_X_y, check_array,
    check_is_fitted, unique_labels, euclidean_distances) must be imported in
    the notebook's import cell.

    Parameters
    ----------
    *categories : str
        Accepted but currently ignored.
    min_df, max_df, ngram_range, n_components
        Stored on the instance; not used by fit/predict in this class.
    '''

    def __init__(self, *categories, min_df=.05, max_df=.90, ngram_range=(1,3), n_components=100):
        # NOTE(review): scikit-learn's estimator conventions ask for explicit
        # keyword parameters and no varargs in __init__; `*categories` may
        # break get_params()/clone() - confirm before using this class in a
        # Pipeline or GridSearchCV.
        self.min_df = min_df
        self.max_df = max_df
        self.ngram_range = ngram_range
        self.n_components = n_components

    def fit(self, X, y):
        '''Validate and store the training data; returns self.'''
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        # Keep the full training set: predict() looks up nearest samples in it.
        self.X_ = X
        self.y_ = y

        # Return the classifier
        return self

    def predict(self, X):
        '''Return, for each row of X, the label of its nearest training row.'''
        # Check that fit has been called
        check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        X = check_array(X)

        # Index of the closest stored sample for every query row.
        closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
        return self.y_[closest]