In [1]:
import request_category as rc
import database as db
from download import download

import pickle
from pathlib import Path
import re

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

import sklearn.externals.joblib
%matplotlib inline

In [None]:
# Ask the project's database helper for one page query per category.
# Both results are spliced into a UNION statement with str.format further down,
# so they are presumably SQL text -- TODO confirm against the db module.
ml_query = db.query_pages_by_category('machine learning')
bs_query = db.query_pages_by_category('business software')

In [None]:
# Stack the two per-category queries with UNION and load the combined rows
# into a DataFrame through the project's db helper.
pages_query = """SELECT b.category, b.subcategory, b.title, b.pageid, b.article
                 FROM (({}) UNION ({}) ) as b;""".format(ml_query, bs_query)

pages_df = db.query_to_dataframe(pages_query)

# Shapes the author noted on earlier runs:
#   (2425, 3) before category and subcategory were added
#   (2768, 3) after adding distinct category and subcategory in the outer query
pages_df.shape


In [3]:
## Machine learning - build the SQL that returns category, subcategory, pageid,
## title and article text for every page filed under 'machine learning'.

# Inner query: resolve the category name to its subcategories, then to the
# page ids linked to those subcategories through page_category.
# NOTE(review): there is no DISTINCT / GROUP BY, so a page linked to several
# subcategories presumably comes back once per subcategory -- confirm if
# unique articles are required downstream.
inner_ml = """SELECT stuff.category, stuff.subcategory, pc.pageid 
              FROM ( SELECT category, subcategory, subcategory_id 
                     FROM subcategories sc 
                         JOIN categories c 
                         ON sc.category_id = c.category_id 
                     WHERE category ='machine learning' ) as stuff 
              JOIN page_category pc 
              ON stuff.subcategory_id = pc.subcategory_id             
              """

# Outer query: attach title and article text from `pages` to each page id.
ml_pages_query = """SELECT crap_ml.category, crap_ml.subcategory, p.pageid, p.title, p.article
                    FROM ({}) as crap_ml
                    JOIN pages p
                    ON crap_ml.pageid = p.pageid
                    
                """.format(inner_ml)

# Collapse every run of whitespace (newlines, indentation) to a single space so
# the statement displays on one line. The pattern is a raw string because "\s"
# in a plain literal is an invalid escape sequence and triggers a warning.
ml_pages_query = re.sub(r"\s+", " ", ml_pages_query)
ml_pages_query


"SELECT crap_ml.category, crap_ml.subcategory, p.pageid, p.title, p.article FROM (SELECT stuff.category, stuff.subcategory, pc.pageid FROM ( SELECT category, subcategory, subcategory_id FROM subcategories sc JOIN categories c ON sc.category_id = c.category_id WHERE category ='machine learning' ) as stuff JOIN page_category pc ON stuff.subcategory_id = pc.subcategory_id ) as crap_ml JOIN pages p ON crap_ml.pageid = p.pageid "

In [4]:
## Business Software - build the SQL that returns category, subcategory, pageid,
## title and article text for every page filed under 'business software'.

# Inner query: resolve the category name to its subcategories, then to the
# page ids linked to those subcategories through page_category.
# NOTE(review): there is no DISTINCT / GROUP BY, so a page linked to several
# subcategories presumably comes back once per subcategory -- confirm if
# unique articles are required downstream.
inner_bs = """SELECT stuff.category, stuff.subcategory, pc.pageid 
                        FROM ( SELECT category, subcategory, subcategory_id 
                               FROM subcategories sc 
                                   JOIN categories c 
                                   ON sc.category_id = c.category_id 
                               WHERE category ='business software' ) as stuff 
                        JOIN page_category pc 
                        ON stuff.subcategory_id = pc.subcategory_id
                        """

# Outer query: attach title and article text from `pages` to each page id.
bs_pages_query = """SELECT crap_bs.category, crap_bs.subcategory, p.pageid, p.title, p.article
                    FROM ({}) as crap_bs
                    JOIN pages p
                    ON crap_bs.pageid = p.pageid
                    
                 """.format(inner_bs)

# Collapse every run of whitespace (newlines, indentation) to a single space so
# the statement displays on one line. The pattern is a raw string because "\s"
# in a plain literal is an invalid escape sequence and triggers a warning.
bs_pages_query = re.sub(r"\s+", " ", bs_pages_query)
bs_pages_query

"SELECT crap_bs.category, crap_bs.subcategory, p.pageid, p.title, p.article FROM (SELECT stuff.category, stuff.subcategory, pc.pageid FROM ( SELECT category, subcategory, subcategory_id FROM subcategories sc JOIN categories c ON sc.category_id = c.category_id WHERE category ='business software' ) as stuff JOIN page_category pc ON stuff.subcategory_id = pc.subcategory_id ) as crap_bs JOIN pages p ON crap_bs.pageid = p.pageid "

In [5]:
# Stack the machine-learning and business-software page queries with UNION and
# load the combined rows into a DataFrame through the project's db helper.
pages_query = """SELECT b.category, b.subcategory, b.title, b.pageid, b.article
                 FROM (({}) UNION ({}) ) as b;""".format(ml_pages_query, bs_pages_query)

pages_df = db.query_to_dataframe(pages_query)

# Row counts the author noted on earlier runs: 2425 before category and
# subcategory were added, 2768 after adding distinct category and subcategory
# in the outer query. Those notes recorded 3 columns; the query now selects 5.
pages_df.shape


(2768, 5)

In [6]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelBinarizer, LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.decomposition import TruncatedSVD, NMF

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

In [7]:
# Features: the article text column, copied so X is independent of pages_df.
X = pages_df['article'].copy()

# Target: category names mapped to integer labels; `encoder` is kept so the
# integers can be mapped back to names later.
encoder = LabelEncoder()
y = encoder.fit_transform(pages_df['category'])

# Fixed seed so the split is reproducible; split sizes are scikit-learn's defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Logistic Regression

In [None]:
# Restore a previously saved model from disk (presumably the fitted
# logistic-regression grid search, going by the file and variable names).
# NOTE(review): joblib files are pickles -- only load artifacts you created
# yourself, since unpickling can execute arbitrary code.
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23;
# newer environments need the standalone `joblib` package instead.
gs_lr_pipe = sklearn.externals.joblib.load('./LogitModel1.p')

In [None]:
# Score of the loaded model on the training split.
gs_lr_pipe.score(X_train, y_train)

In [None]:
# Score of the loaded model on the held-out test split.
gs_lr_pipe.score(X_test, y_test)

In [None]:
# Fetch the 'Brain' article (text plus page id) through the project helper,
# then pass the text through rc.cleaner (presumably text normalisation --
# TODO confirm) so it can be fed to the fitted pipelines.
search_doc, pageid = rc.get_article('Brain')
search_doc = rc.cleaner(search_doc)

# Preview only the first 200 characters instead of dumping the whole article.
search_doc[:200]

In [None]:
# Class probabilities for the cleaned 'Brain' article held in `search_doc`.
gs_lr_pipe.predict_proba([search_doc])

In [None]:
# Class probabilities for the 'Saffron Technology' article.
# The article is fetched and cleaned here instead of reusing `search_doc`:
# on a top-to-bottom run `search_doc` still holds the 'Brain' text, so the
# original cell only repeated the previous prediction.
saffron_doc, saffron_pageid = rc.get_article('Saffron Technology')
saffron_doc = rc.cleaner(saffron_doc)
gs_lr_pipe.predict_proba([saffron_doc])

In [None]:
# Peek at the first five rows of the combined pages table.
pages_df.head(5)

In [None]:
# Naive Bayes text classifier: TF-IDF features -> NMF reduction -> MultinomialNB.
# NMF keeps the reduced features non-negative, as MultinomialNB expects.
nb_pipe = Pipeline(steps=[
    (
        'encoder',
        TfidfVectorizer(
            ngram_range=(1, 2),    # unigrams and bigrams
            min_df=3,              # drop terms seen in fewer than 3 documents
            max_df=0.9,            # drop terms seen in more than 90% of documents
            stop_words='english',
        ),
    ),
    ('truncator', NMF(n_components=300, random_state=42)),
    ('model', MultinomialNB()),
])

# Smoothing strengths to try: linspace yields exactly two values, 0.1 and 1.0.
nb_params = {'model__alpha': np.linspace(0.1, 1, 2)}

# 5-fold cross-validated grid search over the pipeline.
gs_nb_pipe = GridSearchCV(estimator=nb_pipe, param_grid=nb_params, cv=5)
gs_nb_pipe.fit(X_train, y_train)

# Mean cross-validated score of the best parameter setting.
gs_nb_pipe.best_score_

In [None]:
# Score of the refit best naive Bayes pipeline on the training split.
gs_nb_pipe.score(X_train, y_train)

In [None]:
# Score of the refit best naive Bayes pipeline on the held-out test split.
gs_nb_pipe.score(X_test, y_test)

In [None]:
article_text, pageid = rc.get_article( 'Saffron Technology')
article_te