In [1]:
import request_category as rc



In [2]:
import datetime
import re

import numpy as np
import pandas as pd

In [3]:
cat_dict = rc.get_pages( 'machine learning', depth = 3)

## Verify Category Dictionary

In [4]:
print( cat_dict.keys())
print( len(cat_dict.keys()))

dict_keys(['machine learning', 'Applied machine learning', 'Artificial neural networks', 'Deep learning', 'Neural network software', 'Bayesian networks', 'Classification algorithms', 'Decision trees', 'Ensemble learning', 'Cluster analysis', 'Cluster analysis algorithms', 'Clustering criteria', 'Computational learning theory', 'Artificial intelligence conferences', 'Data mining and machine learning software', 'Social network analysis software', 'Datasets in machine learning', 'Datasets in computer vision', 'Dimension reduction', 'Factor analysis', 'Evolutionary algorithms', 'Gene expression programming', 'Genetic algorithms', 'Artificial immune systems', 'Genetic programming', 'Nature-inspired metaheuristics', 'Inductive logic programming', 'Kernel methods for machine learning', 'Support vector machines', 'Latent variable models', 'Structural equation models', 'Learning in computer vision', 'Log-linear models', 'Loss functions', 'Machine learning algorithms', 'Machine learning task', '

In [5]:
orig_cats = cat_dict['machine learning'].subcategory.unique().tolist()
len( cat_dict['machine learning'].subcategory.unique().tolist())

48

In [6]:
cat_dict['machine learning'].shape

(1311, 5)

In [7]:
print( 'Total Articles: {}'. format( cat_dict['machine learning'].subcategory.value_counts().sum()) )
print( '_'*100)
print( cat_dict['machine learning'].subcategory.value_counts() )

Total Articles: 1311
____________________________________________________________________________________________________
machine learning                             197
Artificial neural networks                   129
Data mining and machine learning software     87
Machine learning researchers                  82
Classification algorithms                     81
Machine learning algorithms                   57
Markov models                                 52
Genetic algorithms                            41
Evolutionary algorithms                       40
Dimension reduction                           39
Cluster analysis algorithms                   38
Statistical natural language processing       35
Applied machine learning                      35
Latent variable models                        25
Causal inference                              24
Neural network software                       22
Computational learning theory                 21
Deep learning                                

In [8]:
cat_dict['Artificial neural networks'].shape

(171, 5)

In [9]:
print( 'Total Articles: {}'. format( cat_dict['Artificial neural networks'].subcategory.value_counts().sum()) )
print( '_'*100)
print( cat_dict['Artificial neural networks'].subcategory.value_counts())


Total Articles: 171
____________________________________________________________________________________________________
Artificial neural networks    129
Neural network software        22
Deep learning                  20
Name: subcategory, dtype: int64


In [10]:
cat_dict['Deep learning'].subcategory.value_counts() 

Deep learning    20
Name: subcategory, dtype: int64

In [11]:
cat_dict['Neural network software'].subcategory.value_counts() 

Neural network software    22
Name: subcategory, dtype: int64

## Let's transform the dictionary of categories into a DataFrame

In [13]:
n_categories = len( cat_dict.keys()) 
n_categories

48

In [14]:
category_pages_df = pd.concat( cat_dict.values())
category_pages_df.shape

(3053, 5)

In [None]:
category_pages_df.head(15)

## Display Hierarchical Structure of Category

In [15]:
print( 'Total titles: {}'.format(category_pages_df.drop( ['ns', 'pageid'], axis = 1).groupby( by = ['category', 'subcategory']).count().sum() ) )
category_pages_df.drop( ['ns', 'pageid'], axis = 1).groupby( by = ['category', 'subcategory']).count()

Total titles: title    3053
dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,title
category,subcategory,Unnamed: 2_level_1
Applied machine learning,Applied machine learning,35
Artificial immune systems,Artificial immune systems,4
Artificial intelligence conferences,Artificial intelligence conferences,14
Artificial neural networks,Artificial neural networks,129
Artificial neural networks,Deep learning,20
Artificial neural networks,Neural network software,22
Bayesian networks,Bayesian networks,12
Causal inference,Causal inference,24
Classification algorithms,Artificial neural networks,129
Classification algorithms,Classification algorithms,81


In [16]:
import requests
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
import wikipedia
from spacy.en import STOP_WORDS
from spacy.en import English
from datetime import datetime

nlp = English()
time_of_one_grab = .661189

In [17]:
def fill_unique_pages( category, depth = 3, grab = False):
    """Collect the pages of a category tree and the de-duplicated set of those pages.

    Parameters
    ----------
    category : str
        Root category name handed to ``rc.get_pages``.
    depth : int, default 3
        Number of nested sub-category levels handed to ``rc.get_pages``.
    grab : bool, default False
        When True, an ``article`` column is added to the unique-pages frame by
        calling ``rc.grab_content`` on every ``pageid``.

    Returns
    -------
    (category_pages_df, unique_pages_df) : tuple of pandas.DataFrame
        ``category_pages_df`` is the concatenation of every frame returned by
        ``rc.get_pages`` (without the ``ns`` column); ``unique_pages_df`` keeps
        one row per (``pageid``, ``title``) pair. When ``rc.get_pages`` returns
        nothing, 'Nothing to fill.' is printed and two empty frames are
        returned so that callers can still unpack the result.
    """
    start = datetime.now()
    print('Gathering page information from Category: {}, pages from nested sub-categories (+{} levels) will be included as a union for each category. '.format( category, depth))
    category_dict = rc.get_pages( category, depth)  ## REMOVE rc if copying into script
    n_categories = len( category_dict)
    print('\tTotal categories after recursive search: {}'.format( n_categories) )

    if not category_dict:
        ## Only the "no categories found" case is treated as "nothing to fill";
        ## any other failure now propagates instead of being silently swallowed.
        print( 'Nothing to fill.')
        empty_columns = ['category', 'pageid', 'subcategory', 'title']
        return pd.DataFrame( columns = empty_columns), pd.DataFrame( columns = empty_columns)

    category_pages_df = (
        pd.concat( list( category_dict.values()))
          .drop( columns = 'ns', errors = 'ignore')
          .reset_index( drop = True)
    )

    ## Drop members with the same pageid & title
    unique_pages_df = (
        category_pages_df
          .drop_duplicates( subset = ['pageid', 'title'])
          .reset_index( drop = True)
    )

    n_grabs = unique_pages_df.shape[0]
    estT = round( time_of_one_grab * n_grabs / 60, 2)  ## minutes
    print('\tRequesting {} unique articles - ETA: {} minutes'.format( n_grabs, estT))

    if grab:
        ## Grab content and clean it (grab_content lives in the rc module, as used elsewhere in this notebook)
        unique_pages_df['article'] = unique_pages_df.pageid.apply( rc.grab_content )
        tag = ''
    else:
        tag = '(article excluded)'

    ## total_seconds() covers the whole elapsed interval; .seconds wraps around after one day
    totT = round( (datetime.now() - start).total_seconds() / 60, 2) ## minutes
    print('\t\tPage collection {} took a total of {} minutes'.format(tag, totT) )
    return category_pages_df, unique_pages_df

In [100]:
ml_category_pages_df, ml_unique_pages_df = fill_unique_pages( 'machine learning', depth = 3, grab = False)

Gathering page information from Category: machine learning, pages from nested sub-categories (+3 levels) will be included as a union for each category. 
	Total categories after recursive search: 48
	Requesting 1096 unique articles - ETA: 12.08 minutes
		Page collection (article excluded) took a total of 0.27 minutes


In [101]:
ml_category_pages_df.shape

(3055, 4)

In [20]:
ml_category_pages_df.head()

Unnamed: 0,category,pageid,subcategory,title
0,machine learning,43385931,machine learning,Data exploration
1,machine learning,49082762,machine learning,List of datasets for machine learning research
2,machine learning,233488,machine learning,Machine learning
3,machine learning,53587467,machine learning,Outline of machine learning
4,machine learning,3771060,machine learning,Accuracy paradox


In [21]:
print( 'Total titles: {}'.format(ml_category_pages_df.drop( ['pageid'], axis = 1).groupby( by = ['category', 'subcategory']).count().sum() ) )
ml_category_pages_df.drop( ['pageid'], axis = 1).groupby( by = ['category', 'subcategory']).count()

Total titles: title    3053
dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,title
category,subcategory,Unnamed: 2_level_1
Applied machine learning,Applied machine learning,35
Artificial immune systems,Artificial immune systems,4
Artificial intelligence conferences,Artificial intelligence conferences,14
Artificial neural networks,Artificial neural networks,129
Artificial neural networks,Deep learning,20
Artificial neural networks,Neural network software,22
Bayesian networks,Bayesian networks,12
Causal inference,Causal inference,24
Classification algorithms,Artificial neural networks,129
Classification algorithms,Classification algorithms,81


In [22]:
print( ml_unique_pages_df.shape)
ml_unique_pages_df.head()

(1095, 4)


Unnamed: 0,category,pageid,subcategory,title
0,machine learning,43385931,machine learning,Data exploration
1,machine learning,49082762,machine learning,List of datasets for machine learning research
2,machine learning,233488,machine learning,Machine learning
3,machine learning,53587467,machine learning,Outline of machine learning
4,machine learning,3771060,machine learning,Accuracy paradox


In [124]:
## Preview the first ten unique pages (Series.apply needs a function argument, so the bare call could not run).
ml_unique_pages_df.head(10)

Unnamed: 0,category,pageid,subcategory,title
0,machine learning,43385931,machine learning,Data exploration
1,machine learning,49082762,machine learning,List of datasets for machine learning research
2,machine learning,233488,machine learning,Machine learning
3,machine learning,53587467,machine learning,Outline of machine learning
4,machine learning,3771060,machine learning,Accuracy paradox
5,machine learning,43808044,machine learning,Action model learning
6,machine learning,28801798,machine learning,Active learning (machine learning)
7,machine learning,45049676,machine learning,Adversarial machine learning
8,machine learning,52642349,machine learning,AIVA
9,machine learning,30511763,machine learning,AIXI


In [125]:
## Take the first unique page as a single-row sample and display it.
t_sample = ml_unique_pages_df.loc[0, :]
t_sample

In [126]:
## pageid of the sample row is a scalar, so it has no .apply; just display it.
t_sample.pageid

43385931

In [None]:
## Fetch the (uncleaned) article text for the first page only, mirroring the rc.grab_content call used for rows 0-5 below.
ml_unique_pages_df.loc[0, 'article'] = rc.grab_content( ml_unique_pages_df.loc[0, 'pageid'], clean = False)

In [131]:
ml_unique_pages_df.loc[0:5,'article'] = ml_unique_pages_df.loc[0:5, :].pageid.apply( rc.grab_content, clean = False )  ## Don't clean it yet  

In [133]:
sample_df = ml_unique_pages_df.head(5).copy()

In [140]:
pids = sample_df.pageid.tolist()
pids

[43385931, 49082762, 233488, 53587467, 3771060]

In [None]:
ml_unique_pages_df.loc[0:5,'article']

In [162]:
test_art = ml_unique_pages_df.loc[3,'article']

In [163]:
test_art

'The following outline is provided as an overview of and topical guide to machine learning:\nMachine learning – subfield of computer science (more particularly soft computing) that evolved from the study of pattern recognition and computational learning theory in artificial intelligence. In 1959, Arthur Samuel defined machine learning as a "Field of study that gives computers the ability to learn without being explicitly programmed". Machine learning explores the study and construction of algorithms that can learn from and make predictions on data. Such algorithms operate by building a model from an example training set of input observations in order to make data-driven predictions or decisions expressed as outputs, rather than following strictly static program instructions.\n\n\n== What type of thing is machine learning? ==\nAn academic discipline\nA branch of science\nAn applied science\nA subfield of computer science\nA branch of artificial intelligence\nA subfield of soft computing

In [165]:
## Normalise the sample article: lower-case it, turn newlines/tabs into spaces,
## then strip angle brackets, parentheses, square brackets, braces and double quotes.
## Raw strings keep the regex escapes away from Python's own escape handling.
test_art = re.sub( r'[\n\t]', ' ', test_art.lower())
test_art = re.sub( r'[<>()\[\]{}"]', '', test_art)

test_art

"the following outline is provided as an overview of and topical guide to machine learning: machine learning – subfield of computer science more particularly soft computing that evolved from the study of pattern recognition and computational learning theory in artificial intelligence. in 1959, arthur samuel defined machine learning as a field of study that gives computers the ability to learn without being explicitly programmed. machine learning explores the study and construction of algorithms that can learn from and make predictions on data. such algorithms operate by building a model from an example training set of input observations in order to make data-driven predictions or decisions expressed as outputs, rather than following strictly static program instructions.   == what type of thing is machine learning? == an academic discipline a branch of science an applied science a subfield of computer science a branch of artificial intelligence a subfield of soft computing   == branches

In [None]:
get_article = lambda x: unique_pages_df[ unique_pages_df.pageid == x].article.tolist()[0]
category_pages_df.pageid.apply( get_article)

#category_pages_df.loc[:, 'article'] =

In [None]:
retrieve_article = lambda x: ml_unique_pages_df[ ml_unique_pages_df.pageid == x].article.tolist()[0]

In [115]:
row = ml_unique_pages_df[ ml_unique_pages_df.pageid == 43385931]

In [119]:
row.title.tolist()

['Data exploration']

In [23]:
## Union of pages in category includes 1 of each article
sum( ml_unique_pages_df.title.value_counts() > 1)

0

In [24]:
len(ml_category_pages_df.category.unique().tolist())

48

In [26]:
ml_category_pages_df.drop( ['pageid'], axis = 1).groupby( by = ['category', 'subcategory']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,title
category,subcategory,Unnamed: 2_level_1
Applied machine learning,Applied machine learning,35
Artificial immune systems,Artificial immune systems,4
Artificial intelligence conferences,Artificial intelligence conferences,14
Artificial neural networks,Artificial neural networks,129
Artificial neural networks,Deep learning,20
Artificial neural networks,Neural network software,22
Bayesian networks,Bayesian networks,12
Causal inference,Causal inference,24
Classification algorithms,Artificial neural networks,129
Classification algorithms,Classification algorithms,81


In [38]:
## One row per distinct category (the first row seen for each category is kept).
categories_df = ml_category_pages_df.drop_duplicates( subset = ['category']).reset_index( drop=True)
categories_df[['category']].head()

Unnamed: 0,category
0,machine learning
1,Applied machine learning
2,Classification algorithms
3,Artificial neural networks
4,Deep learning


In [161]:
sub_categories_df = ml_category_pages_df.drop(['pageid','title'], axis = 1).drop_duplicates( subset = ['category', 'subcategory']).reset_index( drop=True).copy()
print( sub_categories_df.shape)
sub_categories_df.head()



(130, 2)


Unnamed: 0,category,subcategory
0,machine learning,machine learning
1,machine learning,Applied machine learning
2,machine learning,Classification algorithms
3,machine learning,Artificial neural networks
4,machine learning,Deep learning


In [32]:
import request_cat as rc
import database as db
import pandas as pd
from datetime import datetime

In [None]:


def download( *category, depth, pickle = False):
    """Collect pages for one or more categories and write them to the database tables.

    Parameters
    ----------
    *category : str or iterable of str
        Category names. Each positional argument may be a single name or an
        iterable of names, so ``download('a', depth=1)``,
        ``download('a', 'b', depth=1)`` and ``download(['a', 'b'], depth=1)``
        are all accepted.
    depth : int (keyword-only)
        Sub-category depth forwarded to ``rc.fill_unique_pages``.
    pickle : str or False, default False
        When truthy, the file name (under ``./docker/postgres/data/``) of a
        pickled frame to load instead of requesting the pages again.

    For every category the pages, categories, subcategories and page-category
    link tables are updated through the ``db`` helpers, and the elapsed time is
    printed. Nothing is returned.
    """
    ## Flatten the positional arguments: a bare string is one category, anything
    ## else is treated as an iterable of category names.
    categories = []
    for entry in category:
        if isinstance( entry, str):
            categories.append( entry)
        else:
            categories.extend( entry)

    for cat in categories:
        start = datetime.now()
        if pickle:
            ## SECURITY: unpickling can execute arbitrary code; only load files you produced yourself.
            ## NOTE(review): the previous code bound this frame to an unused name and then read
            ## variables that were never assigned. It is assumed here that the pickle holds the
            ## same category/page frame that fill_unique_pages returns first -- confirm against the file.
            category_pages_df = pd.read_pickle('./docker/postgres/data/{}'.format( pickle) )
            unique_pages_df = category_pages_df.drop_duplicates( subset = ['pageid', 'title']).reset_index( drop = True)
        else:
            category_pages_df, unique_pages_df = rc.fill_unique_pages(cat, depth = depth, grab = True)   ## REMOVE rc.

        n_pages = unique_pages_df.shape[0]
        print( "\t\t\tUpdating pages table:\n\t\t\t\tCollecting text for {} articles".format(n_pages) )
        for pageid, title in zip( unique_pages_df.pageid, unique_pages_df.title):
            db.update_page_table( pageid, title)

        ##NOTICE: num_categories = num_sub_categories = union of categories and nested categories
        print( "\t\t\tUpdating categories table")
        for category_name in category_pages_df.subcategory.unique():
            db.update_category_table( category_name)

        print("\t\t\tUpdating subcategories table")
        subcategories_df = category_pages_df.drop_duplicates( subset = ['category', 'subcategory'])
        for subcategory_name, category_name in zip( subcategories_df.subcategory, subcategories_df.category):
            db.update_sub_category_table( subcategory_name, category_name)

        print("\t\t\tUpdating page-category (link) table")
        subcat_page_df = category_pages_df.drop_duplicates( subset = ['subcategory', 'pageid'])
        for pageid, subcategory_name in zip( subcat_page_df.pageid, subcat_page_df.subcategory):
            db.update_page_category_table( pageid, subcategory_name)

        timeduration = round( (datetime.now() - start).total_seconds() / 60, 2) ## minutes

        ## The message has a single placeholder; it must receive the duration, not the category name.
        print("\t\t\tArticle colletion took a total of {} minutes.".format( timeduration) )
        print("_"*75)
        
        





In [None]:
def update_category_table(category):
    """Insert one row into the ``categories`` table.

    The statement is wrapped in BEGIN/COMMIT, collapsed onto a single line and
    handed to ``query_to_dictionary`` with ``fetch_res = False``; whatever that
    helper returns is returned unchanged.
    """
    ## Double any single quote so names such as "O'Reilly" stay inside the SQL literal.
    ## NOTE(review): a parameterized query would be preferable, but the signature of
    ## query_to_dictionary is not visible from here.
    category_sql = str( category).replace( "'", "''")
    query = """BEGIN;
               INSERT INTO categories (category) VALUES ('{}');
               COMMIT;""".format( category_sql)
    #ALTER TABLE category category_id SERIAL PRIMARY KEY;
    query = re.sub( r"\s+", " ", query)
    return query_to_dictionary( query, fetch_res = False)
    
def update_sub_category_table(subcategory, category):
    """Insert one row into ``subcategories`` linked to its parent category.

    The parent's ``category_id`` is looked up by name with a sub-select on the
    ``categories`` table. The statement is collapsed onto one line and handed to
    ``query_to_dictionary`` with ``fetch_res = False``; that helper's return
    value is returned unchanged.
    """
    ## Double any single quote so the names cannot terminate the SQL literal early.
    ## NOTE(review): a parameterized query would be preferable, but the signature of
    ## query_to_dictionary is not visible from here.
    subcategory_sql = str( subcategory).replace( "'", "''")
    category_sql = str( category).replace( "'", "''")
    query = """BEGIN;
               INSERT INTO subcategories (subcategory, category_id) 
               VALUES ('{}',(SELECT category_id FROM categories WHERE category = '{}'));
               COMMIT;""".format( subcategory_sql, category_sql)
    #ALTER TABLE category category_id SERIAL PRIMARY KEY;
    query = re.sub( r"\s+", " ", query)
    return query_to_dictionary( query, fetch_res = False)

### OLD
def update_page_category_table( pageid, subcategory, category = None):
    """Insert one (page, subcategory) link row into ``page_category``.

    Parameters
    ----------
    pageid : int
        Page identifier; converted with ``int()`` before being placed in the SQL.
    subcategory : str
        Subcategory name used to look up ``subcategory_id``.
    category : str, optional
        Parent category name. When given, the lookup is narrowed to the
        subcategory row that belongs to this category. This matters because the
        same subcategory name can be stored under several categories, in which
        case the name-only lookup returns more than one row and the INSERT fails.

    Returns whatever ``query_to_dictionary`` returns for ``fetch_res = False``.
    """
    ## Double single quotes so names cannot terminate the SQL literal early.
    subcategory_lookup = "SELECT subcategory_id FROM subcategories WHERE subcategory = '{}'".format(
        str( subcategory).replace( "'", "''"))
    if category is not None:
        subcategory_lookup += " AND category_id = (SELECT category_id FROM categories WHERE category = '{}')".format(
            str( category).replace( "'", "''"))

    query = """BEGIN;
               INSERT INTO page_category (pageid, subcategory_id) 
               VALUES ({}, ({}));
               COMMIT;""".format( int( pageid), subcategory_lookup)
    query = re.sub( r"\s+", " ", query)
    return query_to_dictionary( query, fetch_res = False)

In [34]:
db.clear_table( 'pages' )
db.clear_table( 'categories' )
db.clear_table( 'subcategories' )
db.clear_table( 'page_category' )


In [35]:
db.query_to_dataframe('SELECT * FROM pages LIMIT 5;')

In [None]:
download( ['machine learning', 'business software'], depth = 1)

In [44]:
set(category_pages_df.category.unique()) == set(category_pages_df.subcategory.unique())

True

In [45]:
## Step-by-step version of the table updates, run against the notebook-level category_pages_df.
## `start` is set here so the elapsed-time report does not rely on a variable from another scope.
start = datetime.now()

print( "\t\t\tUpdating categories table")

## NOTICE:  set(category_pages_df.category.unique()) == set(category_pages_df.subcategory.unique())
## Update categories table
categories_df = pd.DataFrame({'category':category_pages_df.subcategory.unique() }) 
results = categories_df.apply( lambda x: db.update_category_table( x.category), axis = 1)

print("\t\t\tUpdating subcategories table")

subcategories_df = category_pages_df.drop_duplicates( subset = ['category', 'subcategory']).copy()
results = subcategories_df.apply( lambda x: db.update_sub_category_table( x.subcategory, x.category), axis = 1) 

print("\t\t\tUpdating page-category (link) table")

## Insert each (subcategory, pageid) pair once; feeding every row produced duplicate link rows.
## NOTE(review): the two-argument db.update_page_category_table looks the subcategory up by name
## only, which the error recorded for this cell shows is ambiguous when a subcategory name is
## stored under more than one category.
subcat_page_df = category_pages_df.drop_duplicates( subset = ['subcategory', 'pageid']).copy()
results = subcat_page_df.apply( lambda x: db.update_page_category_table( x.pageid, x.subcategory), axis = 1)

timeduration = round( (datetime.now() - start).total_seconds() / 60, 2) ## minutes

## The message has a single placeholder, so it must receive the duration only.
print("\t\t\tArticle colletion took a total of {} minutes.".format( timeduration) )
print("_"*75)


			Updating categories table
			Updating subcategories table
			Updating page-category (link) table


ProgrammingError: ('more than one row returned by a subquery used as an expression\n', 'occurred at index 0')

In [69]:
import re
#db.clear_table( 'page_category' )
inner_query = '''SELECT stuff.category, stuff.subcategory, pc.pageid 
    FROM (SELECT category, subcategory, subcategory_id 
           FROM subcategories sc 
           JOIN categories c 
           ON sc.category_id = c.category_id ) as stuff 
    JOIN page_category pc 
    ON stuff.subcategory_id = pc.subcategory_id'''

query = '''SELECT crap.category, crap.subcategory, p.title, crap.pageid  
           FROM ({}) as crap
           JOIN pages p
           ON crap.pageid = p.pageid;'''.format( inner_query)
query = re.sub( '\s+',' ', query)
query


db.query_to_dataframe( query) #  'Select * from page_category;'

Unnamed: 0,category,pageid,subcategory,title
0,machine learning,43385931,machine learning,data exploration
1,machine learning,43385931,machine learning,data exploration
2,machine learning,49082762,machine learning,list of datasets for machine learning research
3,machine learning,49082762,machine learning,list of datasets for machine learning research
4,machine learning,233488,machine learning,machine learning
5,machine learning,233488,machine learning,machine learning
6,machine learning,53587467,machine learning,outline of machine learning
7,machine learning,53587467,machine learning,outline of machine learning
8,machine learning,3771060,machine learning,accuracy paradox
9,machine learning,3771060,machine learning,accuracy paradox


In [None]:
## Original
def update_page_category_table( pageid, subcategory):
    """Build and run the INSERT that links a page to a subcategory.

    ``subcategory_id`` is resolved by a sub-select on the subcategory name
    alone. The formatted statement is collapsed onto one line and passed to
    ``db.query_to_dictionary`` with ``fetch_res = False``; that call's return
    value is handed back as-is.
    """
    template = """BEGIN;
               INSERT INTO page_category (pageid, subcategory_id) 
               VALUES ({}, (SELECT subcategory_id FROM subcategories WHERE subcategory = '{}'));
               COMMIT;"""
    statement = template.format( pageid, subcategory)
    ## Squash every run of whitespace (including the layout newlines) to one space.
    single_line = re.compile( r"\s+").sub( " ", statement)
    return db.query_to_dictionary( single_line, fetch_res = False)


In [104]:

## New
def update_page_category_table( pageid, subcategory, category, display = False):
    """Insert one (page, subcategory) link row, resolving the subcategory within its category.

    Parameters
    ----------
    pageid : int
        Page identifier; converted with ``int()`` before being placed in the SQL.
    subcategory : str
        Subcategory name.
    category : str
        Parent category name; restricts the ``subcategory_id`` lookup to the
        row whose ``category_id`` matches this category.
    display : bool, default False
        When True, return the generated SQL text without executing it.

    Returns
    -------
    The SQL string when ``display`` is True, otherwise the return value of
    ``db.query_to_dictionary`` called with ``fetch_res = False``.
    """
    ## Double single quotes so names cannot terminate the SQL literal early.
    category_sql = str( category).replace( "'", "''")
    subcategory_sql = str( subcategory).replace( "'", "''")

    query_for_category_id = """SELECT category_id
                            FROM categories
                            WHERE category = '{}'""".format( category_sql)

    query_for_subcategory_id = """SELECT subcategory_id 
                              FROM subcategories
                              WHERE subcategory = '{}'
                              AND category_id = ({})""".format( subcategory_sql, query_for_category_id)

    query = """BEGIN;
               INSERT INTO page_category (pageid, subcategory_id) 
               VALUES ({}, ({}));
               COMMIT;""".format( int( pageid), query_for_subcategory_id)
    query = re.sub( r"\s+", " ", query)
    if display:
        return query
    ## An INSERT wrapped in BEGIN/COMMIT yields no result set, so nothing must be fetched.
    return db.query_to_dictionary( query, fetch_res = False)

In [105]:
##  machine learning	Applied machine learning	179
## machine learning Applied machine learning 15795950 Activity recognition
update_page_category_table( 15795950, 'Applied machine learning', 'machine learning', display = True)

ProgrammingError: no results to fetch

In [None]:
db.query_to_dictionary(fetch_res=False)

In [86]:
db.query_to_dataframe("SELECT c.category, subcategory, subcategory_id FROM subcategories sc JOIN categories c ON sc.category_id = c.category_id WHERE subcategory = '{}'".format('Applied machine learning' ))

Unnamed: 0,category,subcategory,subcategory_id
0,machine learning,Applied machine learning,179
1,Applied machine learning,Applied machine learning,226


In [None]:
## Next one to add
## Applied machine learning 15795950 Activity recognition

In [None]:
## Build the category-scoped subcategory lookup on its own, using the example pair exercised above.
category = 'machine learning'
subcategory = 'Applied machine learning'

## The category literal needs quotes, and the nested SELECT needs parentheses to be a valid sub-query.
query_for_category_id = """SELECT category_id
                            FROM categories
                            WHERE category = '{}'""".format( category)

query_for_subcategory_id = """SELECT subcategory_id 
                              FROM subcategories
                              WHERE subcategory = '{}'
                              AND category_id = ({})""".format( subcategory, query_for_category_id)
query_for_subcategory_id = re.sub( r"\s+", " ", query_for_subcategory_id)
query_for_subcategory_id




In [None]:
## Upcoming (subcategory, pageid, title) rows to add:
## Applied machine learning 15795950 Activity recognition
## Applied machine learning 41916168 AlchemyAPI
## Applied machine learning 55075082 BigDL
## Applied machine learning 53631046 Caffe (software)

In [76]:
update_page_category_table( 15795950, 'Applied machine learning' )

ProgrammingError: more than one row returned by a subquery used as an expression


In [None]:
print("\t\t\tUpdating page-category (link) table")
subcat_page_df = category_pages_df.drop_duplicates( subset = ['subcategory', 'pageid']).copy()
results = subcat_page_df.apply( lambda x: db.update_page_category_table( x.pageid, x.subcategory), axis = 1)

In [95]:
db.clear_table( 'page_category' )

In [106]:
db.query_to_dataframe('SELECT * FROM page_category;')

Unnamed: 0,category_page_id,pageid,subcategory_id
0,6316,43385931,178
1,6317,43385931,178
2,6318,15795950,179


In [98]:
import database as db

In [99]:

# pageid, subcategory, category):
print("\t\t\tUpdating page-category (link) table")
subcat_page_df = ml_category_pages_df.drop_duplicates( subset = ['subcategory', 'pageid']).copy()
results = subcat_page_df.apply( lambda x: db.update_page_category_table( x.pageid, x.subcategory, x.category), axis = 1)

			Updating page-category (link) table


TypeError: ('update_page_category_table() takes 2 positional arguments but 3 were given', 'occurred at index 0')