In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

%matplotlib inline

## Model 1: use all categories/articles in database

In [5]:
wiki_data = pd.read_pickle('wiki_data.p')

In [113]:
wiki_data.sample(5)

Unnamed: 0,_id,category,content,pageid,title,clean_content
3505,591c9dbdad28cdd96d819941,0,"ELT is an alternative to Extract, transform, l...",46363781,"Extract, load, transform",elt is an alternative to extract transform loa...
1964,591c94f3ad28cdd96d818902,3,Cow Lake is located in Grand Teton National Pa...,33934274,"Cow Lake (Teton County, Wyoming)",cow lake is located in grand teton national pa...
1665,591c9429ad28cdd96d818658,3,January 20: Six exporting countries - Abu Dhab...,4238002,1972 world oil market chronology,january six exporting countriesabu dhabi iran ...
1183,591c76e7ad28cdd96d817851,2,.xnk is a file extension used by Microsoft Out...,13215078,Xnk,xnk is a file extension used by microsoft out...
1006,591bb031ad28cdd96d815651,7,"Tom Michael Mitchell (born August 9, 1951) is ...",33275304,Tom M. Mitchell,tom michael mitchell born august is an america...


In [6]:
LSA = np.load('LSA.npy')

In [7]:
param_dict = {
    'lr':{
        'C': [10, 100, 10E3, 10E6, 10E9]
    },
    'rf':{},
    'knc':{
        'n_neighbors':range(2,30, 2)
    }
}

In [8]:
model_dict = {
    'lr':GridSearchCV(LogisticRegression(),
                             param_grid=param_dict['lr'],
                             cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'rf':GridSearchCV(RandomForestClassifier(),
                      param_grid=param_dict['rf'],
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'knc':GridSearchCV(KNeighborsClassifier(),
                       param_grid=param_dict['knc'],
                       cv=StratifiedShuffleSplit(n_splits=5, random_state=42))
                        }

In [23]:
def fit_all_models(x,y, model_dict):
    for model in model_dict.keys():
        model_dict[model].fit(x,y)
        print("{:5} best score: {} \n    best estimator: {}".format(
            model, 
            model_dict[model].best_score_, 
            model_dict[model].best_estimator_))
        # df_model_{}.format(model) =  pd.DataFrame(model_dict[model].cv_results_).T
        # return df

In [24]:
fit_all_models(LSA, wiki_data['category'], model_dict)

lr    best score: 0.5319223985890652 
    best estimator: LogisticRegression(C=10000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
rf    best score: 0.4821869488536155 
    best estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
knc   best score: 0.5661375661375662 
    best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=28, p=2,
           weights='uniform')


## Model 2: use only two categories (business intelligence & machine learning)

In [114]:
wiki_data_two_cats = pd.read_pickle('wiki_data_two_cats.p')

In [115]:
wiki_data_two_cats.sample(5)

Unnamed: 0,_id,category,content,pageid,title,clean_content
3266,591c9cbfad28cdd96d81965c,0,The Baby Tooth Survey was initiated by the Gre...,30408720,Baby Tooth Survey,the baby tooth survey was initiated by the gre...
4321,591ca130ad28cdd96d81a389,0,"The World Programming System, also known as WP...",9406780,World Programming System,the world programming system also known as wps...
3536,591c9ddead28cdd96d8199a3,0,"Frank Yates FRS (May 12, 1902 – June 17, 1994)...",165733,Frank Yates,frank yates frs may june was one of the pionee...
4264,591ca0f6ad28cdd96d81a2d7,0,Envision BI is a modern cloud based business i...,36243725,User:Rakeshnandi1990/sandbox,envision bi is a modern cloud based business i...
503,591b9eb8ad28cdd96d814a89,1,Julie Beth Lovins is a computational linguist ...,31894251,Julie Beth Lovins,julie beth lovins is a computational linguist ...


In [11]:
LSA_two = np.load('LSA_two.npy')

In [101]:
fit_all_models(LSA_two, wiki_data_two_cats['category'], model_dict)

rf    best score: 0.879646017699115 
      best estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
knc   best score: 0.9079646017699115 
      best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='uniform')
lr    best score: 0.9061946902654867 
      best estimator: LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


#### Next steps?

extract most important words, visualization of top features, look at results using different categories (confusion matrix)

try with a real article. do predict and predictproba.
