In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
categories = {1: 'Society & Culture', 2: 'Science & Mathematics', 3: 'Health', 4: 'Education & Reference',
              5: 'Computers & Internet', 6: 'Sports', 7: 'Business & Finance', 8: 'Entertainment & Music',
              9: 'Family & Relationships', 10: 'Politics & Government'}

inv_map = {v: k for k, v in categories.items()}


In [3]:
#Load cleaned data and create training/test/validation split

data_train = pd.read_csv('data/yahoo_train_notnull_clean.csv')
data_test = pd.read_csv('data/yahoo_test_clean.csv')
data_train = data_train.sample(frac=.5, random_state=42)

X = data_train['text']
Y = data_train['class']

X_test = data_test['text']
y_test = data_test['class']

X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [4]:
#Create feature vectors using BBoW representation of unigrams and bigrams
from sklearn.feature_extraction.text import CountVectorizer

max_vocab = 10000
bow_transformer = CountVectorizer(max_features=max_vocab, ngram_range=(1,2), binary=True ).fit(X_train.values.astype('U'))
X_train = bow_transformer.transform(X_train.values.astype('U'))
X_valid = bow_transformer.transform(X_valid.values.astype('U'))
X_test = bow_transformer.transform(X_test.values.astype('U'))

In [5]:
#Create splits for cross-validation
from sklearn.model_selection import GridSearchCV, PredefinedSplit

my_test_fold = []
for i in range(X_train.shape[0]):
    my_test_fold.append(-1)
for i in range(X_valid.shape[0]):
    my_test_fold.append(0)

fold = PredefinedSplit(test_fold=my_test_fold)
CV_split_T = np.append(y_train, y_valid)
CV_split_X = sp.vstack((X_train , X_valid)) 

In [6]:
#Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

def MNB(parameters, X, T, fold) : 
    n_folds = 5
    mnb = MultinomialNB(fit_prior=True)
    mnb_cv = GridSearchCV(mnb, parameters, cv=fold, scoring="accuracy", refit=False)
    mnb_cv.fit(X, T)
    scores = mnb_cv.cv_results_['mean_test_score']
    params = mnb_cv.cv_results_['params']
#     print('scores:',scores)
#    print('params:',params)
    return mnb_cv.best_params_

#search for alpha value
def getHyperparamsMNB (values, scale) : 
    prevBest = 1000
    while (True) : 
        param_grid = {'alpha': values}
        bestParams = MNB(param_grid, CV_split_X, CV_split_T, fold)
        curBest = bestParams['alpha']
        if (abs(curBest - prevBest ) > .00001) : 
            inc = curBest/10
            lb = curBest - inc*scale
            ub = curBest + inc*scale
            values = np.arange(lb, ub, inc)
            prevBest = curBest
        else : return curBest

#predict on test set
alphaVal = getHyperparamsMNB([ 0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 5)            
nb = MultinomialNB(alpha=alphaVal, fit_prior=True)
nb.fit(X_train, y_train)
#print('best alpha: ', alphaVal)

#predict on train set
y_hat_train = nb.predict(X_train)
print('Training accuracy: ', accuracy_score(y_train, y_hat_train)  )

#predict on test set
y_hat_test = nb.predict(X_test)
print('Test accuracy: ', accuracy_score(y_test, y_hat_test)  )

Training accuracy:  0.69809552629267
Test accuracy:  0.6425833333333333


In [7]:
#Linear SVM
from sklearn.svm import LinearSVC

def linSVM(parameters, X, T, fold) : 
    n_folds = 5
    svc = LinearSVC(multi_class='ovr', max_iter = 2000, dual=False)
    svc_cv = GridSearchCV(svc, parameters, cv=fold, scoring="accuracy", refit=False)
    svc_cv.fit(X, T)
    scores = svc_cv.cv_results_['mean_test_score']
    params = svc_cv.cv_results_['params']
#    print('scores:',scores)
#    print('params:',params)
    return svc_cv.best_params_

#search for C value
def getHyperparamsSVM (values, scale) : 
    prevBest = 1000
    while (True) : 
        param_grid = {'C': values}
        bestParams = linSVM(param_grid, CV_split_X, CV_split_T, fold)
        curBest = bestParams['C']
        if (abs(curBest - prevBest ) > .0001) : 
            inc = curBest/10
            lb = curBest - inc*scale
            ub = curBest + inc*scale
            values = np.arange(lb, ub, inc)
            prevBest = curBest
        else : return curBest

#predict on test set
cVal = getHyperparamsSVM([ 0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 5)            
svm = LinearSVC(C=cVal, multi_class='ovr')
svm.fit(X_train, y_train)
#print('best C: ', cVal)

#predict on train set
y_hat_train = svm.predict(X_train)
print('Training accuracy: ', accuracy_score(y_train, y_hat_train)  )

#predict on test set
y_hat_test = svm.predict(X_test)
print('Test accuracy: ', accuracy_score(y_test, y_hat_test)  )

Training accuracy:  0.7452306811267404
Test accuracy:  0.6528166666666667


In [8]:
from sklearn.linear_model import LogisticRegression

def logReg(parameters, X, T, fold) : 
    n_folds = 5
    logreg = LogisticRegression(multi_class='ovr', dual=False)
    logreg_cv = GridSearchCV(logreg, parameters, cv=fold, scoring="accuracy", refit=False)
    logreg_cv.fit(X, T) 
    scores = logreg_cv.cv_results_['mean_test_score']
    params = logreg_cv.cv_results_['params']
#    print('scores:',scores)
#    print('params:',params)
    return logreg_cv.best_params_

#search for C value
def getHyperparamsLogReg (values, scale) : 
    prevBest = 1000
    while (True) : 
        param_grid = {'C': values}
        bestParams = logReg(param_grid, CV_split_X, CV_split_T, fold)
        curBest = bestParams['C']
        if (abs(curBest - prevBest ) > .0001) : 
            inc = curBest/10
            lb = curBest - inc*scale
            ub = curBest + inc*scale
            values = np.arange(lb, ub, inc)
            prevBest = curBest
        else : return curBest

#predict on test set
cVal = getHyperparamsLogReg([ 0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 5)            
logreg = LogisticRegression(C=cVal, multi_class='ovr', dual=False)
logreg.fit(X_train, y_train)
#print('best C: ', cVal)

#predict on train set
y_hat_train = logreg.predict(X_train)
print('Training accuracy: ', accuracy_score(y_train, y_hat_train)  )

#predict on test set
y_hat_test = logreg.predict(X_test)
print('Test accuracy: ', accuracy_score(y_test, y_hat_test)  )



Training accuracy:  0.7507296740626495
Test accuracy:  0.6414833333333333
