In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

#### Load data and create training/validation split


In [2]:
#Load cleaned data and create training/test/validation split

data_train = pd.read_csv('data/yahoo_train_clean.csv')
data_test = pd.read_csv('data/yahoo_test_clean.csv')
data_train = data_train.sample(frac=.3, random_state=42)

X = data_train['text']
Y = data_train['class']

X_test_text = data_test['text']
y_test = data_test['class']

X_train_text, X_valid_text, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)
max_vocab = 10000

#### Binary bag-of-words representation  (unigrams and bigrams)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

def BBoW(X_train, X_valid, X_test) : 
    bow_transformer = CountVectorizer(max_features=max_vocab, ngram_range=(1,2), binary=True ).fit(X_train.values.astype('U'))
    X_train = bow_transformer.transform(X_train.values.astype('U'))
    X_valid = bow_transformer.transform(X_valid.values.astype('U'))
    X_test = bow_transformer.transform(X_test.values.astype('U'))
    return X_train, X_valid, X_test

#### Tf-idf representation (unigrams and bigrams)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(X_train, X_valid, X_test) : 
    tfidf_transformer = TfidfVectorizer(max_features=max_vocab, ngram_range=(1,2)).fit(X_train.values.astype('U'))
    X_train = tfidf_transformer.transform(X_train.values.astype('U'))
    X_valid = tfidf_transformer.transform(X_valid.values.astype('U'))
    X_test = tfidf_transformer.transform(X_test.values.astype('U'))
    return X_train, X_valid, X_test 

#### Create splits for cross-validation

In [5]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit

def cv_split(X_train, X_valid, y_train, y_valid) : 
    my_test_fold = []
    for i in range(X_train.shape[0]):
        my_test_fold.append(-1)
    for i in range(X_valid.shape[0]):
        my_test_fold.append(0)

    fold = PredefinedSplit(test_fold=my_test_fold)
    Y_cv = np.append(y_train, y_valid)
    X_cv = sp.vstack((X_train , X_valid)) 
    return X_cv, Y_cv, fold
    

### Multinomial Naive Bayes

In [6]:
#Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

#5-fold cross-validation
def MNB_cv(parameters, X, T, fold) : 
    n_folds = 5
    mnb = MultinomialNB(fit_prior=True)
    mnb_cv = GridSearchCV(mnb, parameters, cv=fold, scoring="accuracy", refit=False)
    mnb_cv.fit(X, T)
    scores = mnb_cv.cv_results_['mean_test_score']
    params = mnb_cv.cv_results_['params']
#     print('scores:',scores)
#    print('params:',params)
    return mnb_cv.best_params_

#search for alpha value
def MNB_get_hyperparameters(values, scale, X_cv_split, Y_cv_split, fold) : 
    prevBest = 1000
    while (True) : 
        param_grid = {'alpha': values}
        bestParams = MNB_cv(param_grid, X_cv_split, Y_cv_split, fold)
        curBest = bestParams['alpha']
        if (abs(curBest - prevBest ) > .00001) : 
            inc = curBest/10
            lb = curBest - inc*scale
            ub = curBest + inc*scale
            values = np.arange(lb, ub, inc)
            prevBest = curBest
        else : return curBest


def MNB_predict(alpha, X_train, X_test, y_train, y_test) :
    nb = MultinomialNB(alpha=alpha, fit_prior=True)
    nb.fit(X_train, y_train)
    #print('best alpha: ', alphaVal)
   
    #predict on train set
    y_hat_train = nb.predict(X_train)
    print('\t Training accuracy: \t', accuracy_score(y_train, y_hat_train)  )

    #predict on test set
    y_hat_test = nb.predict(X_test)
    print('\t Test accuracy: \t', accuracy_score(y_test, y_hat_test)  )

### Linear SVM


In [7]:
#Linear SVM
from sklearn.svm import LinearSVC

#5-fold cross-validation
def linSVM_cv(parameters, X, T, fold) : 
    n_folds = 5
    svc = LinearSVC(multi_class='ovr', max_iter = 2000, dual=False)
    svc_cv = GridSearchCV(svc, parameters, cv=fold, scoring="accuracy", refit=False)
    svc_cv.fit(X, T)
    scores = svc_cv.cv_results_['mean_test_score']
    params = svc_cv.cv_results_['params']
#    print('scores:',scores)
#    print('params:',params)
    return svc_cv.best_params_

#search for C value
def SVM_get_hyperparameters(values, scale, X_cv_split, Y_cv_split, fold) : 
    prevBest = 1000
    while (True) : 
        param_grid = {'C': values}
        bestParams = linSVM_cv(param_grid, X_cv_split, Y_cv_split, fold)
        curBest = bestParams['C']
        if (abs(curBest - prevBest ) > .0001) : 
            inc = curBest/10
            lb = curBest - inc*scale
            ub = curBest + inc*scale
            values = np.arange(lb, ub, inc)
            prevBest = curBest
        else : return curBest

def SVM_predict(C, X_train, X_test, y_train, y_test) :
    
    svm = LinearSVC(C=C, multi_class='ovr')
    svm.fit(X_train, y_train)
    #print('best C: ', cVal)

    #predict on train set
    y_hat_train = svm.predict(X_train)
    print('\t Training accuracy: \t', accuracy_score(y_train, y_hat_train)  )

    #predict on test set
    y_hat_test = svm.predict(X_test)
    print('\t Test accuracy: \t', accuracy_score(y_test, y_hat_test)  )

### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

#5-fold cross-validation
def LogReg_cv(parameters, X, T, fold) : 
    n_folds = 5
    logreg = LogisticRegression(multi_class='ovr', solver='liblinear', max_iter = 4000, dual=False)
    logreg_cv = GridSearchCV(logreg, parameters, cv=fold, scoring="accuracy", refit=False)
    logreg_cv.fit(X, T) 
    scores = logreg_cv.cv_results_['mean_test_score']
    params = logreg_cv.cv_results_['params']
#    print('scores:',scores)
#    print('params:',params)
    return logreg_cv.best_params_

#search for C value
def LogReg_get_hyperparameters(values, scale, X_cv_split, Y_cv_split, fold) : 
    prevBest = 1000
    while (True) : 
        param_grid = {'C': values}
        bestParams = LogReg_cv(param_grid, X_cv_split, Y_cv_split, fold)
        curBest = bestParams['C']
        if (abs(curBest - prevBest ) > .0001) : 
            inc = curBest/10
            lb = curBest - inc*scale
            ub = curBest + inc*scale
            values = np.arange(lb, ub, inc)
            prevBest = curBest
        else : return curBest

        
def LogReg_predict(C, X_train, X_test, y_train, y_test) :     
    logreg = LogisticRegression(C=C, multi_class='ovr', solver='lbfgs',  max_iter = 4000, dual=False)
    logreg.fit(X_train, y_train)
    #print('best C: ', cVal)

    #predict on train set
    y_hat_train = logreg.predict(X_train)
    print('\t Training accuracy: \t', accuracy_score(y_train, y_hat_train)  )

    #predict on test set
    y_hat_test = logreg.predict(X_test)
    print('\t Test accuracy: \t', accuracy_score(y_test, y_hat_test)  )

## BBoW - Predictions

In [9]:
X_train, X_valid, X_test = BBoW(X_train_text, X_valid_text, X_test_text)
X_cv_split, Y_cv_split, fold = cv_split(X_train, X_valid, y_train, y_valid)

values = [ 0.0001, 0.001, 0.01, 0.1, 1, 10]

#### Multinomial Naive Bayes

In [10]:
alpha = MNB_get_hyperparameters(values, 5, X_cv_split, Y_cv_split, fold) 

MNB_predict(alpha, X_train, X_test, y_train, y_test) 

	 Training accuracy: 	 0.6632232142857143
	 Test accuracy: 	 0.6565833333333333


#### Linear SVM

In [11]:
C = SVM_get_hyperparameters(values, 5, X_cv_split, Y_cv_split, fold) 

SVM_predict(C, X_train, X_test, y_train, y_test) 

	 Training accuracy: 	 0.7096875
	 Test accuracy: 	 0.68335


#### Logistic Regression

In [17]:
C = LogReg_get_hyperparameters(values, 5, X_cv_split, Y_cv_split, fold) 

LogReg_predict(C, X_train, X_test, y_train, y_test) 

	 Training accuracy: 	 0.7091160714285715
	 Test accuracy: 	 0.6806


## tf-idf - Predictions

In [19]:
X_train, X_valid, X_test = tfidf(X_train_text, X_valid_text, X_test_text)
X_cv_split, Y_cv_split, fold = cv_split(X_train, X_valid, y_train, y_valid)

values = [ 0.0001, 0.001, 0.01, 0.1, 1, 10]

#### Multinomial Naive Bayes

In [20]:
alpha = MNB_get_hyperparameters(values, 5, X_cv_split, Y_cv_split, fold) 

MNB_predict(alpha, X_train, X_test, y_train, y_test) 

	 Training accuracy: 	 0.6802470238095238
	 Test accuracy: 	 0.6735


#### Linear SVM

In [21]:
C = SVM_get_hyperparameters(values, 5, X_cv_split, Y_cv_split, fold) 

SVM_predict(C, X_train, X_test, y_train, y_test) 

	 Training accuracy: 	 0.7234642857142857
	 Test accuracy: 	 0.6954


#### Logistic Regression

In [22]:
C = LogReg_get_hyperparameters(values, 5, X_cv_split, Y_cv_split, fold) 

LogReg_predict(C, X_train, X_test, y_train, y_test)

	 Training accuracy: 	 0.7158214285714286
	 Test accuracy: 	 0.69335
