# Logistic Regression Classifiers Amazon Review Summaries

In this notebook we will apply `LogisticRegression` to a binary classification
problem.

As the bag of words representation of a document is high dimensional we will use $\chi^2$ feature selection to select a subspace of a manageable dimension. 


## Preliminaries

### Imports

In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# for a progress bar on slow calculations
from ipywidgets import IntProgress
from IPython.display import display

import sys
sys.path.append("../..")
from E4525_ML import plots
import E4525_ML.text as text
from E4525_ML.multiclass_logistic import LogisticGDClassifier
%matplotlib inline
plt.rcParams['figure.figsize'] = (8.0, 6.0) # set default size of plots

In [2]:
# Seed NumPy's global random generator once, up front, so that the randomized
# steps executed later in this notebook are repeatable on a fresh Run All.
seed=23
np.random.seed(seed)

### Data

#### Data Directories

In [3]:
# Directory (relative to this notebook) holding the CSV files read and written below.
data_dir=r"../../raw/amazon-reviews"

#### Corpus

In [4]:
# Load the training corpus: one row per review, with its "Summary" text and "Score".
documents_filename=data_dir+"/documents.csv"
documents=pd.read_csv(documents_filename)
# Replace missing summaries by the empty string so every entry is a str.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: under pandas copy-on-write the
# in-place form only modifies a temporary and leaves the NaN values in `documents`.
documents["Summary"]=documents["Summary"].fillna("")
documents.head()

Unnamed: 0,Summary,Score
0,Nice,5
1,No taste or flavor,2
2,spicy thai chips,5
3,I'm Loven The Mount Hagen!,5
4,Bigelow Tea Chest,2


In [5]:
# Load the held-out test corpus (same columns as the training corpus).
test_documents_filename=data_dir+"/documents_test.csv"
test_documents=pd.read_csv(test_documents_filename)
# Replace missing summaries by the empty string so every entry is a str.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: under pandas copy-on-write the
# in-place form only modifies a temporary and leaves the NaN values in `test_documents`.
test_documents["Summary"]=test_documents["Summary"].fillna("")
test_documents.head()

Unnamed: 0,Summary,Score
0,Huge disappointment,1
1,In lieu of the now bastardized American black ...,3
2,Wow.,5
3,Tastes just like the store,5
4,a very handi snack,5



## Validation Set Creation

In [6]:
# Hold out 20% of the labelled summaries as a validation set.
# random_state is passed explicitly so the partition does not depend on how much
# of the global NumPy random stream has been consumed before this cell runs;
# re-executing the cell therefore always reproduces the same split.
docs_train,docs_val,label_train,label_val=train_test_split(
    documents["Summary"],documents["Score"],test_size=0.2,random_state=seed)

In [7]:
# Binary target: True when the review score is above 3, False otherwise.
Y_train=label_train>3
Y_val=label_val>3

## Generic Text Classifier with Feature Selection

Selecting $F$ features based on a $\chi^2$ test makes sense for any classifier.

For convenience we create a new wrapper classifier that selects $F$ features and calls the underlying
classifier with them.

In [8]:
class TextChi2Classifier:
    """Classifier wrapper that keeps the `F` best features by chi-squared score.

    The wrapper owns a `SelectKBest(chi2)` transformer. `fit` learns the feature
    selection from the training data and trains the wrapped model on the reduced
    matrix; `predict` and `predict_proba` apply the same selection before
    delegating to the wrapped model.

    Parameters
    ----------
    F : number of features to keep (forwarded to `SelectKBest` as `k`).
    model : object exposing `fit(X, Y, X_val, Y_val)`, `predict(X)` and
        `predict_proba(X)`; it is stored as-is and trained in place.
    """
    def __init__(self,F,model):
        self.model=model
        # `k` is keyword-only in current scikit-learn releases, so passing the
        # feature count positionally raises a TypeError there.
        self.transformer=SelectKBest(chi2,k=F)
    def fit(self,X,Y,X_val=None,Y_val=None):
        """Select features on (X, Y), then fit the wrapped model on the selected columns.

        X_val / Y_val are optional; when X_val is given it is reduced with the
        selection learned on X and forwarded, together with Y_val, to the
        wrapped model's `fit`. Returns `self`.
        """
        # The selector is scored against a one-hot (indicator) encoding of the labels.
        dummies=pd.get_dummies(Y,prefix="",prefix_sep="",sparse=True)
        Z=dummies
        Xt=self.transformer.fit_transform(X,Z)
        Xt_val=None
        if X_val is not None:
            # Validation data must go through the selection fitted on the training data.
            Xt_val=self.transformer.transform(X_val)
        # The wrapped model is trained on the original labels, not on the indicators.
        self.model.fit(Xt,Y,Xt_val,Y_val)
        return self
    def predict(self,X):
        """Return the wrapped model's predictions for X after feature selection."""
        Xt=self.transformer.transform(X)
        return self.model.predict(Xt)
    def predict_proba(self,X):
        """Return the wrapped model's `predict_proba` output for X after feature selection."""
        Xt=self.transformer.transform(X)
        return self.model.predict_proba(Xt)

In [9]:
# Bag-of-words encoder; input="content" means it is fed the text itself.
countVectorizer=CountVectorizer(input="content")

In [10]:
# Learn the vocabulary on the training summaries only, then encode the
# validation summaries with that same vocabulary.
X=countVectorizer.fit_transform(docs_train)
X_val=countVectorizer.transform(docs_val)
Y=Y_train

In [11]:
# Number of columns of the training matrix, i.e. the size of the learned vocabulary.
V=X.shape[1]
V

29084

In [12]:
# Underlying classifier that TextChi2Classifier wraps in the cells below.
base_model=LogisticGDClassifier(max_iter=100, learning_rate=0.005,tol=1e-8)

In [13]:
# Number of chi-squared-selected features used for the first trial model.
F=1000

In [None]:
# Wrap the base model so that it only sees the F selected features.
modelT=TextChi2Classifier(F,base_model)

In [None]:
# Train on the training split; the validation split is forwarded to the wrapped model's fit.
modelT.fit(X,Y,X_val,Y_val)

	 0 Loss = 694.3814033060282 Train_Accuracy 0.486 Evaluation Loss = 695.5799487147383 Accuracy = 0.468
	 10 Loss = 291.3810585812772 Train_Accuracy 0.886 Evaluation Loss = 314.86817904286573 Accuracy = 0.866
	 20 Loss = 266.37244396181256 Train_Accuracy 0.904 Evaluation Loss = 321.65899448925563 Accuracy = 0.861
	 30 Loss = 292.5577553480849 Train_Accuracy 0.886 Evaluation Loss = 307.31609740925273 Accuracy = 0.883
	 40 Loss = 296.2300938608464 Train_Accuracy 0.881 Evaluation Loss = 315.6748974738257 Accuracy = 0.865
	 50 Loss = 292.3599124065488 Train_Accuracy 0.883 Evaluation Loss = 292.4774083290627 Accuracy = 0.883
	 60 Loss = 307.25681602989175 Train_Accuracy 0.869 Evaluation Loss = 299.4299727273892 Accuracy = 0.877
	 70 Loss = 287.4125703584782 Train_Accuracy 0.883 Evaluation Loss = 306.7181224645605 Accuracy = 0.873
	 80 Loss = 309.179228609953 Train_Accuracy 0.869 Evaluation Loss = 304.4187182424062 Accuracy = 0.879
	 90 Loss = 273.85731570380875 Train_Accuracy 0.881 Evaluatio

In [None]:
# Validation accuracy: fraction of validation rows whose prediction matches the label.
Y_pred=modelT.predict(X_val)
np.mean(Y_pred==Y_val)

In [None]:
# ROC curve on the validation split, scored with column 1 of predict_proba.
# NOTE(review): column 1 is presumably the positive (True) class — confirm against LogisticGDClassifier.
prob=modelT.predict_proba(X_val)
fpr,tpr,threshold=metrics.roc_curve(Y_val,prob[:,1])

In [None]:
# Area under the validation ROC curve, from the same scores as above.
auc=metrics.roc_auc_score(Y_val,prob[:,1])
print(auc)

 ##  LogisticRegression Classifier vs Number of Features

In [None]:
# Feature counts to sweep; the last entry, V, keeps every column of the training matrix.
Fs=[100,200,500,1000,5000,10000,20000,V]

In [None]:
# Display a progress bar that counts how many training rounds we have done so far
progress2= IntProgress(min=0, max=len(Fs),description="Trials:",bar_style="info")
display(progress2)


# Train/validation classification error for each candidate feature count in Fs.
logistic_train_error=[]
logistic_val_error=[]
progress2.value=0
# A dedicated loop variable is used on purpose: iterating with `F` would
# overwrite the notebook-level `F`, so re-running an earlier cell that reads `F`
# after this sweep would silently use the last value of Fs instead.
for n_features in Fs:
    # NOTE(review): the same base_model instance is refit on every trial; this
    # assumes its fit() starts from scratch each time — confirm.
    model=TextChi2Classifier(n_features,base_model)
    model.fit(X,Y,X_val,Y_val)
    Y_pred=model.predict(X)
    logistic_train_error.append(1-np.mean(Y_pred==Y))
    Y_pred=model.predict(X_val)
    logistic_val_error.append(1-np.mean(Y_pred==Y_val))
    progress2.value+=1
print("Done.")

In [None]:
# Training and validation error as a function of the number of selected features.
plt.plot(Fs,logistic_train_error,label="Train")
# The second curve is the error on the validation split, so label it accordingly.
plt.plot(Fs,logistic_val_error,label="Validation")
plt.legend()
plt.title("Logistic Classifier Error vs Number Features")
plt.xlabel("Features")
plt.ylabel("Classification Error")
# Trailing semicolon suppresses the (0, 0.15) tuple that would otherwise be echoed.
plt.ylim(0,0.15);

In [None]:
# Position of the lowest validation error in the sweep, shown together with the
# corresponding feature count and error value.
logistic_best_idx=np.argmin(logistic_val_error)
logistic_best_idx,Fs[logistic_best_idx],logistic_val_error[logistic_best_idx]

1. The model is generalizing pretty well (91% train accuracy versus 89% validation accuracy) 
2. Validation error stabilizes after roughly 5k features
3. Logistic Regression performance does not degrade as we keep increasing the number of features

Best Accuracy is $\approx 89\%$, *slightly* better than Naive Bayes

## Regularization of the Logistic Regression Classifier

In [None]:
# Feature count that gave the lowest validation error in the sweep over Fs.
F_best=Fs[logistic_best_idx]
F_best

In [None]:
# Candidate values of C; the sweep below passes penalty=1/C to the classifier,
# so a larger C means a smaller penalty value.
Cs=[0.0001,0.001,0.01,0.1,1,10,100,1000,10000,1e10]

In [None]:
# Display a progress bar that counts how many training rounds we have done so far
progress3= IntProgress(min=0, max=len(Cs),description="Trials:",bar_style="info")
display(progress3)

# NOTE: this rebinds `logistic_val_error`, replacing the per-feature-count errors
# of the earlier sweep; F_best must already have been derived from them.
logistic_val_error=[]
progress3.value=0
for C in Cs:
    # The classifier is parameterized by a penalty, given here as 1/C.
    # The per-trial model gets its own name so the notebook-level `base_model`
    # is not overwritten by whichever value of C happens to come last.
    penalized_model=LogisticGDClassifier(penalty=1/C,max_iter=100, learning_rate=0.005,tol=1e-8)
    model=TextChi2Classifier(F_best,penalized_model)
    model.fit(X,Y,X_val,Y_val)
    Y_pred=model.predict(X_val)
    acc=np.mean(Y_pred==Y_val)
    logistic_val_error.append(1-acc)
    print(C,1-acc)
    progress3.value+=1
print("Done.")

In [None]:
# Validation error against the penalty actually passed to the classifier (1/C),
# on a logarithmic x-axis because the candidate values span many decades.
plt.semilogx(1/np.array(Cs),logistic_val_error)
plt.title("Logistic Classifier Error vs Regularization Penalty")
plt.xlabel("Penalty (1/C)")
plt.ylabel("Classification Error");

In [None]:
# Convert the collected errors to an array and list them next to their C value.
logistic_val_error=np.array(logistic_val_error)
print("C,error")
for C,error_for_C in zip(Cs,logistic_val_error):
    print(C,error_for_C)

In [None]:
# Pick the C with the lowest validation error.
# np.argmin accepts both a list and an array, so this cell does not depend on
# an earlier cell having converted `logistic_val_error` to a NumPy array.
best_idx=np.argmin(logistic_val_error)
best_C=Cs[best_idx]
print(best_idx,best_C,logistic_val_error[best_idx])

## Test best model

Best model is logistic regression, with C=10 used on all features

In [None]:
# Refit the vocabulary on the complete training corpus (training and validation
# rows together) and encode the test summaries with that same vocabulary.
# Note: this rebinds X, which until now held only the training split.
X=countVectorizer.fit_transform(documents["Summary"])
X_test=countVectorizer.transform(test_documents["Summary"])


In [None]:
# Binary targets for the full training corpus and for the test corpus (score above 3).
# Note: this rebinds Y, which until now held only the training-split labels.
Y=documents["Score"]>3
Y_test=test_documents["Score"]>3

In [None]:
# Final model: the selected penalty (1/best_C) combined with the selected feature count.
base_model=LogisticGDClassifier(penalty=1/best_C,max_iter=100, learning_rate=0.005,tol=1e-8)
model=TextChi2Classifier(F_best,base_model)

In [None]:
# Fit on the full training corpus.
# NOTE(review): the test split is passed in the validation slot of fit(); confirm
# that LogisticGDClassifier only uses it for progress reporting and not for any
# training decision, otherwise the test metrics below are optimistic.
model.fit(X,Y,X_test,Y_test)

In [None]:
# Test accuracy: fraction of test rows whose prediction matches the label.
Y_pred=model.predict(X_test)
np.mean(Y_pred==Y_test)

## ROC Curve

In [None]:

# ROC curve on the test set, scored with column 1 of predict_proba.
prob=model.predict_proba(X_test)
fpr,tpr,threshold=metrics.roc_curve(Y_test,prob[:,1])
# Stack the three arrays column-wise and save them as a CSV with one row per ROC point.
roc=np.c_[fpr,tpr,threshold]
data=pd.DataFrame(roc,columns=["fpr","tpr","threshold"])
data.to_csv(data_dir+"/logistic_roc.csv",index=False)


In [None]:
# Area under the test ROC curve, from the same scores as above.
auc=metrics.roc_auc_score(Y_test,prob[:,1])
print(auc)

In [None]:
# Index of the ROC point whose threshold is closest to 0.5.
ht=np.argmin((threshold-0.5)**2)
print(ht,threshold[ht])
print(fpr[ht],tpr[ht])
# Accuracy at that operating point, rebuilt from the two rates:
# (1-FPR)*P(negative) + TPR*P(positive).
print("Accuracy",(1-fpr[ht])*np.mean(Y_test==0)+tpr[ht]*np.mean(Y_test==1))

In [None]:
# Load the previously saved ROC curve that is plotted as "Naive Bayes" below.
set_roc=pd.read_csv(data_dir+"/set_roc.csv")
# Row of *this* curve whose threshold is closest to 0.5.
set_ht=((set_roc["threshold"]-0.5)**2).argmin()
# Index set_roc with its own row position (set_ht). Using the logistic curve's
# index `ht` here would read an unrelated row, or fail when the two curves have
# a different number of points.
print(set_ht,set_roc["threshold"].iloc[set_ht])
set_fpr=set_roc["fpr"].iloc[set_ht]
set_tpr=set_roc["tpr"].iloc[set_ht]
print(set_fpr,set_tpr)
# Accuracy at that operating point: (1-FPR)*P(negative) + TPR*P(positive).
print("Accuracy",(1-set_fpr)*np.mean(Y_test==0)+set_tpr*np.mean(Y_test==1))

In [None]:
# Compare the two ROC curves and mark, on each, the point whose threshold is closest to 0.5.
plt.figure(figsize=(10,8))
plt.clf()
plt.subplot(111)

# Each curve is drawn with the colour matplotlib assigns automatically, and that
# colour is read back from the returned line for the matching marker. This avoids
# the private `ax._get_lines.prop_cycler` attribute, which is not available in
# recent matplotlib releases.
naive_bayes_line,=plt.plot(set_roc["fpr"],set_roc["tpr"],label="Naive Bayes")
plt.plot([set_fpr], [set_tpr], marker='D', markersize=10, color=naive_bayes_line.get_color())


logistic_line,=plt.plot(fpr,tpr,label="Logistic")
plt.plot([fpr[ht]], [tpr[ht]], marker='D', markersize=10, color=logistic_line.get_color())

plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate");