<a href="https://colab.research.google.com/github/eriksali/DNN_2023_NLP/blob/main/NLP_11_logistic_regression_held_out.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
'''
CSI 5900: Lecture 11 Code Examples
Prof. Steven Wilson, Oakland University

Logistic Regression for Binary Classification

We will use a dataset of IMDB reviews from:

Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). 
"Learning Word Vectors for Sentiment Analysis." The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
'''

! wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! tar -xzf aclImdb_v1.tar.gz

##! cat aclImdb/README

import glob
pos_train_files = glob.glob('aclImdb/train/pos/*')
neg_train_files = glob.glob('aclImdb/train/neg/*')
##print(pos_train_files[:5]) 

from sklearn.feature_extraction.text import TfidfVectorizer
# only use 1000 data points per class for now to make things faster/simpler
num_files_per_class = 1000
all_train_files = pos_train_files[:num_files_per_class] + neg_train_files[:num_files_per_class]
vectorizer = TfidfVectorizer(input="filename", stop_words="english")
vectors = vectorizer.fit_transform(all_train_files)
##vectors

##len(vectorizer.vocabulary_)

##vectors[0].sum()

X = vectors
y = [1] * num_files_per_class + [0] * num_files_per_class
##len(y)

import numpy as np
x_0 = X[0]
w = np.zeros(X.shape[1])
x_0_dense = x_0.todense()
x_0.dot(w)

import random
import numpy as np
from scipy.special import expit


# Cross-entropy

def sgd_for_lr_with_ce(X, y, num_passes=5, learning_rate = 0.1):

    num_data_points = X.shape[0]

    # Initialize theta -> 0
    num_features = X.shape[1]
    w = np.zeros(num_features)
    b = 0.0

    # repeat until done
    # how to define "done"? let's just make it num passes for now
    # we can also do norm of gradient and when it is < epsilon (something tiny)
    # we stop

    for current_pass in range(num_passes):
        
        # iterate through entire dataset in random order
        order = list(range(num_data_points))
        random.shuffle(order)
        for i in order:

            # compute y-hat for this value of i given y_i and x_i
            x_i = X[i]
            y_i = y[i]

            # need to compute based on w and b
            # sigmoid(w dot x + b)
            z = x_i.dot(w) + b
            y_hat_i = expit(z)

            # for each w (and b), modify by -lr * (y_hat_i - y_i) * x_i
            w = w - learning_rate * (y_hat_i - y_i) * x_i
            b = b - learning_rate * (y_hat_i - y_i)

    # return theta
    return w,b

w,b = sgd_for_lr_with_ce(X,y)

#w

sorted_vocab = sorted([(k,v) for k,v in vectorizer.vocabulary_.items()],key=lambda x:x[1])
sorted_vocab = [a for (a,b) in sorted_vocab]

sorted_words_weights = sorted([x for x in zip(sorted_vocab, w)], key=lambda x:x[1])
sorted_words_weights[-50:]

# get the predictions
def predict_y_lr(w,b,X,threshold=0.5):

    # use our matrix operation version of the logistic regression model
    # X dot w + b
    # need to make w a column vector so the dimensions line up correctly
    y_hat = X.dot( w.reshape((-1,1)) ) + b

    # then just check if it's > threshold
    preds = np.where(y_hat > threshold,1,0)

    return preds

preds = predict_y_lr(w,b,X)

preds

# compute training set results
from sklearn.metrics import classification_report
w,b = sgd_for_lr_with_ce(X, y, num_passes=10)
y_pred = predict_y_lr(w,b,X)
print(classification_report(y, y_pred))



--2023-03-17 23:02:09--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.3’


2023-03-17 23:02:15 (12.2 MB/s) - ‘aclImdb_v1.tar.gz.3’ saved [84125825/84125825]

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1000
           1       0.99      0.90      0.94      1000

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [6]:
##! wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
##! tar -xzf aclImdb_v1.tar.gz

from sklearn.datasets import load_files

# Load the data
train_data = load_files('aclImdb/train/', categories=['pos', 'neg'], shuffle=True, random_state=42)
test_data = load_files('aclImdb/test/', categories=['pos', 'neg'], shuffle=True, random_state=42)

# Extract the text and labels from the data
X_train, y_train = train_data.data, train_data.target
X_test, y_test = test_data.data, test_data.target

from sklearn.feature_extraction.text import CountVectorizer

# Convert the text data into bag-of-words features
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Train the logistic regression model
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = clf.predict(X_test)
'''acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print('Accuracy:', acc)
print('F1-score:', f1)'''

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


--2023-03-17 23:04:36--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.5’


2023-03-17 23:04:42 (13.5 MB/s) - ‘aclImdb_v1.tar.gz.5’ saved [84125825/84125825]

              precision    recall  f1-score   support

           0       0.86      0.87      0.87     12500
           1       0.87      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.86      0.87      0.87     12500
           1       0.87      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

