# Problem A: Bag of Words and Logistic Regression Classifier

In [78]:
import numpy as np
import pandas as pd
import os

import sklearn.linear_model
import sklearn.pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle

In [79]:
# imports training data from files
def load_train_data(data_dir="data_reviews"):
    """Read the training inputs and labels from CSV files.

    Args:
        data_dir: Directory that contains ``x_train.csv`` and ``y_train.csv``.
            Defaults to ``"data_reviews"`` (relative to the working
            directory), which is the location the notebook originally used.

    Returns:
        A tuple ``(x_train_df, y_train_df)`` of DataFrames exactly as parsed
        by ``pd.read_csv``.
    """
    x_train_df = pd.read_csv(os.path.join(data_dir, "x_train.csv"))
    y_train_df = pd.read_csv(os.path.join(data_dir, "y_train.csv"))
    return x_train_df, y_train_df

In [80]:
def split_by_website(x_N):
    """Group the rows of a 2-D array by the site name stored in column 0.

    Args:
        x_N: 2-D array whose column 0 is compared against the literal site
            names "amazon", "imdb" and "yelp", and whose column 1 is the
            value kept for each row.

    Returns:
        A tuple ``(x_amazon, x_imdb, x_yelp)``. Each element is a list of
        ``(column_1_value, row_index)`` pairs in original row order. Rows
        whose column 0 matches none of the three names appear in no list.
    """
    row_ids = range(x_N.shape[0])

    def rows_from(site):
        # Keep the row index next to the text so callers can look up labels.
        return [(x_N[row, 1], row) for row in row_ids if x_N[row, 0] == site]

    return rows_from("amazon"), rows_from("imdb"), rows_from("yelp")

In [81]:
def load_test_data(data_dir="data_reviews"):
    """Read the test inputs from a CSV file.

    Args:
        data_dir: Directory that contains ``x_test.csv``. Defaults to
            ``"data_reviews"`` (relative to the working directory), which is
            the location the notebook originally used.

    Returns:
        The DataFrame exactly as parsed by ``pd.read_csv``.
    """
    x_test_df = pd.read_csv(os.path.join(data_dir, "x_test.csv"))
    return x_test_df

In [82]:
def print_word_freq(x_train_N, min_df=4):
    """Print each vocabulary term with its total count, most frequent first.

    Args:
        x_train_N: Iterable of raw text documents to vectorize.
        min_df: Passed to ``CountVectorizer(min_df=...)``. Defaults to 4,
            the value this function originally hard-coded.

    Output:
        One line per term in the form ``"<term> -- <count>"``, sorted by
        count in descending order.
    """
    # min_df is a *document-frequency* threshold: a term is kept only if it
    # occurs in at least `min_df` different documents. It is not a count of
    # occurrences inside a single text entry.
    vectorizer = CountVectorizer(min_df=min_df, binary=False)
    x_vec_N = vectorizer.fit_transform(x_train_N)

    # Sum each column directly on the sparse matrix; this avoids building a
    # dense (documents x vocabulary) array just to total the columns.
    term_totals = np.asarray(x_vec_N.sum(axis=0)).ravel()

    freq = [
        (term, term_totals[index])
        for term, index in vectorizer.vocabulary_.items()
    ]

    for term, count in sorted(freq, key=lambda pair: pair[1], reverse=True):
        print(f"{term} -- {count}")

In [83]:
# pipeline for BoW representation + logistic regression classifier
def make_bow_classifier_pipeline():
    """Build an unfitted pipeline: bag-of-words counts, then logistic regression.

    The step names ("bow_feature_extractor" and "classifier") are the prefixes
    used when addressing each step's hyperparameters, e.g.
    "classifier__C" in a search grid.

    Returns:
        A ``sklearn.pipeline.Pipeline`` with the two steps in that order.
    """
    # Step 1: raw text -> token-count feature vectors.
    bow_step = (
        "bow_feature_extractor",
        CountVectorizer(min_df=2, max_df=1.0, ngram_range=(1, 1)),
    )

    # Step 2: classifier on top of the count features. The values set here
    # are only starting points when the pipeline is wrapped in a search.
    classifier_step = (
        "classifier",
        sklearn.linear_model.LogisticRegression(C=1.0, max_iter=100000),
    )

    return sklearn.pipeline.Pipeline(steps=[bow_step, classifier_step])

In [84]:
def make_hyperparam_grid():
    """Return the hyperparameter grid searched for the BoW + classifier pipeline.

    Keys use the ``<step name>__<parameter>`` form, so they address the
    "bow_feature_extractor" and "classifier" pipeline steps.

    Returns:
        A dict mapping each parameter name to the candidate values to try:
        5 values of ``min_df``, 8 of ``max_df`` and 20 log-spaced values of
        ``C`` between 1e-2 and 1e2.
    """
    return {
        "bow_feature_extractor__min_df": [1, 2, 3, 4, 8],
        "bow_feature_extractor__max_df": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3],
        "classifier__C": np.logspace(-2, 2, 20),
    }

# Import data and train model using GridSearch

In [85]:
# Read the training inputs/labels and the test inputs from disk.
x_train_df, y_train_df = load_train_data()

x_test_df = load_test_data()

# Group the training rows by site. Each result is a list of
# (column-1 value, row index) pairs; split_by_website reads the site name
# from column 0 of the array.
x_amazon, x_imdb, x_yelp = split_by_website(x_train_df.to_numpy())

In [86]:
# Each x_<site> is a list of (text, row index) pairs. Unzip it into two
# parallel sequences and turn each one into a NumPy array.
amazon_text, amazon_indices = (np.asarray(list(column)) for column in zip(*x_amazon))
imdb_text, imdb_indices = (np.asarray(list(column)) for column in zip(*x_imdb))
yelp_text, yelp_indices = (np.asarray(list(column)) for column in zip(*x_yelp))

y_train_N = y_train_df.to_numpy()

# Pick each site's labels by original row index, keeping only column 0 so
# the result is a 1-D array.
y_amazon = y_train_N[amazon_indices, 0]
y_imdb = y_train_N[imdb_indices, 0]
y_yelp = y_train_N[yelp_indices, 0]

In [87]:
# Shuffle each site's texts together with its labels so the pairs stay
# aligned; the fixed random_state makes the permutation repeatable.
(amazon_text, y_amazon), (imdb_text, y_imdb), (yelp_text, y_yelp) = (
    shuffle(site_text, site_labels, random_state=2)
    for site_text, site_labels in (
        (amazon_text, y_amazon),
        (imdb_text, y_imdb),
        (yelp_text, y_yelp),
    )
)

In [88]:
def _make_grid_searcher(pipeline, hyperparam_grid):
    """Wrap a pipeline in the 10-fold, ROC-AUC-scored grid search used per site.

    n_jobs=-1 runs the candidate fits on all available cores. The grid has
    5 * 8 * 20 = 800 candidates, i.e. 8000 fits per search with cv=10, so a
    serial search is needlessly slow. The candidates scored and the metric
    used are unchanged.
    """
    return GridSearchCV(
        pipeline,
        hyperparam_grid,
        cv=10,
        scoring="roc_auc",
        n_jobs=-1,
    )


# One independent pipeline, grid and searcher per site.
amazon_pipeline = make_bow_classifier_pipeline()
amazon_hyperparam_grid = make_hyperparam_grid()
amazon_grid_searcher = _make_grid_searcher(amazon_pipeline, amazon_hyperparam_grid)

imdb_pipeline = make_bow_classifier_pipeline()
imdb_hyperparam_grid = make_hyperparam_grid()
imdb_grid_searcher = _make_grid_searcher(imdb_pipeline, imdb_hyperparam_grid)

yelp_pipeline = make_bow_classifier_pipeline()
yelp_hyperparam_grid = make_hyperparam_grid()
yelp_grid_searcher = _make_grid_searcher(yelp_pipeline, yelp_hyperparam_grid)

# Order matters: later cells index this list as 0=amazon, 1=imdb, 2=yelp.
grid_searchers = [amazon_grid_searcher, imdb_grid_searcher, yelp_grid_searcher]

# Fit each searcher on its own site's (shuffled) texts and labels.
amazon_grid_searcher.fit(amazon_text, y_amazon)
imdb_grid_searcher.fit(imdb_text, y_imdb)
yelp_grid_searcher.fit(yelp_text, y_yelp)


In [89]:
# Best refit estimator from each searcher, in grid_searchers order.
best_models = [searcher.best_estimator_ for searcher in grid_searchers]

# Group the test rows by site, as lists of (text, row index) pairs.
x_test_amazon, x_test_imdb, x_test_yelp = split_by_website(x_test_df.to_numpy())

# For each site, keep only the texts (row indices land in `_`) and convert
# them to an array for the matching model.
x_test_amazon_text, _ = zip(*x_test_amazon)
x_test_amazon_text = np.asarray(x_test_amazon_text)

x_test_imdb_text, _ = zip(*x_test_imdb)
x_test_imdb_text = np.asarray(x_test_imdb_text)

x_test_yelp_text, _ = zip(*x_test_yelp)
x_test_yelp_text = np.asarray(x_test_yelp_text)

In [90]:
# Probability of the positive class (column 1 of predict_proba) from each
# site's best model, in the order of that site's test rows.
yhat_amazon = best_models[0].predict_proba(x_test_amazon_text)[:,1]
yhat_imdb = best_models[1].predict_proba(x_test_imdb_text)[:,1]
yhat_yelp = best_models[2].predict_proba(x_test_yelp_text)[:,1]

# Write every prediction back to the row it came from. Concatenating the
# three arrays is only correct if the test file already lists all amazon
# rows, then all imdb rows, then all yelp rows. Scattering by the row index
# stored in each (text, row index) pair is correct for any row order.
# Rows whose site is none of the three stay NaN instead of shifting the
# alignment of every following line.
yhat_test_N = np.full(x_test_df.shape[0], np.nan)
for site_rows, site_proba in (
    (x_test_amazon, yhat_amazon),
    (x_test_imdb, yhat_imdb),
    (x_test_yelp, yhat_yelp),
):
    yhat_test_N[[row for _text, row in site_rows]] = site_proba

# NOTE(review): the combined-model cell further down saves to this same
# path, so whichever cell runs last determines the file's contents.
np.savetxt("yproba1_test.txt", yhat_test_N)

# Report the cross-validated score and chosen hyperparameters per site.
for searcher in grid_searchers:
    best_params = searcher.best_params_
    best_score = searcher.best_score_

    print(f"best score: {best_score}; best params:", best_params)
    



best score: 0.89925; best params: {'bow_feature_extractor__max_df': 1.0, 'bow_feature_extractor__min_df': 1, 'classifier__C': 37.92690190732246}
best score: 0.8520000000000001; best params: {'bow_feature_extractor__max_df': 0.3, 'bow_feature_extractor__min_df': 1, 'classifier__C': 3.359818286283781}
best score: 0.89175; best params: {'bow_feature_extractor__max_df': 0.3, 'bow_feature_extractor__min_df': 1, 'classifier__C': 3.359818286283781}


In [91]:
# Drop the site column: keep only column 1 of the inputs (the text) and
# column 0 of the labels, as 1-D arrays.
x_test_N = x_test_df.to_numpy()[:, 1]

# Shuffle training texts and labels together with a fixed seed so the pairs
# stay aligned and the order is repeatable.
x_train_N, y_train_N = shuffle(
    x_train_df.to_numpy()[:, 1],
    y_train_df.to_numpy()[:, 0],
    random_state=2,
)

In [92]:
# Single model trained on all sites together.
pipeline = make_bow_classifier_pipeline()
hyperparam_grid = make_hyperparam_grid()

# The grid has 5 * 8 * 20 = 800 candidates, i.e. 8000 fits with cv=10.
# n_jobs=-1 spreads those fits over all available cores so the cell can
# finish in reasonable time. The candidates and the scoring are unchanged.
grid_searcher = GridSearchCV(
    pipeline,
    hyperparam_grid,
    cv=10,
    scoring="roc_auc",
    n_jobs=-1,
)

grid_searcher.fit(x_train_N, y_train_N)

best_model = grid_searcher.best_estimator_
best_params = grid_searcher.best_params_
best_score = grid_searcher.best_score_

# predict_proba returns one column per class; column 1 is saved below.
yhat_proba_test_N = best_model.predict_proba(x_test_N)

# NOTE(review): the per-site prediction cell above saves to this same path,
# so this call overwrites that output.
np.savetxt("yproba1_test.txt", yhat_proba_test_N[:,1])

print(f"best score: {best_score}; best params:", best_params)

KeyboardInterrupt: 

# GridSearch results 

In [None]:
# Columns of cv_results_ that hold the three searched hyperparameters.
param_keys = [
    'param_bow_feature_extractor__min_df',
    'param_bow_feature_extractor__max_df',
    'param_classifier__C',
]

# One row per candidate, ordered by hyperparameter values so neighbouring
# settings sit next to each other.
grid_search_results_df = pd.DataFrame(grid_searcher.cv_results_).sort_values(param_keys)

# Show the hyperparameters next to the first fold's score and overall rank.
grid_search_results_df[param_keys + ['split0_test_score', 'rank_test_score']]

Unnamed: 0,param_bow_feature_extractor__min_df,param_bow_feature_extractor__max_df,param_classifier__C,split0_test_score,rank_test_score
220,1,0.5,0.1,0.845625,211
221,1,0.5,0.158489,0.854583,163
222,1,0.5,0.251189,0.860833,85
223,1,0.5,0.398107,0.866736,67
224,1,0.5,0.630957,0.870486,31
...,...,...,...,...,...
39,4,1.0,1.584893,0.855625,157
40,4,1.0,2.511886,0.851111,187
41,4,1.0,3.981072,0.845486,223
42,4,1.0,6.309573,0.837500,247
