In [1]:
import numpy as np
import pandas as pd
import os

import sklearn.linear_model
import sklearn.pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle

In [2]:
# imports training data from files
def load_train_data():
    data_dir = "data_reviews"
    x_train_df = pd.read_csv(os.path.join(data_dir, "x_train.csv"))
    y_train_df = pd.read_csv(os.path.join(data_dir, "y_train.csv"))
    return x_train_df, y_train_df

In [3]:
def load_test_data():
    data_dir = "data_reviews"
    x_test_df = pd.read_csv(os.path.join(data_dir, "x_test.csv"))
    return x_test_df

In [4]:
def print_word_freq(x_train_N):
    # min_df sets sets a minimum number of times a given token needs to be
    # included in a text entry to be a part of the vector
    vectorizer = CountVectorizer(min_df=4, binary=False)
    x_vec_N = vectorizer.fit_transform(x_train_N)

    dense_arr = x_vec_N.toarray()

    freq = [
        (term, np.sum(dense_arr[:, index]))
        for term, index in list(vectorizer.vocabulary_.items())
    ]

    for term, count in sorted(freq, key=lambda x: x[1], reverse=True):
        print(f"{term} -- {count}")

In [5]:
# pipeline for BoW representation + logistic regression classifier
def make_bow_classifier_pipeline():
    pipeline = sklearn.pipeline.Pipeline(
        steps=[
            # turn data into BoW feature representation
            (
                "bow_feature_extractor",
                CountVectorizer(min_df=2, max_df=1.0, ngram_range=(1, 1)),
            ),
            # TODO: add cross validation
            # Given features construct the classifier (w/ hyperparam selection)
            (
                "classifier",
                sklearn.linear_model.LogisticRegression(C=1.0),
            ),
        ]
    )

    return pipeline

In [6]:
def make_hyperparam_grid():

    hyperparam_grid = {}

    hyperparam_grid["bow_feature_extractor__min_df"] = [1, 2, 3, 4]
    hyperparam_grid["bow_feature_extractor__max_df"] = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]
    hyperparam_grid["classifier__C"] = np.logspace(-1, 1, 11)

    return hyperparam_grid

# Import data and train model using GridSearch

In [7]:

x_train_df, y_train_df = load_train_data()

x_test_df = load_test_data()

# getting rid of company label
x_train_N = x_train_df.to_numpy()[:, 1]
y_train_N = y_train_df.to_numpy()[:, 0]

x_test_N = x_test_df.to_numpy()[:, 1]

# shuffle dataset
x_train_N, y_train_N = shuffle(x_train_N, y_train_N, random_state=2)

pipeline = make_bow_classifier_pipeline()
hyperparam_grid = make_hyperparam_grid()

grid_searcher = sklearn.model_selection.GridSearchCV(
    pipeline,
    hyperparam_grid,
    cv=10,
    scoring="roc_auc"
)

grid_searcher.fit(x_train_N, y_train_N)

best_model = grid_searcher.best_estimator_
best_params = grid_searcher.best_params_
best_score = grid_searcher.best_score_

yhat_proba_test_N = best_model.predict_proba(x_test_N)

np.savetxt("yproba1_test.txt", yhat_proba_test_N[:,1])

print(f"best score: {best_score}; best params:", best_params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.8893055555555556; best params: {'bow_feature_extractor__max_df': 1.0, 'bow_feature_extractor__min_df': 1, 'classifier__C': 1.584893192461114}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# GridSearch results 

In [8]:
grid_search_results_df = pd.DataFrame(grid_searcher.cv_results_).copy()

param_keys = ['param_bow_feature_extractor__min_df', 'param_bow_feature_extractor__max_df', 'param_classifier__C']

grid_search_results_df.sort_values(param_keys, inplace=True)
grid_search_results_df[param_keys + ['split0_test_score', 'rank_test_score']]

print(grid_search_results_df)

     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
220       0.025285      0.001497         0.001587        0.000098   
221       0.027558      0.000856         0.001531        0.000059   
222       0.032369      0.006954         0.001532        0.000039   
223       0.033038      0.001131         0.001583        0.000117   
224       0.035989      0.002011         0.001586        0.000055   
..             ...           ...              ...             ...   
39        0.025815      0.001723         0.001707        0.000464   
40        0.026806      0.000802         0.001477        0.000039   
41        0.028790      0.000277         0.001509        0.000059   
42        0.028938      0.000503         0.001623        0.000216   
43        0.028859      0.000372         0.001589        0.000215   

    param_bow_feature_extractor__max_df param_bow_feature_extractor__min_df  \
220                                 0.5                                   1   
221          