In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from scipy.sparse import csr_matrix

import numpy as np
import modules.utils as Utils

In [3]:
X_train, X_test, y_train, y_test = Utils.readData("../01. Data Preprocessing/data/")

In [4]:
y_train

50585    0
52499    0
29622    1
14494    1
48895   -1
        ..
79       1
12119    1
14147    1
56088    0
38408   -1
Name: label, Length: 45009, dtype: int64

In [5]:
type(y_train)

pandas.core.series.Series

### 28. Build model logistic

In [6]:
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

In [7]:
model = LogisticRegression().fit(X_train_sparse, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 29. Dự đoán `y_hat`

In [8]:
y_hat = model.predict(X_test_sparse)

### 30. Đánh giá model

In [9]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

          -1       0.91      0.97      0.94      3785
           0       0.81      0.79      0.80      3786
           1       0.85      0.82      0.83      3682

    accuracy                           0.86     11253
   macro avg       0.86      0.86      0.86     11253
weighted avg       0.86      0.86      0.86     11253



<hr>

In [10]:
model.score(X_train_sparse, y_train)

0.9461885400697638

In [11]:
model.score(X_test_sparse, y_test)

0.8595929974229094

> **Nhận xét**:
> * Accuracy trên train và test chênh nhau ko quá 10%.

In [12]:
confusion_matrix(y_test, y_hat)

array([[3670,   88,   27],
       [ 292, 2990,  504],
       [  63,  606, 3013]])

### 31. GridSearchCV

In [13]:
grid = {
    "C": np.logspace(-3,3,7), 
    "penalty": ["l1", "l2"]
}

In [14]:
needle_model = GridSearchCV(LogisticRegression(), grid, cv=10).fit(X_train_sparse, y_train)

tions:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
  

In [15]:
print(">> Tuned hpyerparameters (best parameters): ", needle_model.best_params_)
print(">> Accuracy:", needle_model.best_score_)

>> Tuned hpyerparameters (best parameters):  {'C': 10.0, 'penalty': 'l2'}
>> Accuracy: 0.8755941544556315


In [18]:
new_model = LogisticRegression(C=10.0, penalty='l2').fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
new_y_hat = new_model.predict(X_test)

In [21]:
print(classification_report(y_test, new_y_hat))

              precision    recall  f1-score   support

          -1       0.93      0.99      0.96      3788
           0       0.84      0.82      0.83      3843
           1       0.86      0.83      0.85      3626

    accuracy                           0.88     11257
   macro avg       0.88      0.88      0.88     11257
weighted avg       0.88      0.88      0.88     11257



In [26]:
new_acc_train = new_model.score(X_train, y_train)
new_acc_train

0.9880069293297206

In [28]:
new_acc_test = new_model.score(X_test, y_test)
new_acc_test

0.8805187883094963

In [24]:
confusion_matrix(y_test, new_y_hat)

array([[3741,   36,   11],
       [ 223, 3156,  464],
       [  59,  552, 3015]])

> **Nhận xét**:
> * Sau khi áp dụng GridSearchCV thì model có cải thiện lên cả hai tập train và test với `C=10.0` và `penalty='l2'`.

### 32. Lưu trữ GridsearchCV

In [29]:
model_params = {
    "model_name": "LogisticRegression",
    "best_params": needle_model.best_params_,
    "accuracy_train_test": (new_acc_train, new_acc_test)
}

In [31]:
Utils.saveGridsearchCV(model_params, "./models/logistic/gridsearchcv.json")

### 33. Save model cho kết quả tốt nhất là `new_model`

In [39]:
Utils.saveModel(new_model, "./models/logistic/model.pickle")