In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from scipy.sparse import csr_matrix

import numpy as np
import modules.utils as Utils

In [3]:
X_train, X_test, y_train, y_test = Utils.readData("../01. Data Preprocessing/data/")

In [4]:
y_train

12859    0
10219    1
27839   -1
7528     1
10382    1
        ..
16734    1
16304    1
79       0
12119    0
14147    0
Name: label, Length: 26433, dtype: int64

In [5]:
type(y_train)

pandas.core.series.Series

### 28. Build model logistic

In [6]:
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

In [7]:
model = LogisticRegression().fit(X_train_sparse, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 29. Dự đoán `y_hat`

In [8]:
y_hat = model.predict(X_test_sparse)

### 30. Đánh giá model

In [9]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

          -1       0.78      0.86      0.82      2211
           0       0.66      0.62      0.64      2169
           1       0.78      0.76      0.77      2229

    accuracy                           0.74      6609
   macro avg       0.74      0.74      0.74      6609
weighted avg       0.74      0.74      0.74      6609



<hr>

In [10]:
model.score(X_train_sparse, y_train)

0.8265425793515682

In [11]:
model.score(X_test_sparse, y_test)

0.7433802390679377

> **Nhận xét**:
> * Accuracy trên train và test chênh nhau ko quá 10%.

In [12]:
confusion_matrix(y_test, y_hat)

array([[1892,  268,   51],
       [ 420, 1335,  414],
       [ 112,  431, 1686]])

### 31. GridSearchCV

In [13]:
grid = {
    "C": np.logspace(-3,3,7), 
    "penalty": ["l1", "l2"]
}

In [14]:
needle_model = GridSearchCV(LogisticRegression(), grid, cv=10).fit(X_train_sparse, y_train)

tions:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
  

In [15]:
print(">> Tuned hpyerparameters (best parameters): ", needle_model.best_params_)
print(">> Accuracy:", needle_model.best_score_)

>> Tuned hpyerparameters (best parameters):  {'C': 10.0, 'penalty': 'l2'}
>> Accuracy: 0.7435397387441379


In [16]:
new_model = LogisticRegression(C=10.0, penalty='l2').fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
new_y_hat = new_model.predict(X_test)

In [18]:
print(classification_report(y_test, new_y_hat))

              precision    recall  f1-score   support

          -1       0.81      0.89      0.85      2211
           0       0.67      0.61      0.64      2169
           1       0.76      0.75      0.76      2229

    accuracy                           0.75      6609
   macro avg       0.75      0.75      0.75      6609
weighted avg       0.75      0.75      0.75      6609



In [19]:
new_acc_train = new_model.score(X_train, y_train)
new_acc_train

0.8987629099988651

In [20]:
new_acc_test = new_model.score(X_test, y_test)
new_acc_test

0.7513996065970646

In [21]:
confusion_matrix(y_test, new_y_hat)

array([[1978,  194,   39],
       [ 370, 1317,  482],
       [  95,  463, 1671]])

> **Nhận xét**:
> * Sau khi áp dụng GridSearchCV thì model có cải thiện lên cả hai tập train và test với `C=10.0` và `penalty='l2'`.

### 32. Lưu trữ GridsearchCV

In [22]:
model_params = {
    "model_name": "LogisticRegression",
    "best_params": needle_model.best_params_,
    "accuracy_train_test": (new_acc_train, new_acc_test)
}

In [23]:
Utils.saveGridsearchCV(model_params, "./models/logistic/gridsearchcv.json")

### 33. Save model cho kết quả tốt nhất là `new_model`

In [39]:
Utils.saveModel(new_model, "./models/logistic/model.pickle")