In [1]:
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pandas as pd
import utils




In [2]:
def load_data(path='data.xlsx'):
    """Read the dataset workbook into a DataFrame and report its shape.

    Parameters
    ----------
    path : str or path-like, default 'data.xlsx'
        Location of the Excel workbook handed to ``pd.read_excel``. The
        default keeps the notebook's original behaviour (a file named
        ``data.xlsx`` resolved against the current working directory).

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_excel`` returns for ``path``, unmodified.
    """
    dataset = pd.read_excel(path)
    # Blank separator first so the shape line stands out in the cell output.
    print("\n")
    print('Loading Dataset shape: {}'.format(dataset.shape))
    return dataset

In [3]:
def train_test_lazy_classifier(X, y, vectorizer):
    """Benchmark every LazyClassifier model on vectorized text and return the top one.

    The data is split with a stratified train/test split (``random_state=0``).
    The vectorizer is fitted on the training portion only and then applied to
    the test portion, so no test vocabulary leaks into the features.

    Parameters
    ----------
    X : collection of documents
        Passed to ``vectorizer.fit_transform`` / ``vectorizer.transform``.
    y : labels
        Targets for ``X``; also used to stratify the split.
    vectorizer : object
        Must provide ``fit_transform`` and ``transform`` whose results expose
        ``.toarray()``. It is fitted in place.

    Returns
    -------
    tuple
        ``(vectorizer, best_model)``: the fitted vectorizer and the entry of
        ``clf.models`` for the model with the highest 'ROC AUC'.
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                            y, stratify=y, random_state=0)

    X_train_vectors = vectorizer.fit_transform(X_train)
    X_test_vectors = vectorizer.transform(X_test)

    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    # Densify with .toarray(): it is the explicit equivalent of the `.A`
    # shorthand, which is deprecated and missing from newer SciPy releases.
    models, predictions = clf.fit(X_train_vectors.toarray(),
                                  X_test_vectors.toarray(),
                                  y_train, y_test)
    print("\n")
    # Rank the leaderboard by ROC AUC, best first; row 0 is the model we keep.
    models = models.sort_values(by=['ROC AUC'] , ascending=False)
    print(models)
    print("Best Model: {}".format(models.index[0]))
    best_model = clf.models[models.index[0]]
    return vectorizer, best_model


In [4]:
# Read the workbook, then run the project's preprocessing step on it.
dataset = utils.preprocess(load_data())

# Features are the preprocessed text column; labels are the target column.
X = dataset['preprocessed_text']
y = dataset['target']

# Benchmark the LazyClassifier models on unigram TF-IDF features capped at
# 1000 terms; the function hands back the same vectorizer, now fitted.
vectorizer, best_model = train_test_lazy_classifier(
    X,
    y,
    TfidfVectorizer(max_features=1000, ngram_range=(1,1)),
)

  0%|          | 0/29 [00:00<?, ?it/s]

Loading Dataset shape: (1111, 5)
100%|██████████| 29/29 [00:21<00:00,  1.36it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LogisticRegression                 0.71               0.66     0.66      0.71   
NearestCentroid                    0.70               0.65     0.65      0.70   
GaussianNB                         0.64               0.65     0.65      0.66   
LinearSVC                          0.68               0.64     0.64      0.69   
NuSVC                              0.74               0.63     0.63      0.71   
BernoulliNB                        0.73               0.63     0.63      0.71   
Perceptron                         0.67               0.63     0.63      0.67   
PassiveAggressiveClassifier        0.68               0.63     0.63      0.68   
SGDClassifier                      0.74               0.62     0.62 

In [5]:
# Two-stage text classifier: TF-IDF features feeding a logistic regression.
# The step names ('tfidf', 'log_reg') are the prefixes used by the
# hyper-parameter grid keys.
log_reg_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression()),
])

In [6]:
# Stratified hold-out split with a fixed seed. X is still un-vectorized text
# here because log_reg_pipe carries its own TfidfVectorizer step.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [7]:
# Search space for the pipeline; keys follow the '<step name>__<parameter>'
# convention so each entry is routed to the matching pipeline step.
params = {
    # Vocabulary size cap for the TF-IDF step.
    'tfidf__max_features': [200, 500, 1000, 2000],
    # (min_n, max_n) n-gram windows for the TF-IDF step.
    'tfidf__ngram_range': [
        (1, 1), (1, 2), (2, 2),
        (1, 3), (2, 3), (3, 3),
    ],
    # Values of C tried for the logistic regression step.
    'log_reg__C': [0.00001, 0.001, 0.1, 1.0, 10.0, 100.0],
}

In [8]:
# Search over every combination in `params` with 5-fold cross-validation,
# using all available cores (n_jobs=-1) and the 'f1' scorer to rank candidates.
grid_search = GridSearchCV(log_reg_pipe, cv=5, param_grid=params, n_jobs=-1, scoring='f1')

In [9]:
# Run the search on the training split only; X_test / y_test are not passed
# here and are kept for the evaluation cells further down.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('log_reg', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'log_reg__C': [1e-05, 0.001, 0.1, 1.0, 10.0, 100.0],
                         'tfidf__max_features': [200, 500, 1000, 2000],
                         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3),
                                                (2, 3), (3, 3)]},
             scoring='f1')

In [10]:
# Display the pipeline the search selected, including its chosen settings.
grid_search.best_estimator_

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000, ngram_range=(1, 3))),
                ('log_reg', LogisticRegression(C=100.0))])

In [11]:
# Pull the full parameter mapping of the selected pipeline once, up front.
best_parameters = grid_search.best_estimator_.get_params()

# Cross-validated score of the winning candidate, then only the parameters
# that were actually part of the search grid, in alphabetical order.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
for param_name in sorted(params):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.485
Best parameters set:
	log_reg__C: 100.0
	tfidf__max_features: 1000
	tfidf__ngram_range: (1, 3)


In [12]:
# Predict on the held-out test split with the pipeline chosen by the grid search.
y_pred = grid_search.best_estimator_.predict(X_test)

In [13]:
# Per-class precision / recall / F1 of the tuned pipeline on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76       191
           1       0.46      0.43      0.44        87

    accuracy                           0.67       278
   macro avg       0.60      0.60      0.60       278
weighted avg       0.66      0.67      0.66       278



In [14]:
# Baseline for comparison: fit the pipeline as originally declared, i.e. with
# TfidfVectorizer() and LogisticRegression() left at their default settings.
log_reg_pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('log_reg', LogisticRegression())])

In [15]:
# Baseline predictions on the same held-out test split used for the tuned model.
y_pred2 = log_reg_pipe.predict(X_test)

In [16]:
# Per-class report for the default-settings baseline, to set against the
# tuned pipeline's report printed earlier.
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.73      0.96      0.83       191
           1       0.69      0.21      0.32        87

    accuracy                           0.72       278
   macro avg       0.71      0.58      0.57       278
weighted avg       0.72      0.72      0.67       278

