In [1]:
%store -r x_train_tfidf
%store -r x_test_tfidf
%store -r y_train
%store -r y_test
%store -r x_train
%store -r x_test

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Suppress scikit-learn's ConvergenceWarning so that solvers hitting
# max_iter do not flood the cell output.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

simplefilter(action="ignore", category=ConvergenceWarning)

In [5]:
# Candidate classifiers, each paired with the hyper-parameter grid to search.
# Every entry is a dict with two keys:
#   "estimater" -> an unfitted scikit-learn estimator instance
#   "params"    -> the param_grid dict handed to GridSearchCV
models = [
    {"estimater": candidate, "params": grid}
    for candidate, grid in (
        (
            MultinomialNB(),
            {
                "fit_prior": [True, False],
            },
        ),
        (
            LinearSVC(),
            {
                # NOTE(review): per the scikit-learn docs, `loss` is ignored
                # when multi_class='crammer_singer', so those combinations
                # are fitted twice with the same effective settings.
                "loss": ['hinge', 'squared_hinge'],
                "multi_class": ['ovr', 'crammer_singer'],
                "fit_intercept": [True, False],
                "random_state": [42],
                "max_iter": [900, 1000, 1100],
            },
        ),
        (
            SGDClassifier(),
            {
                # NOTE(review): newer scikit-learn releases renamed the
                # 'log' loss to 'log_loss' -- check the installed version
                # before upgrading.
                "loss": ['hinge', 'log', 'perceptron'],
                "penalty": ['l2', 'l1'],
                "alpha": [0.0001, 0.0003, 0.0010],
                "early_stopping": [True],
                "max_iter": [1000, 1500],
                "random_state": [42],
            },
        ),
    )
]

# Display names for the candidates, listed in the same order as `models`.
column_names = ["MultiNB", "SVC", "SGDClassifier"]
# One test-split accuracy per candidate, appended in iteration order.
entries = []

# Highest test-split accuracy seen so far and the fitted search that produced it.
highest_acc = 0
best_model = None

for model in models:
    print(model["estimater"])

    # Base estimator whose hyper-parameters are tuned.
    clf = model["estimater"]
    # Exhaustive search over this candidate's grid with 3-fold cross-validation.
    grid_search = GridSearchCV(estimator=clf, param_grid=model["params"],
                               cv=3, n_jobs=1)
    # Fit all parameter combinations on the training split.
    grid_search.fit(x_train_tfidf, y_train)

    # Predict on the test split to measure this candidate's accuracy.
    predicted = grid_search.predict(x_test_tfidf)
    # accuracy_score is documented as (y_true, y_pred): ground truth goes first.
    acc = accuracy_score(y_test, predicted)
    entries.append(acc)

    print(grid_search.best_params_)

    # Keep the search with the strictly highest test accuracy (ties keep the
    # earlier candidate).
    # NOTE(review): choosing the winner on the test split makes the reported
    # accuracy optimistic; consider comparing grid_search.best_score_ (the
    # cross-validated score) instead and reserving the test split for the
    # final evaluation only.
    if acc > highest_acc:
        highest_acc = acc
        best_model = grid_search

MultinomialNB()
{'fit_prior': False}
LinearSVC()
{'fit_intercept': True, 'loss': 'hinge', 'max_iter': 900, 'multi_class': 'crammer_singer', 'random_state': 42}
SGDClassifier()
{'alpha': 0.0001, 'early_stopping': True, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}


In [4]:
# Final model: a LinearSVC configured with the hyper-parameters that the grid
# search printed for LinearSVC, trained on the full training split.
classifier = LinearSVC(
    fit_intercept=True,
    loss='hinge',
    max_iter=900,
    multi_class='crammer_singer',
    random_state=42,
).fit(x_train_tfidf, y_train)
y_pred = classifier.predict(x_test_tfidf)

# Evaluate on the test split: confusion matrix (first argument is the true
# labels), followed by the overall accuracy.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f'Accuracy {accuracy_score(y_test, y_pred)}')

[[1197  147   12  114   32   54  297   35   71    8  110   90  248  151
    12   19   10   23]
 [ 191  879   22   78   54   55  510   37   88   13   87   90  287  178
    16   19    7   18]
 [  36   26  891  114   17   42   30   34  104   52   28   81   37   46
    18   22   35    5]
 [  27   26   24 5038   20   57   45   43  100   15   11  189   37   34
    58   13   19    8]
 [  55   57   16   50  864   41  144   35   40    5   35   51   54   49
    12   23   11    8]
 [  55   46   47  174   26 1024  116  148   84   14   56  705   80  103
    26   61   12   15]
 [ 224  199   26  104   49   72 2619   33   90   12   95   90  264  263
    26   17    9   19]
 [  46   20   21   92   16   91   53 1926   47   10   17  248   26   17
    20   92    9   11]
 [  69   51   87  226   23   38  116   59 2549   31   45  132   59   61
    26   28   33   17]
 [  14   12   85   80    7   14   29   23   42  706   10   43   17   25
    17   12   12    6]
 [ 195  108   29   61   25   74  184   39   68   1

In [6]:
import joblib

# Serialize the fitted classifier to disk so it can be reloaded later with
# joblib.load; the call is left as the cell's last expression so its return
# value is displayed.
filename = 'ml_model.sav'
joblib.dump(value=classifier, filename=filename)

['ml_model.sav']