### Support Vector Machine Module

In [None]:
def _build_vectorizer(vectorizer, ngram_range):
    """Return the text vectorizer selected by the ``vectorizer`` keyword."""
    if ngram_range is None:
        # The vectorizers expect a (min_n, max_n) tuple; fall back to unigrams
        # instead of forwarding None to them.
        ngram_range = (1, 1)

    if vectorizer == 'cv':
        return CountVectorizer()
    if vectorizer == 'ngram':
        return CountVectorizer(ngram_range=ngram_range)
    if vectorizer == 'tfidf':
        return TfidfVectorizer(ngram_range=ngram_range)
    raise ValueError(
        "Unknown vectorizer %r; expected 'cv', 'ngram' or 'tfidf'" % (vectorizer,)
    )


def _resample(sampling_type, features, labels):
    """Rebalance the training set according to ``sampling_type``.

    Any value other than 'over', 'under' or 'smote' returns the inputs
    unchanged.
    """
    if sampling_type == 'over':
        sampler = RandomOverSampler(random_state=10)
    elif sampling_type == 'under':
        sampler = RandomUnderSampler(random_state=10)
    elif sampling_type == 'smote':
        # Seeded like the other samplers so repeated runs are comparable.
        # NOTE(review): newer imbalanced-learn releases appear to replace
        # `kind='svm'` with a separate SVMSMOTE class -- confirm against the
        # installed version.
        sampler = SMOTE(kind='svm', random_state=10)
    else:
        return features, labels

    # `fit_resample` is the current method name; `fit_sample` is the legacy
    # spelling kept as a fallback for old installations.
    if hasattr(sampler, 'fit_resample'):
        return sampler.fit_resample(features, labels)
    return sampler.fit_sample(features, labels)


def svm_classifier_sampling(train, test, vectorizer, sampling_type, ngram_range=None, use_params=False):
    """Train an SVC on vectorized reviews and evaluate it on the test set.

    The 'review' column is vectorized, converted to a dense array and
    standardized (vectorizer and scaler are fitted on the training data only),
    the training set is optionally resampled, and an SVC is fitted.

    Parameters
    ----------
    train, test
        Mappings/frames indexable by 'review' (text) and 'sentiment' (label).
    vectorizer : str
        'cv' (CountVectorizer with defaults), 'ngram' (CountVectorizer with
        ``ngram_range``) or 'tfidf' (TfidfVectorizer with ``ngram_range``).
    sampling_type : str
        'over', 'under' or 'smote'; any other value disables resampling.
    ngram_range : tuple, optional
        (min_n, max_n) for the 'ngram' and 'tfidf' vectorizers. ``None`` means
        unigrams only, i.e. (1, 1).
    use_params : bool
        When true, C and gamma are chosen by ``find_params`` (grid search)
        instead of using the SVC defaults.

    Returns
    -------
    tuple
        ``(label_pred, cm, svm_clf, accuracy, label_pred, test_label)``.
        ``label_pred`` appears twice; the shape is kept for existing callers.

    Raises
    ------
    ValueError
        If ``vectorizer`` is not one of the supported keywords.
    """
    vctr = _build_vectorizer(vectorizer, ngram_range)

    # Dense arrays are required here because StandardScaler centers the data.
    train_review = vctr.fit_transform(train['review']).toarray()
    test_review = vctr.transform(test['review']).toarray()

    sc = StandardScaler()
    train_review = sc.fit_transform(train_review)
    test_review = sc.transform(test_review)

    train_label = train['sentiment']
    test_label = test['sentiment']

    # Only the training data is resampled; the test set keeps its distribution.
    fit_review, fit_label = _resample(sampling_type, train_review, train_label)

    if use_params:
        params = find_params(fit_review, fit_label)
        print('Best params: %s' % (params))
        svm_clf = SVC(C=params['C'], gamma=params['gamma'])
    else:
        svm_clf = SVC()
    svm_clf.fit(fit_review, fit_label)

    label_pred = svm_clf.predict(test_review)

    cm = confusion_matrix(test_label, label_pred)
    accuracy = accuracy_score(test_label, label_pred)

    return label_pred, cm, svm_clf, accuracy, label_pred, test_label

In [None]:
def find_params(x_train, y_train, C_range=None, gamma_range=None):
    """Grid-search SVC's C and gamma, plot the score grid, return the best pair.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    C_range : array-like, optional
        Candidate C values. Defaults to ``np.logspace(6, 13, 15)``.
    gamma_range : array-like, optional
        Candidate gamma values. Defaults to ``np.logspace(-13, -10, 15)``.

    Returns
    -------
    dict
        ``grid.best_params_`` -- the best 'C' and 'gamma' found.

    Side effects
    ------------
    Shows a matplotlib heat map of the mean cross-validation score for every
    (C, gamma) combination.
    """
    if C_range is None:
        C_range = np.logspace(6, 13, 15)
    else:
        C_range = np.asarray(C_range)
    if gamma_range is None:
        gamma_range = np.logspace(-13, -10, 15)
    else:
        gamma_range = np.asarray(gamma_range)

    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    grid.fit(x_train, y_train)

    # `cv_results_` replaces the removed `grid_scores_` attribute. Candidates
    # are enumerated with parameter names sorted, C as the outer loop and gamma
    # as the inner loop, so the flat score vector reshapes to rows=C,
    # columns=gamma.
    scores = np.asarray(grid.cv_results_['mean_test_score'])
    scores = scores.reshape(len(C_range), len(gamma_range))

    # Draw plot for gamma-C value result
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=0.15, right=0.95, bottom=0.15, top=0.95)
    # Passing the colormap by name avoids the deprecated plt.cm.get_cmap().
    plt.imshow(scores, interpolation='nearest', cmap='Spectral')
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.show()

    return grid.best_params_