### Support Vector Machine Module

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [3]:
# Read the cleaned train/test CSV files, then give both frames the same
# two column names used by the rest of the notebook.
train_dataset_raw = pd.read_csv('./data/cleaned_train.csv', sep=',')
test_dataset_raw = pd.read_csv('./data/cleaned_test.csv', sep=',')

train_dataset_raw.columns = ['review', 'sentiment']
test_dataset_raw.columns = ['review', 'sentiment']

In [3]:
# Build small class-balanced subsets of `dataset`: for each sentiment class
# (0 and 1) positions 0-999 go to the training frame and positions 1001-1499
# go to the test frame. This replaces the frames read from the CSV files.
# NOTE(review): `dataset` is not defined in any cell visible in this notebook,
# so this cell fails on a fresh kernel — confirm where it should come from.
# NOTE(review): the test slice starts at 1001, so position 1000 of each class
# lands in neither split (499 test rows per class) — confirm whether
# iloc[1000:1500] was intended.
train_dataset_raw = pd.concat([dataset.loc[dataset['sentiment'] == 0].iloc[0:1000], dataset.loc[dataset['sentiment'] == 1].iloc[0:1000]])
test_dataset_raw = pd.concat([dataset.loc[dataset['sentiment'] == 0].iloc[1001:1500], dataset.loc[dataset['sentiment'] == 1].iloc[1001:1500]])

In [4]:
train_dataset_raw.head()

Unnamed: 0,review,sentiment
0,dived many times for the ball managed to save ...,0
1,not the whole crew,0
2,nope they did not have it,0
3,spring break in plain city it snowing,0
4,could not bear to watch it and thought the ua ...,0


In [4]:
def reduce_lengthening(text):
    """Shorten every run of three or more identical characters to two.

    Example: "finallllllly" -> "finally". Runs of one or two characters
    are left untouched.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

print (reduce_lengthening( "finallllllly" ))

finally


In [5]:
from collections import Counter

def words(text):
    """Lower-case `text` and return its word-character tokens in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table used by the spelling corrector, built from big.txt.
# The context manager closes the corpus file as soon as it has been read
# instead of leaving the handle open for the life of the kernel.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Probability of `word`: its count in WORDS divided by N.

    N defaults to the total of all counts in WORDS, computed once when this
    function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate spelling of `word` with the highest probability P."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Generate possible spelling corrections for `word`.

    Tries, in order, the word itself, known words one edit away, and known
    words two edits away; the first non-empty set wins. If none is known the
    word is returned unchanged inside a one-element list.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    return [word]

def known(words):
    """Return the set of entries from `words` that are keys of WORDS."""
    return {entry for entry in words if entry in WORDS}

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is a single-character deletion, an adjacent transposition, a
    replacement with a lowercase ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertions are possible at every cut point, including the very end.
        results.update(head + letter + tail for letter in alphabet)
        if not tail:
            continue
        # Deletion and replacement both act on the first character of `tail`.
        results.add(head + tail[1:])
        results.update(head + letter + tail[1:] for letter in alphabet)
        if len(tail) > 1:
            # Transposition swaps the first two characters of `tail`.
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Lazily yield every string two edits away from `word` (may repeat)."""
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [6]:
def data_preprocess(data, norm_type, stop_words_list=None, spell_correction=False):
    """Clean the ``review`` column of ``data`` word by word.

    Reviews are split on whitespace, processed, and re-joined with single
    spaces. The steps run in this order, each only when requested:

    1. stop-word removal,
    2. spelling correction (after collapsing lengthened words),
    3. stemming or lemmatization.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``review`` column. It is modified in place; pass
        a copy if the original must be kept.
    norm_type : str
        ``'stem'`` uses ``PorterStemmer``, ``'lemma'`` uses
        ``WordNetLemmatizer``; any other value skips normalization.
    stop_words_list : str, optional
        ``'corpus'`` removes the NLTK English stop words, ``'manual'`` removes
        a short built-in list; any other value keeps every word.
    spell_correction : bool, optional
        When true, each word is passed through ``reduce_lengthening`` and
        then ``correction``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    manual_stop_words = frozenset(
        ['in', 'of', 'the', 'at', 'a', 'an', 'is', 'are', 'am', 'was', 'were'])

    def rewrite_reviews(per_review):
        # `per_review` maps the word list of one review to the words to keep.
        data['review'] = [' '.join(per_review(review.split()))
                          for review in data['review'].tolist()]

    if stop_words_list == 'corpus':
        # Build the stop-word set once; it used to be rebuilt for every word.
        english_stop_words = set(stopwords.words('english'))
        rewrite_reviews(
            lambda tokens: [t for t in tokens if t not in english_stop_words])
    elif stop_words_list == 'manual':
        rewrite_reviews(
            lambda tokens: [t for t in tokens if t not in manual_stop_words])

    if spell_correction:
        rewrite_reviews(
            lambda tokens: [correction(reduce_lengthening(t)) for t in tokens])

    if norm_type == 'stem':
        stem = PorterStemmer().stem
        rewrite_reviews(lambda tokens: [stem(t) for t in tokens])
    elif norm_type == 'lemma':
        lemmatize = WordNetLemmatizer().lemmatize
        rewrite_reviews(lambda tokens: [lemmatize(t) for t in tokens])

    return data

In [7]:
def _resample(sampler, features, labels):
    """Run an imbalanced-learn sampler under whichever method name it exposes."""
    # Newer imbalanced-learn releases name this fit_resample; older releases
    # only provide fit_sample.
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    return resample(features, labels)


def _svm_smote(random_state):
    """Build an SVM-based SMOTE sampler on old and new imbalanced-learn releases."""
    try:
        from imblearn.over_sampling import SVMSMOTE
    except ImportError:
        # Older releases pick the variant through SMOTE's `kind` argument.
        return SMOTE(kind='svm', random_state=random_state)
    return SVMSMOTE(random_state=random_state)


def svm_classifier_sampling(train, test, vectorizer, sampling_type, ngram_range = None, use_params = False):
    """Vectorize the reviews, optionally resample the training set, fit an SVC
    and evaluate it on the test set.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with ``review`` (text) and ``sentiment`` (label) columns.
    vectorizer : str
        ``'cv'`` (CountVectorizer with defaults), ``'ngram'`` (CountVectorizer
        with ``ngram_range``) or ``'tfidf'`` (TfidfVectorizer with
        ``ngram_range``, ``max_df=0.7``, ``min_df=2``).
    sampling_type : str or other
        ``'over'``, ``'under'`` or ``'smote'`` resample the training set; any
        other value (for example ``False`` or ``None``) trains on it as-is.
    ngram_range : tuple, optional
        ``(min_n, max_n)`` for the ``'ngram'`` and ``'tfidf'`` vectorizers.
        ``None`` is treated as ``(1, 1)``.
    use_params : bool, optional
        When true, ``C`` and ``gamma`` come from ``find_params``; otherwise the
        hard-coded values below are used.

    Returns
    -------
    tuple
        ``(label_pred, cm, svm_clf, accuracy, test_label)``: test predictions,
        confusion matrix, fitted classifier, accuracy, and true test labels.

    Raises
    ------
    ValueError
        If ``vectorizer`` is not one of the supported names.
    """
    if ngram_range is None:
        # The vectorizers cannot take None; fall back to unigrams.
        ngram_range = (1, 1)

    if vectorizer == 'cv':
        vctr = CountVectorizer()
    elif vectorizer == 'ngram':
        vctr = CountVectorizer(ngram_range=ngram_range)
    elif vectorizer == 'tfidf':
        vctr = TfidfVectorizer(ngram_range=ngram_range, max_df=0.7, min_df=2)
    else:
        raise ValueError(
            "vectorizer must be 'cv', 'ngram' or 'tfidf', got %r" % (vectorizer,))

    # Vocabulary and scaling statistics are learned from the training set only
    # and then applied unchanged to the test set.
    train_review = vctr.fit_transform(train['review']).toarray()
    test_review = vctr.transform(test['review']).toarray()

    sc = StandardScaler()
    train_review = sc.fit_transform(train_review)
    test_review = sc.transform(test_review)

    train_label = train['sentiment']
    test_label = test['sentiment']

    # Only the training data is resampled; the test set keeps its distribution.
    if sampling_type == 'over':
        ros_train_review, ros_train_label = _resample(
            RandomOverSampler(random_state=10), train_review, train_label)
    elif sampling_type == 'under':
        ros_train_review, ros_train_label = _resample(
            RandomUnderSampler(random_state=10), train_review, train_label)
    elif sampling_type == 'smote':
        # Seeded like the other samplers so repeated runs give the same result.
        ros_train_review, ros_train_label = _resample(
            _svm_smote(random_state=10), train_review, train_label)
    else:
        ros_train_review = train_review
        ros_train_label = train_label

    if use_params:
        params = find_params(ros_train_review, ros_train_label)
        print('Best params: %s' % (params))
        svm_clf = SVC(C=params['C'], gamma=params['gamma'])
    else:
        # NOTE(review): hard-coded C/gamma, presumably copied from an earlier
        # grid search — confirm they still suit the current features.
        svm_clf = SVC(C=0.001, gamma=3.727593720314938e-09)
    svm_clf.fit(ros_train_review, ros_train_label)

    label_pred = svm_clf.predict(test_review)

    # Both metrics take (y_true, y_pred) in that order.
    cm = confusion_matrix(test_label, label_pred)
    accuracy = accuracy_score(test_label, label_pred)

    return label_pred, cm, svm_clf, accuracy, test_label

In [8]:
def find_params(x_train, y_train):
    """Grid-search ``C`` and ``gamma`` for an ``SVC`` and plot the score grid.

    Searches 17 log-spaced values of ``C`` (1e-3 .. 1e13) against 17 log-spaced
    values of ``gamma`` (1e-13 .. 1e3) with five stratified shuffle splits
    (20% held out each time), shows a heat map of the mean validation score,
    and returns the best combination.

    Parameters
    ----------
    x_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.

    Returns
    -------
    dict
        ``grid.best_params_``, with keys ``'C'`` and ``'gamma'``.
    """
    C_range = np.logspace(-3, 13, 17)
    gamma_range = np.logspace(-13, 3, 17)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    grid.fit(x_train, y_train)

    # cv_results_ replaces the removed grid_scores_ attribute. Candidates are
    # listed with parameter names sorted ('C' before 'gamma') and the last
    # name varying fastest, so each row is one C and each column one gamma.
    mean_scores = np.asarray(grid.cv_results_['mean_test_score'])
    scores = mean_scores.reshape(len(C_range), len(gamma_range))

    # Draw plot for gamma-C value result
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=0.15, right=0.95, bottom=0.15, top=0.95)
    # Passing the colormap by name avoids plt.cm.get_cmap, which newer
    # Matplotlib releases no longer provide.
    plt.imshow(scores, interpolation='nearest', cmap='Spectral')
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.show()

    return grid.best_params_

In [128]:
# Preprocessing variant: Porter stemming only (no stop-word removal, no
# spelling correction). data_preprocess rewrites the frame it is given, so
# each raw split is copied first.
train_dataset = data_preprocess(train_dataset_raw.copy(), 'stem')
test_dataset = data_preprocess(test_dataset_raw.copy(), 'stem')

In [116]:
# Run 1: TF-IDF on unigrams and bigrams, no resampling, fixed C/gamma.
label_pred, cm, clf, accuracy, test_label = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type=False,
    ngram_range=(1, 2),
    use_params=False,
)

In [118]:
accuracy

0.5721442885771543

In [119]:
cm

array([[472,  27],
       [400,  99]])

In [121]:
# Run 2: TF-IDF on unigrams and bigrams, random under-sampling, fixed C/gamma.
label_pred_2, cm_2, clf_2, accuracy_2, test_label_2 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type='under',
    ngram_range=(1, 2),
    use_params=False,
)

In [122]:
accuracy_2

0.5711422845691383

In [123]:
cm_2

array([[472,  27],
       [401,  98]])

In [None]:
# Run 3: TF-IDF on unigrams and bigrams, random under-sampling, with C/gamma
# chosen by the grid search in find_params.
label_pred_3, cm_3, clf_3, accuracy_3, test_label_3 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type='under',
    ngram_range=(1, 2),
    use_params=True,
)

In [110]:
accuracy_3

0.5721442885771543

In [111]:
cm_3

array([[472,  27],
       [400,  99]])

In [136]:
# Preprocessing variant: remove the short manual stop-word list, then apply
# Porter stemming. Each raw split is copied because data_preprocess rewrites
# the frame it is given.
train_dataset = data_preprocess(
    train_dataset_raw.copy(), 'stem', stop_words_list='manual')
test_dataset = data_preprocess(
    test_dataset_raw.copy(), 'stem', stop_words_list='manual')

In [131]:
# Run 4: TF-IDF on unigrams and bigrams, no resampling, fixed C/gamma.
label_pred_4, cm_4, clf_4, accuracy_4, test_label_4 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type=False,
    ngram_range=(1, 2),
    use_params=False,
)

In [132]:
accuracy_4

0.5621242484969939

In [133]:
cm_4

array([[466,  33],
       [404,  95]])

In [148]:
# Preprocessing variant: remove the NLTK English stop words, then apply
# Porter stemming. Each raw split is copied because data_preprocess rewrites
# the frame it is given.
train_dataset = data_preprocess(
    train_dataset_raw.copy(), 'stem', stop_words_list='corpus')
test_dataset = data_preprocess(
    test_dataset_raw.copy(), 'stem', stop_words_list='corpus')

In [137]:
# Run 5: TF-IDF on unigrams and bigrams, no resampling, fixed C/gamma.
label_pred_5, cm_5, clf_5, accuracy_5, test_label_5 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type=False,
    ngram_range=(1, 2),
    use_params=False,
)

In [138]:
accuracy_5

0.5390781563126252

In [139]:
cm_5

array([[463,  36],
       [424,  75]])

In [163]:
# Preprocessing variant: manual stop-word removal, spelling correction, then
# Porter stemming. Each raw split is copied because data_preprocess rewrites
# the frame it is given.
train_dataset = data_preprocess(
    train_dataset_raw.copy(), 'stem',
    stop_words_list='manual', spell_correction=True)
test_dataset = data_preprocess(
    test_dataset_raw.copy(), 'stem',
    stop_words_list='manual', spell_correction=True)

In [166]:
# Run 6: TF-IDF on unigrams and bigrams, no resampling, fixed C/gamma.
label_pred_6, cm_6, clf_6, accuracy_6, test_label_6 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type=False,
    ngram_range=(1, 2),
    use_params=False,
)

In [167]:
accuracy_6

0.5791583166332666

In [168]:
cm_6

array([[461,  38],
       [382, 117]])

In [11]:
# Preprocessing variant: manual stop-word removal and Porter stemming, with
# spelling correction explicitly switched off. Each raw split is copied
# because data_preprocess rewrites the frame it is given.
train_dataset = data_preprocess(
    train_dataset_raw.copy(), 'stem',
    stop_words_list='manual', spell_correction=False)
test_dataset = data_preprocess(
    test_dataset_raw.copy(), 'stem',
    stop_words_list='manual', spell_correction=False)

In [206]:
# Run 7: TF-IDF on unigrams through trigrams, no resampling, fixed C/gamma.
label_pred_7, cm_7, clf_7, accuracy_7, test_label_7 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type=False,
    ngram_range=(1, 3),
    use_params=False,
)

In [207]:
accuracy_7

0.6703406813627254

In [208]:
cm_7

array([[316, 183],
       [146, 353]])

In [210]:
# Run 8: TF-IDF on unigrams through trigrams, random under-sampling, fixed C/gamma.
label_pred_8, cm_8, clf_8, accuracy_8, test_label_8 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type='under',
    ngram_range=(1, 3),
    use_params=False,
)

In [211]:
accuracy_8

0.6703406813627254

In [212]:
cm_8

array([[316, 183],
       [146, 353]])

In [213]:
# Run 9: TF-IDF on unigrams through trigrams, random over-sampling, fixed C/gamma.
label_pred_9, cm_9, clf_9, accuracy_9, test_label_9 = svm_classifier_sampling(
    train_dataset,
    test_dataset,
    'tfidf',
    sampling_type='over',
    ngram_range=(1, 3),
    use_params=False,
)

In [214]:
accuracy_9

0.6703406813627254

In [215]:
cm_9

array([[316, 183],
       [146, 353]])

In [None]:
# Run 10: baseline on the raw frames (data_preprocess is not applied), TF-IDF
# on unigrams through trigrams, no resampling, fixed C/gamma.
label_pred_10, cm_10, clf_10, accuracy_10, test_label_10 = svm_classifier_sampling(
    train_dataset_raw,
    test_dataset_raw,
    'tfidf',
    sampling_type=None,
    ngram_range=(1, 3),
    use_params=False,
)

In [None]:
accuracy_10

In [None]:
cm_10