In [1]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from gensim import matutils, corpora, models
import warnings; warnings.simplefilter('ignore')
import pandas as pd



In [2]:
def read_filenames(f):
    '''
    Read the file names of all data.

    Parameters
    ----------
    f : str
        Path to a text file that lists one file name per line.

    Returns
    -------
    list of str
        Every line of the file, in order, with leading and trailing
        whitespace stripped (a blank line yields an empty string).
    '''
    # The context manager closes the handle even if reading raises,
    # instead of leaving it open until garbage collection.
    with open(f) as file:
        return [line.strip() for line in file]

def preprocess(files, root_dir, polarity):
    '''
    Preprocess the text files into labeled classes for future use.

    Parameters
    ----------
    files : iterable of str
        File names to load. The entries "neg_list.txt" and
        "posi_list.txt" are skipped.
    root_dir : str
        Directory that contains the files; each file is opened as
        root_dir + '/' + name.
    polarity : object
        Label stored in the labeled_class column for every loaded file.

    Returns
    -------
    pandas.DataFrame
        One row per loaded file with the columns labeled_class
        (the polarity argument), review (the full file content) and
        actual_class (the part of the file name before the first '_').
    '''
    index_files = ("neg_list.txt", "posi_list.txt")
    actual = []
    labeled = []
    reviews = []
    for f in files:
        if f in index_files:
            continue
        # Close each review file right after reading it; a bare
        # open(...).read() would leave one handle open per review.
        with open(root_dir + '/' + f) as review_file:
            reviews.append(review_file.read())
        labeled.append(polarity)
        actual.append(f.split('_')[0])
    data = pd.DataFrame({'labeled_class':labeled,'review':reviews,'actual_class':actual})
    return data

def extract_tokens(df):
    '''
    Convert reviews into lowercase, tag each word and lemmatize
    the words. Collect these information in a column named
    tokenized_review in a pandas data frame.

    Each kept token is written as "<lemma>_<POS tag>". Tokens that are
    English stop words or have two characters or fewer are dropped.
    Tagging is done on the full token sequence before filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "review" column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place with the added
        "tokenized_review" column (one list of strings per row).

    Reference:
    https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
    https://www.nltk.org/api/nltk.tokenize.html
    '''
    lmt = WordNetLemmatizer()
    # Build the stop-word lookup once. Calling stopwords.words() inside
    # the comprehension re-evaluates the list and scans it linearly for
    # every single token.
    stop_words = set(stopwords.words("english"))
    tokenized_review = []
    for review in df["review"]:
        pos_word = pos_tag(word_tokenize(review.lower(), language='english'))
        tokenized_review.append([
            "_".join([lmt.lemmatize(word), tag])
            for word, tag in pos_word
            if word not in stop_words and len(word) > 2
        ])
    df["tokenized_review"] = tokenized_review
    return df

def vectorize(df):
    '''
    Build the bag-of-words feature matrix that is fed to the SVM.

    A gensim corpora.Dictionary is built from the tokenized_review
    column, pruned with filter_extremes(no_below=2, no_above=0.8) and
    compactified. Every review is then converted to a bag of words and
    the collection is packed into a sparse matrix, transposed so that
    each row corresponds to one review.

    Returns the pair (dictionary, sparse matrix).
    Reference: https://radimrehurek.com/gensim/corpora/dictionary.html
    '''
    token_lists = df["tokenized_review"]

    vocab = corpora.Dictionary(token_lists)
    vocab.filter_extremes(no_below=2, no_above=0.8)
    vocab.compactify()

    bags = []
    for tokens in token_lists:
        bags.append(vocab.doc2bow(tokens))

    # corpus2csc lays documents out as columns; flip to documents-as-rows.
    term_by_doc = matutils.corpus2csc(bags, num_terms=len(vocab.token2id))
    return vocab, term_by_doc.transpose()

def train_svm(x,y):
    '''
    Fit an SVC through a grid search with four-fold cross validation
    over the penalty parameter C (10, 15, 20, 25), with random_state
    pinned to 2018 via the grid. Returns the fitted GridSearchCV object.
    Reference:
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    https://github.com/rbgirshick/py-faster-rcnn/blob/master/tools/train_svms.py
    '''
    search_space = {
        'C': [10,15,20,25],
        'random_state':[2018],
    }
    grid = GridSearchCV(SVC(), param_grid=search_space, cv=4)
    grid.fit(x, y)
    return grid

In [3]:
negative_list = read_filenames("neg_list.txt")
positive_list = read_filenames("posi_list.txt")
negative_df = preprocess(negative_list,'negative_polarity','negative')
positive_df = preprocess(positive_list,'positive_polarity','positive')

# Categorize the reviews into four groups based on their polarity and truthfulness.
# actual_class holds the file-name prefix; 't' presumably marks a truthful review.
#   positive reviews: 't' -> 2, anything else -> 1
#   negative reviews: 't' -> 3, anything else -> 4
# preprocess() already stamps every row of a frame with a single polarity,
# so the category depends only on actual_class.
positive_df['category'] = [2 if actual == 't' else 1 for actual in positive_df['actual_class']]
negative_df['category'] = [3 if actual == 't' else 4 for actual in negative_df['actual_class']]

# Stack the two frames (positive rows first). An outer merge would join on
# every shared column, including the full review text, and leaves the row
# order up to merge semantics; concat is the direct way to append rows.
data = pd.concat([positive_df, negative_df], ignore_index=True)
data = data[['review', 'category']]

In [4]:
# Show full cell contents so the token lists are not truncated in the preview.
pd.set_option('display.max_colwidth', -1)

# Add the tokenized_review column, then build the dictionary and sparse matrix.
data = extract_tokens(data)
dictionary, corpus = vectorize(data)

data.head()

Unnamed: 0,review,category,tokenized_review
0,"After recent week stay at the Affinia Hotels, I can definitely say i will be coming back. They offer so many in room amenities and services, Just a very comfortable and relaxed place to be. My most enjoyable experience at the Affinia Hotel was the amazing customization they offered, I would recommend Affinia hotels to anyone looking for a nice place to stay .\n",1,"[recent_JJ, week_NN, stay_NN, affinia_NN, hotel_NNS, definitely_RB, say_VB, coming_VBG, back_RB, offer_VBP, many_JJ, room_NN, amenity_NNS, service_NNS, comfortable_JJ, relaxed_JJ, place_NN, enjoyable_JJ, experience_NN, affinia_JJ, hotel_NN, amazing_JJ, customization_NN, offered_VBD, would_MD, recommend_VB, affinia_JJ, hotel_NNS, anyone_NN, looking_VBG, nice_JJ, place_NN, stay_VB]"
1,"Although much too overpriced in my opinion, the hotel is spotless. The staff was very courteous. And the spa service ? Was a God send ! In a relatively flexible location for traveling for sight seeing so I didnt spend major bucks trying to get around the city ! LOVE IT ! Going back for my anniversary\n",1,"[although_IN, much_RB, overpriced_VBN, opinion_NN, hotel_NN, spotless_JJ, staff_NN, courteous_JJ, spa_NN, service_NN, god_JJ, send_NN, relatively_RB, flexible_JJ, location_NN, traveling_VBG, sight_NN, seeing_VBG, didnt_VBP, spend_VBP, major_JJ, buck_NNS, trying_VBG, get_VB, around_IN, city_NN, love_VB, going_VBG, back_RB, anniversary_JJ]"
2,"The Affinia hotel in Chicago was superb. the room service was exemplary and the food, I don't even know were to start. The chef obviously knew what he was doing, I especially loved the seafood, my personal favorite was the shrimp. Aside from this, I loved how beautiful the hotel was. It is definetly a bargain for the price, for that price you would probably get a good 3 star hotel but, I felt as if i was in a $10,000 a night 5 star hotel in the Europe. great bang for your buck. Would recommend it to anybody looking to relax at a great hotel with great amenities in a great city. My friends actually went their a while back because of me. They loved it! I know you will too.\n",1,"[affinia_JJ, hotel_NN, chicago_NN, superb_NN, room_NN, service_NN, exemplary_JJ, food_NN, n't_RB, even_RB, know_VB, start_VB, chef_NN, obviously_RB, knew_VBD, especially_RB, loved_VBN, seafood_NN, personal_JJ, favorite_NN, shrimp_NN, aside_RB, loved_VBD, beautiful_JJ, hotel_NN, definetly_RB, bargain_NN, price_NN, price_NN, would_MD, probably_RB, get_VB, good_JJ, star_JJ, hotel_NN, felt_VBD, 10,000_CD, night_NN, star_NN, hotel_NN, europe_NN, great_JJ, bang_NN, buck_NN, would_MD, recommend_VB, anybody_VB, looking_VBG, relax_VB, great_JJ, hotel_NN, great_JJ, amenity_NNS, great_JJ, city_NN, friend_NNS, actually_RB, went_VBD, back_RB, loved_VBD, know_VBP]"
3,"THIS HOTEL IS FANTASTIC. I stayed there on my way through Chicago towards Arizona, and could not believe the great quality of the hotel. I'd have thought I was in a Vegas suite. Really polite staff, great housekeeping, and amazing prices. On my way out, I was telling the manager about how much I'd loved the hotel, and he even offered me an extra night there! You can safely say that my trip was delayed roughly 24 hours.\n",1,"[hotel_NN, fantastic_JJ, stayed_VBD, way_NN, chicago_NN, towards_NNS, arizona_NN, could_MD, believe_VB, great_JJ, quality_NN, hotel_NN, thought_VBN, vega_NN, suite_NN, really_RB, polite_JJ, staff_NN, great_JJ, housekeeping_NN, amazing_VBG, price_NNS, way_NN, telling_VBG, manager_NN, much_JJ, loved_VB, hotel_NN, even_RB, offered_VBD, extra_JJ, night_NN, safely_RB, say_VB, trip_NN, delayed_VBN, roughly_RB, hour_NNS]"
4,"The Affinia Chicago is a wonderful place to stay, my husband and I stayed there for a week to visit some family and had an amazing time. The rooms were very well organized and comfortable, the staff there are very friendly, and the food there is more then amazing. we are defiantly going back next year.\n",1,"[affinia_NN, chicago_NN, wonderful_JJ, place_NN, stay_VB, husband_NN, stayed_VBP, week_NN, visit_VB, family_NN, amazing_JJ, time_NN, room_NNS, well_RB, organized_VBN, comfortable_JJ, staff_NN, friendly_JJ, food_NN, amazing_VBG, defiantly_RB, going_VBG, back_RP, next_JJ, year_NN]"


In [5]:
# Hold out 30% of the data for testing and train on the remaining 70%.
# The four-fold cross validation runs inside train_svm's grid search,
# on the training portion only.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    corpus,
    data["category"],
    test_size=0.3,
    random_state=2018,
)
svc_clf = train_svm(x_train,y_train)

train_accuracy = svc_clf.score(x_train,y_train)
print("Accuracy of SVM on training set is : {}".format(train_accuracy))

test_accuracy = svc_clf.score(x_test,y_test)
print("Accuracy of SVM on test set is : {}".format(test_accuracy))

# best_score_ is the mean cross-validated score of the best grid candidate.
cv_accuracy = svc_clf.best_score_
print("Best accuracy of SVM on cross validation set :{}".format(cv_accuracy))

Accuracy of SVM on training set is : 0.9428571428571428
Accuracy of SVM on test set is : 0.7583333333333333
Best accuracy of SVM on cross validation set :0.7473214285714286
