# Importing all the Libraries

In [1]:
import numpy as np
import pandas as pd
import glob
import re
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import string

<b>Creating a Pandas DataFrame</b>

In [2]:
# Load the aspect-level sentiment data into a DataFrame indexed by 'ID'.
# The first line of the file is skipped (presumably its header row - confirm) and
# the explicit column names below are used instead.
# A forward-slash relative path is used: the previous ".\data_2.csv" only resolved on
# Windows and relied on "\d" not being a recognised escape sequence.
df = pd.read_csv(
    "./data_2.csv",
    skiprows=1,
    names=['ID', 'Text', 'Aspect_Term', 'Term_Location', 'Class'],
    index_col='ID',
)
df.head()

Unnamed: 0_level_0,Text,Aspect_Term,Term_Location,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3121_0,But the staff was so horrible to us.,staff,8--13,-1
2777_0,To be completely fair[comma] the only redeemin...,food,57--61,1
1634_0,The food is uniformly exceptional[comma] with ...,food,4--8,1
1634_1,The food is uniformly exceptional[comma] with ...,kitchen,55--62,1
1634_2,The food is uniformly exceptional[comma] with ...,menu,141--145,0


<b>Function to Preprocess the Data (Removing Stop Words and Optionally Performing Stemming)</b>

In [3]:
from nltk.corpus import stopwords
import re
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

# tokenizer = RegexpTokenizer(r'\w+')

# Connective / qualifier words; text_process uses the last one found in a sentence
# to decide where the context window for the aspect term starts.
but = set(['any', 'also', 'but', 'that', 'hence', 'therefore', 'if', 'only', 'so'])

# NLTK's English stop words plus 'comma': the dataset spells commas as "[comma]",
# which becomes the bare token "comma" once punctuation is stripped.
stop_words = set(stopwords.words('english')) | set(['comma'])

def text_process(mess,aspect_term,stem):
    """
    Reduce a review sentence to the cleaned tokens around its aspect term.

    Steps:
    1. Lower-case the text, replace every run of non-word characters with a
       space and split on whitespace.
    2. Locate the last token that belongs to the module-level ``but`` set and
       the last token that is one of the (lower-cased) aspect words.
    3. Keep a window that ends ``len(aspect words)`` tokens after the last
       aspect word. The window starts right after the ``but`` token when that
       token precedes the aspect word, otherwise at the start of the sentence.
    4. Re-tokenize the window with ``word_tokenize`` and drop every token found
       in the module-level ``stop_words`` set.
    5. If ``stem`` is truthy, stem each token with the Snowball stemmer and
       then with the Porter stemmer.

    Parameters:
        mess: the review text (converted with ``str``).
        aspect_term: the aspect term, possibly several words (converted with ``str``).
        stem: when truthy, apply the two stemming passes.

    Returns:
        list of str: the remaining tokens, in sentence order.
    """
    tokens = re.sub(r'\W+', ' ', str(mess).lower()).split()
    aspect = str(aspect_term).lower().split()

    # NOTE(review): both positions default to 0, so "not found" cannot be told apart
    # from "found at index 0". When no `but` word occurs and the aspect is not the
    # first token, the window below starts at index 1 and drops the first token.
    # Kept as-is to preserve the existing features - confirm whether this is intended.
    pos = 0
    a_pos = 0
    for i, val in enumerate(tokens):
        if val in but:
            pos = i
        if val in aspect:
            a_pos = i

    window_end = a_pos + len(aspect) + 1
    if pos < a_pos:
        window = tokens[pos + 1:window_end]
    else:
        window = tokens[:window_end]

    filtered_sentence = [w for w in word_tokenize(' '.join(window)) if w not in stop_words]

    if stem:
        # Only the two stemmers that are actually applied are created; the unused
        # Lancaster stemmer and WordNet lemmatizer instances were removed.
        sno = SnowballStemmer("english", ignore_stopwords=True)
        ps = PorterStemmer()
        filtered_sentence = [ps.stem(sno.stem(w)) for w in filtered_sentence]

    return filtered_sentence

In [4]:
# Build one cleaned, space-joined string per row (stemming disabled).
# The columns are iterated directly instead of being indexed with integer keys:
# df is indexed by the string 'ID' column, so df["Text"][i] only worked through
# pandas' positional fallback for integer keys on a non-integer index.
# zip() also avoids the Python 2-only xrange.
clean_train_reviews = [
    " ".join(text_process(text, aspect_term, False))
    for text, aspect_term in zip(df["Text"], df["Aspect_Term"])
]

<b>Data after the Preprocessing Step</b>

In [5]:
# Show the first cleaned review. The single-argument parenthesised form prints the
# same text under Python 2 and is also valid Python 3.
print(clean_train_reviews[0])

staff


<b>Creating the Feature Set and Class</b>

In [6]:
# Features: the cleaned review strings. Target: the 'Class' column of df.
X, y = clean_train_reviews, df["Class"]

<b>Instantiating the Vectorizer and Transforming the Data</b>

In [7]:
# from sklearn.feature_extraction.text import CountVectorizer
# vect = CountVectorizer()
# vect.fit(X)
# vect.get_feature_names()
# simple_train_dtm = vect.transform(X)
# print simple_train_dtm.toarray()

In [8]:
# pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

In [9]:
# from sklearn.feature_extraction.text import CountVectorizer
# vect = CountVectorizer(stop_words = 'english', lowercase=True)
# TF-IDF over word 1- to 3-grams, with accents stripped via the 'unicode' method.
# NOTE(review): the vectorizer is fitted on every row of X, and the cross-validation
# cells further down reuse X_dtm, so the vocabulary and IDF weights are learned from
# the held-out folds as well.
tfidf_options = dict(ngram_range=(1, 3), min_df=1, strip_accents='unicode')
vect = TfidfVectorizer(**tfidf_options)
X_dtm = vect.fit_transform(X)

In [10]:
# text_clf = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,5),min_df=1,strip_accents='unicode')),
#                      #('feature_selection', SelectFromModel(svm.LinearSVC(penalty="l1",dual=False))),
#                      #('feature_selection', SelectFromModel(RandomForestClassifier())),
#                      ('clf', svm.LinearSVC())
#                     ])


# text_clf = text_clf.fit(X,y)

In [11]:
# split X and y into training and testing sets
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# print(X_train.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)

In [12]:
# vect.fit(X_train)
# X_train_dtm = vect.fit_transform(X_train)
# X_test_dtm = vect.transform(X_test)

<b>Prediction</b>

In [13]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer

def classification_report_with_accuracy_score(y_true, y_pred):
    """
    Print a classification report and return the accuracy.

    Intended to be wrapped with ``make_scorer`` so that a per-class report is
    printed each time the scorer is called, while the returned accuracy is what
    gets collected as the score.

    Parameters:
        y_true: the true labels.
        y_pred: the predicted labels.

    Returns:
        The value of ``accuracy_score(y_true, y_pred)``.
    """
    # Single-argument parenthesised print: same output on Python 2, valid on Python 3.
    print(classification_report(y_true, y_pred))
    return accuracy_score(y_true, y_pred)

In [14]:
from sklearn import svm
# Linear-kernel SVM evaluated with 10-fold cross-validation. The custom scorer prints
# a classification report per fold and yields that fold's accuracy.
report_scorer = make_scorer(classification_report_with_accuracy_score)
clf = svm.SVC(kernel='linear')
nested_score = cross_val_score(clf, X_dtm, y, cv=10, scoring=report_scorer)
# Parenthesised print: unchanged output on Python 2, also valid on Python 3.
print(nested_score)

             precision    recall  f1-score   support

         -1       0.73      0.23      0.36        81
          0       0.36      0.06      0.11        64
          1       0.65      0.98      0.78       217

avg / total       0.62      0.65      0.57       362

             precision    recall  f1-score   support

         -1       0.47      0.17      0.25        81
          0       0.44      0.12      0.20        64
          1       0.64      0.93      0.76       217

avg / total       0.57      0.62      0.54       362

             precision    recall  f1-score   support

         -1       0.46      0.14      0.21        81
          0       0.32      0.11      0.16        64
          1       0.66      0.95      0.78       217

avg / total       0.55      0.62      0.54       362

             precision    recall  f1-score   support

         -1       0.50      0.11      0.18        81
          0       0.33      0.08      0.13        63
          1       0.62      0.94    

In [15]:
from sklearn import svm
# Same linear-kernel SVM, scored with the built-in 'accuracy' metric over 10 folds;
# only the mean of the fold scores is reported.
clf = svm.SVC(kernel='linear')
scores = cross_val_score(clf, X_dtm, y, cv=10, scoring='accuracy')
# Parenthesised print: unchanged output on Python 2, also valid on Python 3.
print(scores.mean())

0.6176899742015285


In [16]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default parameters, 10-fold cross-validation.
# A classification report is printed per fold; the mean fold accuracy is printed
# as a one-element array (keepdims=True).
report_scorer = make_scorer(classification_report_with_accuracy_score)
clf = MultinomialNB()
nested_score = cross_val_score(clf, X_dtm, y, cv=10, scoring=report_scorer)
# Parenthesised print: unchanged output on Python 2, also valid on Python 3.
print(nested_score.mean(keepdims=True))

             precision    recall  f1-score   support

         -1       0.50      0.05      0.09        81
          0       0.00      0.00      0.00        64
          1       0.61      1.00      0.76       217

avg / total       0.48      0.61      0.48       362

             precision    recall  f1-score   support

         -1       0.50      0.01      0.02        81
          0       0.00      0.00      0.00        64
          1       0.60      1.00      0.75       217

avg / total       0.47      0.60      0.46       362

             precision    recall  f1-score   support

         -1       0.56      0.06      0.11        81
          0       0.00      0.00      0.00        64
          1       0.62      0.99      0.76       217

avg / total       0.49      0.61      0.48       362

             precision    recall  f1-score   support

         -1       0.33      0.02      0.05        81
          0       0.00      0.00      0.00        63
          1       0.61      1.00    

  'precision', 'predicted', average, warn_for)


In [15]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours (k=5), 10-fold cross-validation. A classification report is
# printed per fold and the array of fold accuracies is printed at the end.
report_scorer = make_scorer(classification_report_with_accuracy_score)
clf = KNeighborsClassifier(n_neighbors=5)
nested_score = cross_val_score(clf, X_dtm, y, cv=10, scoring=report_scorer)
# Parenthesised print: unchanged output on Python 2, also valid on Python 3.
print(nested_score)

             precision    recall  f1-score   support

         -1       0.46      0.32      0.38        81
          0       0.15      0.06      0.09        64
          1       0.65      0.84      0.73       217

avg / total       0.52      0.59      0.54       362

             precision    recall  f1-score   support

         -1       0.35      0.36      0.35        81
          0       0.44      0.23      0.31        64
          1       0.67      0.75      0.71       217

avg / total       0.56      0.57      0.56       362

             precision    recall  f1-score   support

         -1       0.37      0.49      0.42        81
          0       0.29      0.11      0.16        64
          1       0.70      0.74      0.72       217

avg / total       0.55      0.57      0.55       362

             precision    recall  f1-score   support

         -1       0.35      0.11      0.17        81
          0       0.32      0.11      0.16        63
          1       0.64      0.92    

In [18]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron (L-BFGS solver, hidden layers of 5 and 2 units, fixed
# random_state), 10-fold cross-validation. A classification report is printed per
# fold; the mean fold accuracy is printed as a one-element array (keepdims=True).
report_scorer = make_scorer(classification_report_with_accuracy_score)
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)
nested_score = cross_val_score(clf, X_dtm, y, cv=10, scoring=report_scorer)
# Parenthesised print: unchanged output on Python 2, also valid on Python 3.
print(nested_score.mean(keepdims=True))

             precision    recall  f1-score   support

         -1       0.34      0.19      0.24        81
          0       0.27      0.05      0.08        64
          1       0.65      0.92      0.76       217

avg / total       0.51      0.60      0.52       362

             precision    recall  f1-score   support

         -1       0.37      0.17      0.24        81
          0       0.38      0.09      0.15        64
          1       0.65      0.93      0.77       217

avg / total       0.54      0.61      0.54       362

             precision    recall  f1-score   support

         -1       0.39      0.16      0.23        81
          0       0.45      0.14      0.21        64
          1       0.66      0.94      0.77       217

avg / total       0.56      0.62      0.55       362

             precision    recall  f1-score   support

         -1       0.28      0.10      0.15        81
          0       0.23      0.05      0.08        63
          1       0.61      0.90    