## Data cleaning

Aspect based sentiment analysis. Project - 2 of CS 583 - Data Mining and Text Mining taught by professor Bing Liu at UIC.

Importing required packages.

In [2]:
# imports
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from nltk.stem import WordNetLemmatizer
from nltk.tag.stanford import StanfordPOSTagger
from nltk.parse.stanford import StanfordDependencyParser
import re
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import grid_search
import nltk



Knowledge and tools required for pre-processing.

In [89]:
# data required for pre processing
happy_emoji = [':‑)',':)',':-]',':]',':-3',':3',':->',':>','8-)','8)',':-}', ':}',':o)',':c)',':^)','=]','=)',
                  ':‑D',':D','8‑D','8D','x‑D','xD','X‑D','XD','=D','=3','B^D',':-))']
                
sad_emoji = [':‑(',':(',':‑c',':c',':‑<',':<',':‑[',':[',':-||','>:[',':{',':@','>:(']
    
negative_words = ["doesn't", "isn't", "wasn't", "shouldn't", "wouldn't", "couldn't", "won't", "can't", "don't"]
    
stop_words = set(stopwords.words('english'))
    
# pre-processing
tokenizer = RegexpTokenizer(r'\w+')
wnl = WordNetLemmatizer()
st = StanfordPOSTagger('english-left3words-distsim.tagger')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


# DATA 1

### Reading the data

In [217]:
with open('Data_1_train.csv', newline='') as csvfile1:
    reader1 = csv.DictReader(csvfile1)
    """
    for row in reader2:
        print(row['example_id'])
        print(row[' text'])
        print(row[' aspect_term'])
        print(row[' term_location'])
        print(row[' class'])
        print()        
    """
    
    
    singles1 = []
    texts1 = []
    tempText1 = ''
    classes1 = []
    aspectTerm1 = []
    
    for row in reader1:
        # aspect term
        aspTL = []
        for asp in tokenizer.tokenize(row[' aspect_term']):
            aspTL.append(wnl.lemmatize(asp))
        
        aspectTerm1.append(aspTL)
        
        # labels
        classes1.append(row[' class'])
        
        # review text
        tempText1 = row[' text'].replace('[comma]',',').lower()
               
        for hem in happy_emoji:
            if hem in tempText1:
                tempText1 = tempText1.replace(hem, 'happy')
        
        for sem in sad_emoji:
            if sem in tempText1:
                tempText1 = tempText1.replace(sem, 'sad')
                
        for negw in negative_words:
            if negw in tempText1:
                tempText1 = tempText1.replace(negw, 'not')
        
        for t in tokenizer.tokenize(tempText1):
            if t not in stop_words or t == 'not':
                singles1.append(wnl.lemmatize(t))
        
        texts1.append(singles1)
        singles1 = []
        
    newText1 = []
    
    for tk in texts1:
        newText1.append(' '.join(tk))

## Approach

### Sentiment analysis without considering aspect

Tf-idf vectorize the review text.

In [218]:
tfi = TfidfVectorizer(ngram_range=(1, 1))
tfidf_data1 = tfi.fit_transform(newText1)

In [185]:
tfidf_data1

<2203x2884 sparse matrix of type '<class 'numpy.float64'>'
	with 22058 stored elements in Compressed Sparse Row format>

Using Bernoulli Naive Bayes with cross validation.

In [186]:
clf1NB = BernoulliNB()
predicted1NB = cross_val_predict(clf1NB, tfidf_data1, classes1, cv=10)

print(classification_report(classes1,predicted1NB))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes1, predicted1NB)))

             precision    recall  f1-score   support

         -1       0.69      0.75      0.72       828
          0       0.64      0.32      0.43       436
          1       0.73      0.84      0.78       939

avg / total       0.70      0.70      0.69      2203

The accuracy score is 70.31%


Now we will try the SVM but before doing that we use `GridSearchCV` for parameter tuning.

In [99]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]}
clf1SVCGS = grid_search.GridSearchCV(svm.SVC(), parameters, cv=10)
clf1SVCGS.fit(tfidf_data1, classes1)
print(clf1SVCGS.best_params_)

{'C': 2, 'kernel': 'linear'}


Now that we have the optimum parameters, we can pass the same data to a linear SVM with 10 fold cross validation.

In [219]:
clf1SVC = svm.SVC(kernel='linear', C=2, random_state=0)
predicted1SVC = cross_val_predict(clf1SVC, tfidf_data1, classes1, cv=10)

print(classification_report(classes1, predicted1SVC))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes1, predicted1SVC)))

             precision    recall  f1-score   support

         -1       0.72      0.78      0.75       828
          0       0.57      0.47      0.52       436
          1       0.79      0.80      0.79       939

avg / total       0.72      0.73      0.72      2203

The accuracy score is 72.54%


### If using this as final classifier.

In [188]:
clf1Final = svm.SVC(kernel='linear', C=2, random_state=0)
clf1Final.fit(tfidf_data1, classes1)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

## Approach

### Read POS tagged data

Read from file and find nouns, adjectives, adverbs in the review text.

Now we do POS tagging of the pre-processed text and save the generated POS tags in a text file.

In [91]:
textPOS1 = []
    
for nt in newText1:
    textPOS1.append(nltk.pos_tag(nt.split()))
        
# save pos tags to txt file
f1pos = open('PosTagsData1Full.txt','w')
for row in textPOS1:
    f1pos.write(str(row)+"\n")
f1pos.close()

We read the pos tags data from txt file.

In [92]:
combinedList1 = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS']

posTextList1 = []

finalReviewList1 = []
strTemp1 = ""

fpos1 = open('PosTagsData1Full.txt','r')
postags1 = fpos1.readline()

while postags1:            
    posTextList1 = re.findall(r"[\w]+", postags1)
    
    for i in range(len(posTextList1)):
        if posTextList1[i] in combinedList1:
            strTemp1 = strTemp1 + str(' ') + posTextList1[i - 1]
    finalReviewList1.append(strTemp1)
    strTemp1 = ""

    postags1 = fpos1.readline()
fpos1.close()

Now we have the new review text in a list named `finalReviewList1`. Now we use this list for count and tfidf transformation (which is equivalent to tfidf verctorization).

In [93]:
count_vectorizer1 = CountVectorizer(ngram_range=(1, 3))
data_tfidf1 = count_vectorizer2.fit_transform(finalReviewList1)
tfidf_data1 = TfidfTransformer(use_idf=False).fit_transform(data_tfidf1)

After vectorization, we now pass this data to Naive Bayes classifier with 10 fold cross validation.

In [94]:
clf1NB = BernoulliNB()
predicted1NB = cross_val_predict(clf1NB, tfidf_data1, classes1, cv=10)

print(classification_report(classes1,predicted1NB))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes1, predicted1NB)))

             precision    recall  f1-score   support

         -1       0.69      0.80      0.74       828
          0       0.65      0.24      0.35       436
          1       0.72      0.83      0.77       939

avg / total       0.69      0.70      0.68      2203

The accuracy score is 70.22%


Now we will try the SVM but before doing that we use `GridSearchCV` for parameter tuning.

In [95]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]}
clf1SVCGS = grid_search.GridSearchCV(svm.SVC(), parameters, cv=10)
clf1SVCGS.fit(tfidf_data1, classes1)
print(clf1SVCGS.best_params_)

{'C': 2, 'kernel': 'linear'}


Now that we have the optimum parameters, we can pass the same data to a linear SVM with 10 fold cross validation.

In [96]:
clf1SVC = svm.SVC(kernel='linear', C=2, random_state=0)
predicted1SVC = cross_val_predict(clf1SVC, tfidf_data1, classes1, cv=10)

print(classification_report(classes1, predicted1SVC))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes1, predicted1SVC)))

             precision    recall  f1-score   support

         -1       0.72      0.78      0.75       828
          0       0.61      0.48      0.54       436
          1       0.78      0.80      0.79       939

avg / total       0.72      0.73      0.72      2203

The accuracy score is 72.86%


### If using this as final classifier.

In [188]:
clf1Final = svm.SVC(kernel='linear', C=2, random_state=0)
clf1Final.fit(tfidf_data1, classes1)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

## Approach

### Read dependency parsed data

Read dependency parsing data and extract nouns and the adjectives that affect them. We use the code below to get the dependency parsing output as a list.

Now we do the dependecy parsing using stanford corenlp.

In [63]:
depParsingList1 = []
    
dep_parser1=StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

# send it in parts so that jvm does not run out of memory
for nt in newText1[0:1000]:
    result1 = dep_parser1.raw_parse(nt)    
    dep1 = result1.__next__()    
    depParsingList1.append(list(dep1.triples()))
    
fdep1 = open('depParseData1P1.txt','w')
for dp in depParsingList1:
    fdep1.write(str(dp)+"\n")
fdep1.close()

As we now have our dependency parsed list in the txt file so now we can extract the aspect terms and the adjectives and adverbs which affect them.

In [65]:
fulldeplist1 = []

fdep1 = open('depParseData2Final.txt','r')
deppars1 = fdep2.readline()
while deppars1:
    deppars1 = deppars1.replace('[','')
    deppars1 = deppars1.replace(']','')

    striptxt1 = deppars1.split(')),')
    
    relationlist1 = []
    
    for st in striptxt1:
        relationlist1.append(re.findall(r"[\w]+", st))
        
    fulldeplist1.append(relationlist1)

    deppars1 = fdep1.readline()
fdep1.close()

Now we take the dependency parsed list and extract noun-adjective and noun-adverb pairs for each review.

In [67]:
nounList = ['NN', 'NNS', 'NNP', 'NNPS']
adjAdvList = ['JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS']
newReviewText1 = ""
newDepList1 = []
cnt1 = 0
remClasses1 = []

for l in fulldeplist1:
    for m in l:
        if len(m) > 4:
            if m[1] in nounList:
                if m[4] in adjAdvList:
                    newReviewText1 = newReviewText1 + str(' ') + m[3] + str(' ') + m[0]

            elif m[4] in nounList:
                if m[1] in adjAdvList:
                    newReviewText1 = newReviewText1 + str(' ') + m[0] + str(' ') + m[3]

    if newReviewText1 != "":
        remClasses1.append(classes1[cnt2])
        newDepList1.append(newReviewText1)
    newReviewText1 = ""
    cnt1 += 1

Then we do Tfidf vectorization (count vectorization and tfidf transformation) so that they can be used in Naive Bayes and SVM classifier.

In [80]:
count_vectorizer1 = CountVectorizer(ngram_range=(1, 2))
data_tfidf1 = count_vectorizer2.fit_transform(newDepList1)
tfidf_data1 = TfidfTransformer(use_idf=False).fit_transform(data_tfidf1)

Now train a Bernoulli Naive Bayes classifier also doing 10 fold cross validation.

In [81]:
clf1NBA2 = BernoulliNB()
predicted1NBA2 = cross_val_predict(clf1NBA2, tfidf_data1, remClasses1, cv=10)

print(classification_report(remClasses1,predicted1NBA2))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(remClasses1, predicted1NBA2)))

             precision    recall  f1-score   support

         -1       0.36      0.22      0.28       666
          0       0.14      0.08      0.10       473
          1       0.64      0.80      0.71      1864

avg / total       0.50      0.56      0.52      3003

The accuracy score is 55.61%


Now we use `GridSearchCV` to find the optimal parameters for SVM.

In [82]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]}
clf1SVCA2GS = grid_search.GridSearchCV(svm.SVC(), parameters, cv=10)
clf1SVCA2GS.fit(tfidf_data1, remClasses1)
print(clf1SVCA2GS.best_params_)

{'C': 1, 'kernel': 'linear'}


Now we train SVM classifier with optimal parameters also doing 10 fold cross validation.

In [83]:
clf1SVCA2 = svm.SVC(kernel='linear', C=1, random_state=0)
predicted1SVCA2 = cross_val_predict(clf1SVCA2, tfidf_data1, remClasses1, cv=10)

print(classification_report(remClasses1,predicted1SVCA2))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(remClasses1, predicted1SVCA2)))

             precision    recall  f1-score   support

         -1       0.54      0.34      0.42       666
          0       0.26      0.08      0.12       473
          1       0.70      0.91      0.79      1864

avg / total       0.59      0.65      0.60      3003

The accuracy score is 65.33%


### If using this as final classifier.

In [188]:
clf1Final = svm.SVC(kernel='linear', C=1, random_state=0)
clf1Final.fit(tfidf_data1, classes1)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

# DATA 2

### Reading the data

In [204]:
with open('Data_2_train.csv', newline='') as csvfile2:
    reader2 = csv.DictReader(csvfile2)
    """
    for row in reader2:
        print(row['example_id'])
        print(row[' text'])
        print(row[' aspect_term'])
        print(row[' term_location'])
        print(row[' class'])
        print()        
    """
    
    
    singles2 = []
    texts2 = []
    tempText2 = ''
    classes2 = []
    aspectTerm2 = []
    
    for row in reader2:
        # aspect term
        aspTL = []
        for asp in tokenizer.tokenize(row[' aspect_term']):
            aspTL.append(wnl.lemmatize(asp))
        
        aspectTerm2.append(aspTL)
        
        # labels
        classes2.append(row[' class'])
        
        # review text
        tempText2 = row[' text'].replace('[comma]',',').lower()
               
        for hem in happy_emoji:
            if hem in tempText2:
                tempText2 = tempText2.replace(hem, 'happy')
        
        for sem in sad_emoji:
            if sem in tempText2:
                tempText2 = tempText2.replace(sem, 'sad')
                
        for negw in negative_words:
            if negw in tempText2:
                tempText2 = tempText2.replace(negw, 'not')
        
        for t in tokenizer.tokenize(tempText2):
            if t not in stop_words or t == 'not':
                singles2.append(wnl.lemmatize(t))
        
        texts2.append(singles2)
        singles2 = []
        
    newText2 = []
    
    for tk in texts2:
        newText2.append(' '.join(tk))

## Approach

### Sentiment analysis without considering aspect

Tf-idf vectorize the review text.

In [205]:
#count_vectorizer2 = CountVectorizer(ngram_range=(1, 1))
#data_tfidf2 = count_vectorizer2.fit_transform(newText2)
#tfidf_data2 = TfidfTransformer(use_idf=False).fit_transform(data_tfidf2)

tfi2 = TfidfVectorizer(ngram_range=(1, 1))
tfidf_data2 = tfi2.fit_transform(newText2)

Using Bernoulli Naive Bayes with cross validation.

In [206]:
clf2NB = BernoulliNB()
predicted2NB = cross_val_predict(clf2NB, tfidf_data2, classes2, cv=10)

print(classification_report(classes2,predicted2NB))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes2, predicted2NB)))

             precision    recall  f1-score   support

         -1       0.49      0.33      0.40       805
          0       0.28      0.20      0.23       633
          1       0.70      0.84      0.76      2164

avg / total       0.58      0.61      0.59      3602

The accuracy score is 61.35%


Now we will try the SVM but before doing that we use `GridSearchCV` for parameter tuning.

In [107]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]}
clf2SVCGS = grid_search.GridSearchCV(svm.SVC(), parameters, cv=10)
clf2SVCGS.fit(tfidf_data2, classes2)
print(clf2SVCGS.best_params_)

{'C': 1, 'kernel': 'linear'}


Now that we have the optimum parameters, we can pass the same data to a linear SVM with 10 fold cross validation.

In [207]:
clf2SVC = svm.SVC(kernel='linear', C=1, random_state=0)
predicted2SVC = cross_val_predict(clf2SVC, tfidf_data2, classes2, cv=10)

print(classification_report(classes2, predicted2SVC))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes2, predicted2SVC)))

             precision    recall  f1-score   support

         -1       0.55      0.43      0.48       805
          0       0.46      0.27      0.34       633
          1       0.74      0.89      0.81      2164

avg / total       0.65      0.68      0.65      3602

The accuracy score is 67.66%


### If using this as final classifier.

In [208]:
clf2Final = svm.SVC(kernel='linear', C=1, random_state=0)
clf2Final.fit(tfidf_data2, classes2)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

## Approach

### Read POS tagged data

Read from file and find nouns, adjectives, adverbs in the review text.

Now we do POS tagging of the pre-processed text and save the generated POS tags in a text file.

In [8]:
import nltk
# POS tagging
textPOS2 = []
    
for nt in newText2:
    textPOS2.append(nltk.pos_tag(nt.split()))
        
# save pos tags to txt file
f2pos = open('PostagsData2Full.txt','w')
for row in textPOS2:
    f2pos.write(str(row)+"\n")
f2pos.close()

Now we read the pos tags from the txt file.

In [40]:
combinedList2 = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS']

posTextList2 = []

finalReviewList2 = []
strTemp2 = ""

fpos2 = open('postagsData2Full.txt','r')
postags2 = fpos2.readline()

while postags2:            
    posTextList2 = re.findall(r"[\w]+", postags2)
    
    for i in range(len(posTextList2)):
        if posTextList2[i] in combinedList2:
            strTemp2 = strTemp2 + str(' ') + posTextList2[i - 1]
    finalReviewList2.append(strTemp2)
    strTemp2 = ""

    postags2 = fpos2.readline()
fpos2.close()

Now we have the new review text in a list named `finalReviewList2`. Now we use this list for count and tfidf transformation (which is equivalent to tfidf verctorization).

In [41]:
count_vectorizer2 = CountVectorizer(ngram_range=(1, 2))
data_tfidf2 = count_vectorizer2.fit_transform(finalReviewList2)
tfidf_data2 = TfidfTransformer(use_idf=False).fit_transform(data_tfidf2)

After vectorization, we now pass this data to Naive Bayes classifier with 10 fold cross validation.

In [42]:
clf2 = BernoulliNB()
predicted2 = cross_val_predict(clf2, tfidf_data2, classes2, cv=10)

print(classification_report(classes2,predicted2))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes2, predicted2)))

             precision    recall  f1-score   support

         -1       0.44      0.25      0.32       805
          0       0.20      0.10      0.13       633
          1       0.65      0.85      0.73      2164

avg / total       0.52      0.58      0.53      3602

The accuracy score is 58.22%


Now we will try the SVM but before doing that we use `GridSearchCV` for parameter tuning.

In [18]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]}
clf2 = grid_search.GridSearchCV(svm.SVC(), parameters, cv=10)
clf2.fit(tfidf_data2, classes2)
print(clf2.best_params_)

{'C': 1, 'kernel': 'linear'}


Now that we have the optimum parameters, we can pass the same data to a linear SVM with 10 fold cross validation.

In [43]:
clf2 = svm.SVC(kernel='linear', C=1, random_state=0)
predicted2 = cross_val_predict(clf2, tfidf_data2, classes2, cv=10)

print(classification_report(classes2, predicted2))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes2, predicted2)))

             precision    recall  f1-score   support

         -1       0.54      0.40      0.46       805
          0       0.42      0.19      0.27       633
          1       0.71      0.89      0.79      2164

avg / total       0.62      0.66      0.62      3602

The accuracy score is 65.80%


### If using this as final classifier.

In [208]:
clf2Final = svm.SVC(kernel='linear', C=1, random_state=0)
clf2Final.fit(tfidf_data2, classes2)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

## Approach

### Read dependency parsed data

Read dependency parsing data and extract nouns and the adjectives that affect them. We use the code below to get the dependency parsing output as a list.

Now we do the dependecy parsing using stanford corenlp.

In [63]:
depParsingList2 = []
    
dep_parser2=StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

for nt in newText2[3500:3602]:
    result2 = dep_parser2.raw_parse(nt)    
    dep2 = result2.__next__()    
    depParsingList2.append(list(dep2.triples()))
    
fdep2 = open('depParseData2P8.txt','w')
for dp in depParsingList2:
    fdep2.write(str(dp)+"\n")
fdep2.close()

As we now have our dependency parsed list in the txt file so now we can extract the aspect terms and the adjectives and adverbs which affect them.

In [65]:
fulldeplist2 = []

fdep2 = open('depParseData2Final.txt','r')
deppars2 = fdep2.readline()
while deppars2:
    #code for parsing dep
    deppars2 = deppars2.replace('[','')
    deppars2 = deppars2.replace(']','')

    striptxt2 = deppars2.split(')),')
    
    relationlist2 = []
    
    for st in striptxt2:
        relationlist2.append(re.findall(r"[\w]+", st))
        
    fulldeplist2.append(relationlist2)

    deppars2 = fdep2.readline()
fdep2.close()

Now we take the dependency parsed list and extract noun-adjective and noun-adverb pairs for each review.

In [67]:
nounList = ['NN', 'NNS', 'NNP', 'NNPS']
adjAdvList = ['JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS']
newReviewText2 = ""
newDepList2 = []
cnt2 = 0
remClasses2 = []

for l in fulldeplist2:
    for m in l:
        if len(m) > 4:
            if m[1] in nounList:
                if m[4] in adjAdvList:
                    newReviewText2 = newReviewText2 + str(' ') + m[3] + str(' ') + m[0]

            elif m[4] in nounList:
                if m[1] in adjAdvList:
                    newReviewText2 = newReviewText2 + str(' ') + m[0] + str(' ') + m[3]

    if newReviewText2 != "":
        remClasses2.append(classes2[cnt2])
        newDepList2.append(newReviewText2)
    newReviewText2 = ""
    cnt2 += 1

Then we do Tfidf vectorization (count vectorization and tfidf transformation) so that they can be used in Naive Bayes and SVM classifier.

In [80]:
count_vectorizer2 = CountVectorizer(ngram_range=(1, 2))
data_tfidf2 = count_vectorizer2.fit_transform(newDepList2)
tfidf_data2 = TfidfTransformer(use_idf=False).fit_transform(data_tfidf2)

Now train a Bernoulli Naive Bayes classifier also doing 10 fold cross validation.

In [81]:
clf2 = BernoulliNB()
predicted2 = cross_val_predict(clf2, tfidf_data2, remClasses2, cv=10)

print(classification_report(remClasses2,predicted2))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(remClasses2, predicted2)))

             precision    recall  f1-score   support

         -1       0.36      0.22      0.28       666
          0       0.14      0.08      0.10       473
          1       0.64      0.80      0.71      1864

avg / total       0.50      0.56      0.52      3003

The accuracy score is 55.61%


Now we use `GridSearchCV` to find the optimal parameters for SVM.

In [82]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]}
clf = grid_search.GridSearchCV(svm.SVC(), parameters, cv=10)
clf.fit(tfidf_data2, remClasses2)
print(clf.best_params_)

{'C': 1, 'kernel': 'linear'}


Now we train SVM classifier with optimal parameters also doing 10 fold cross validation.

In [83]:
clf = svm.SVC(kernel='linear', C=1, random_state=0)
predicted = cross_val_predict(clf, tfidf_data2, remClasses2, cv=10)

print(classification_report(remClasses2,predicted))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(remClasses2, predicted)))

             precision    recall  f1-score   support

         -1       0.54      0.34      0.42       666
          0       0.26      0.08      0.12       473
          1       0.70      0.91      0.79      1864

avg / total       0.59      0.65      0.60      3003

The accuracy score is 65.33%


### If using this as final classifier.

In [208]:
clf2Final = svm.SVC(kernel='linear', C=1, random_state=0)
clf2Final.fit(tfidf_data2, classes2)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

# Test Data

#### For Test Data 1:

Reading data from csv file and pre-processing it for prediction.

In [220]:
with open('Data-1_test.csv', newline='') as csvfile1:
    reader1 = csv.DictReader(csvfile1)
    '''
    for row in reader1:
        print(row['example_id'])
        print(row[' text'])
        print(row[' aspect_term'])
        print(row[' term_location'])
        print()        
    '''
    
     
    singles1 = []
    texts1 = []
    tempText1 = ''
    aspectTerm1 = []
    ids = []
    
    for row in reader1:
        #eample id
        ids.append(row['example_id'])
        
        # aspect term
        aspTL = []
        for asp in tokenizer.tokenize(row[' aspect_term']):
            aspTL.append(wnl.lemmatize(asp))
        
        aspectTerm1.append(aspTL)
        
        # review text
        tempText1 = row[' text'].replace('[comma]',',').lower()
               
        for hem in happy_emoji:
            if hem in tempText1:
                tempText1 = tempText1.replace(hem, 'happy')
        
        for sem in sad_emoji:
            if sem in tempText1:
                tempText1 = tempText1.replace(sem, 'sad')
                
        for negw in negative_words:
            if negw in tempText1:
                tempText1 = tempText1.replace(negw, 'not')
        
        for t in tokenizer.tokenize(tempText1):
            if t not in stop_words or t == 'not':
                singles1.append(wnl.lemmatize(t))
        
        texts1.append(singles1)
        singles1 = []
        
    newText1 = []
    
    for tk in texts1:
        newText1.append(' '.join(tk))

Converting to Tf-idf vectors.

In [221]:
tt = tfi.transform(newText1)

Predicting using trained classifier.

In [222]:
predicted1 = clf1Final.predict(tt)

Printing the final output txt file for data 1.

In [223]:
fout1 = open('outData2.txt','w')

cnt1 = 0

for dp in predicted1:
    fout1.write(str(ids[cnt1]) + ";;" +str(dp)+"\n")
    cnt1 += 1
fout1.close()

#### For Test Data 2:

Reading data from csv file and pre-processing it for prediction.

In [213]:
with open('Data-2_test.csv', newline='') as csvfile2:
    reader2 = csv.DictReader(csvfile2)
    """
    for row in reader2:
        print(row['example_id'])
        print(row[' text'])
        print(row[' aspect_term'])
        print(row[' term_location'])
        print(row[' class'])
        print()        
    """
    
    singles2 = []
    texts2 = []
    tempText2 = ''
    aspectTerm2 = []
    ids2 = []
    
    for row in reader2:
        ids2.append(row['example_id'])
        
        # aspect term
        aspTL = []
        for asp in tokenizer.tokenize(row[' aspect_term']):
            aspTL.append(wnl.lemmatize(asp))
        
        aspectTerm2.append(aspTL)
        
        # review text
        tempText2 = row[' text'].replace('[comma]',',').lower()
               
        for hem in happy_emoji:
            if hem in tempText2:
                tempText2 = tempText2.replace(hem, 'happy')
        
        for sem in sad_emoji:
            if sem in tempText2:
                tempText2 = tempText2.replace(sem, 'sad')
                
        for negw in negative_words:
            if negw in tempText2:
                tempText2 = tempText2.replace(negw, 'not')
        
        for t in tokenizer.tokenize(tempText2):
            if t not in stop_words or t == 'not':
                singles2.append(wnl.lemmatize(t))
        
        texts2.append(singles2)
        singles2 = []
        
    newText2 = []
    
    for tk in texts2:
        newText2.append(' '.join(tk))

Converting to Tf-idf vectors.

In [214]:
tt2 = tfi2.transform(newText2)

Predicting using trained classifier.

In [215]:
predicted2 = clf2Final.predict(tt2)

Printing the final output txt file for data 2.

In [216]:
fout2 = open('outData2.txt','w')

cnt2 = 0

for dp in predicted2:
    fout2.write(str(ids2[cnt2]) + ";;" + str(dp) + "\n")
    cnt2 += 1
fout2.close()