# Make sure all packages are installed by running the first cell 

In [41]:
import re
import collections
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import pandas as pd
from sklearn import metrics
from itertools import chain
from collections import Counter

import pycrfsuite
from sklearn.preprocessing import LabelBinarizer

# Run this cell to download nltk data if you haven't done it already

In [42]:
nltk.download('all') #or just 'stopwords' and 'snowball_data' instead of 'all'

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Us

[nltk_data]    |   Package semcor is already up-to-date!
[nltk_data]    | Downloading package senseval to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package senseval is already up-to-date!
[nltk_data]    | Downloading package sentiwordnet to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package sentiwordnet is already up-to-date!
[nltk_data]    | Downloading package sentence_polarity to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package sentence_polarity is already up-to-date!
[nltk_data]    | Downloading package shakespeare to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package shakespeare is already up-to-date!
[nltk_data]    | Downloading package sinica_treebank to
[nltk_data]    |     C:\Users\lw39km\AppData\Roaming\nltk_data...
[nltk_data]    |   Package sinica_treebank is already up-to-date!
[nltk_data]    | Downloadin

True

# Use English stop words and snowball stem

In [43]:
stops = set(stopwords.words("english"))
stemmer = SnowballStemmer('english')

# Run these two cell to make sure you have the data

In [44]:
pd.read_csv("data/cleaned_train.csv").head()

Unnamed: 0,review,terms
0,i charge it at night and skip taking the cord ...,"cord,battery life"
1,i bought a hp pavilion dv4-1222nr laptop and h...,
2,which be the retail shop which i bought my net...,
3,the tech guy then said the service center does...,"service center, sales team,tech guy"
4,i investigated netbooks and saw the toshiba nb...,


In [45]:
pd.read_csv("data/final_train.csv").head()

Unnamed: 0,review,terms
0,i charge it at night and skip taking the cord ...,"O,O,O,O,O,O,O,O,BB,B,EB,O,O,O,BB,BB,B,I"
1,i bought a hp pavilion dv4-1222nr laptop and h...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,which be the retail shop which i bought my net...,"O,O,O,O,O,O,O,O,O,O,O"
3,the tech guy then said the service center does...,"BB,B,I,EI,O,BB,B,I,EI,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,i investigated netbooks and saw the toshiba nb...,"O,O,O,O,O,O,O,O"


# Now define feature funtions

# Feature function for single token:

In [58]:
def word2features(sent, i):
    """
    build CRF feature for i th word in sentance
    :param sent: the sentence
    :param i: index of the desired word
    :return: list of strings as features
    """
    #features for current word
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.stem=' + stemmer.stem(word),
        'word[:1]=' + word[:1],
        'word[:2]=' + word[:2],
        'word[:3]=' + word[:3],
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word[-1:]=' + word[-1:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.isstop=%s' % (word in stops),
        'word.length=%s' % len(word),
        #'word.long=%s' % (len(word) >= 5),
        'postag=' + postag,
        'postag[:2]=' + postag[:2]
    ]
    if i > 0:
        #features for previous word
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.stem=' + stemmer.stem(word1),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.length=%s' % len(word1),
            '-1:word.isstop=%s' % (word1 in stops),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        # no previous, then it's beginning of sentence
        features.append('BOS')

    if i < len(sent) - 1:
        #features for next word
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.stem=' + stemmer.stem(word1),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.length=%s' % len(word1),
            '+1:word.isstop=%s' % (word1 in stops),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        #if no next, then it's end of sentence
        features.append('EOS')

    return features

# Feature function for one sentence, and label splitor

In [59]:
def sent2features(sent):
    """
    build feature list for sentence
    :param sent: ...
    :return: list of CRF word features
    """
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(terms):
    """
    build labels for sentence
    :param terms: the terms for the disired sentence
    :return: 2D list of labels shape is [sentence * word]
    """
    return [label for label in terms.split(',')]

# Function to add frequent terms feature

In [60]:
def add_frequent_terms_to_feature(X, cleaned_fn, top_num=100):
    """
    Add 'if the current word is in the top (top_num) frequent
    aspect terms from training data' in features
    :param X: the 2D list of word features, shape: [sentence * word]
    :param cleaned_fn: the cleaned csv filename
    :param top_num: as name
    :return:
    """
    df = pd.read_csv(cleaned_fn)
    df_train = pd.read_csv("data/cleaned_train.csv")
    terms_list = df_train['terms'].apply(lambda row: re.split("[ ,;:(]", str(row)))

    terms_list = [item for sublist in terms_list for item in sublist]
    frequent_terms = list(dict(collections.Counter(terms_list).most_common(top_num)).keys())


    doc = list(df['review'].apply(lambda review: review.split()))

    for i, sentence_feature in enumerate(X):
        for j, word_feature in enumerate(sentence_feature):
            if (not doc[i][j] in stops) and (doc[i][j] in frequent_terms):
                word_feature.append("{}_FRQ_TERM".format(top_num))

    return X

# Function to generate X and y from data

In [61]:
def constructSemanticInput(cleaned_fn):
    """
    construct semantic inputs for modelling
    :param tagged_fn: tagged csv filename
    :param cleaned_fn: cleaned csv filename
    :return: (X, y)
    """
    df = pd.read_csv(cleaned_fn)
    sentences = df['review'].apply(lambda review: nltk.pos_tag(review.split()))

    X = [sent2features(s) for s in sentences]
    y = [sent2labels(term) for term in df['terms']]

    #add if it is frequent aspect term to feature
    #add_frequent_terms_to_feature(X, cleaned_fn, top_num=200)
    add_frequent_terms_to_feature(X, cleaned_fn)
    add_frequent_terms_to_feature(X, cleaned_fn, top_num=50)
    add_frequent_terms_to_feature(X, cleaned_fn, top_num=10)


    return X, y

# Generate X and y for both training data and test data

In [62]:
X_train, y_train = constructSemanticInput("data/final_train.csv")
X_test, y_test = constructSemanticInput("data/final_test.csv")

# Check what X and y looks like:

In [63]:
print("Features: ", X_train[0][16]) #check the feature and tag for 'battery'
print("Tag: ", y_train[0][16])

Features:  ['bias', 'word.lower=battery', 'word.stem=batteri', 'word[:1]=b', 'word[:2]=ba', 'word[:3]=bat', 'word[-3:]=ery', 'word[-2:]=ry', 'word[-1:]=y', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'word.isstop=False', 'word.length=7', 'postag=NN', 'postag[:2]=NN', '-1:word.lower=good', '-1:word.stem=good', '-1:word.istitle=False', '-1:word.isupper=False', '-1:word.length=4', '-1:word.isstop=False', '-1:postag=JJ', '-1:postag[:2]=JJ', '+1:word.lower=life', '+1:word.stem=life', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.length=4', '+1:word.isstop=False', '+1:postag=NN', '+1:postag[:2]=NN', '100_FRQ_TERM', '50_FRQ_TERM', '10_FRQ_TERM']
Tag:  B


# Build the Class for CRF, including fit and predict method, as well as some info output

In [64]:
class CRFClassifier:
    def __init__(self, c1=1.0, c2=1e-3, max_iterations=50):
        """
        make a Classifier class that has similar interface of sklearn for easy benchmark
        :param c1: coefficient for L1 penalty
        :param c2: coefficient for L2 penalty
        :param max_iterations: for early stop
        """
        self.clf = None
        self.params = {
            #'c1': c1,
            #'c2': c2,
            'max_iterations': max_iterations,

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        }


    def fit(self, X_train, y_train):
        """
        like sklearn
        :param X_train:
        :param y_train:
        :return:
        """
        trainer = pycrfsuite.Trainer(verbose=True, algorithm='ap')

        for xseq, yseq in zip(X_train, y_train):
            trainer.append(xseq, yseq)

        trainer.set_params(self.params)
        print(trainer.get_params())
        trainer.train('crf_model.crfsuite')

    def predict(self, X_test):
        """
        like sklearn
        :param X_test:
        :return: predictions
        """
        tagger = pycrfsuite.Tagger()
        tagger.open('crf_model.crfsuite')

        y_pred = [tagger.tag(xseq) for xseq in X_test]
        return y_pred

    def learned_transitions(self, top_num=15):
        """
        print and return top (top_num) learned transitions
        :param top_num:
        :return:
        """
        def print_transitions(trans_features):
            for (label_from, label_to), weight in trans_features:
                print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

        tagger = pycrfsuite.Tagger()
        tagger.open('crf_model.crfsuite')

        info = tagger.info()

        print("Top likely transitions:")
        top = Counter(info.transitions).most_common(top_num)
        print_transitions(top)

        print("\nTop unlikely transitions:")
        bottom = Counter(info.transitions).most_common()[-top_num:]
        print_transitions(bottom)

        return top, bottom

    def state_features(self, top_num=20):
        """
        print and return top (top_num) state features
        :param top_num:
        :return:
        """
        def print_state_features(state_features):
            for (attr, label), weight in state_features:
                print("%0.6f %-6s %s" % (weight, label, attr))

        tagger = pycrfsuite.Tagger()
        tagger.open('crf_model.crfsuite')

        info = tagger.info()

        print("Top positive:")
        top = Counter(info.state_features).most_common(top_num)
        print_state_features(top)

        print("\nTop negative:")
        bottom = Counter(info.state_features).most_common()[-top_num:]
        print_state_features(bottom)

        return top, bottom

# Function to print classification report

In [65]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics.
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return metrics.classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )

# Fit and Predict

In [66]:
crf = CRFClassifier()
crf.fit(X_train, y_train)
pred = crf.predict(X_test)

{'feature.minfreq': 0.0, 'feature.possible_states': False, 'feature.possible_transitions': True, 'max_iterations': 50, 'epsilon': 0.0}
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 37560
Seconds required: 0.353

Averaged perceptron
max_iterations: 50
epsilon: 0.000000

***** Iteration #1 *****
Loss: 685.314039
Feature norm: 275.787552
Seconds required for this iteration: 0.080

***** Iteration #2 *****
Loss: 548.485490
Feature norm: 352.017527
Seconds required for this iteration: 0.069

***** Iteration #3 *****
Loss: 484.741735
Feature norm: 412.635814
Seconds required for this iteration: 0.062

***** Iteration #4 *****
Loss: 455.382677
Feature norm: 465.605268
Seconds required for this iteration: 0.065

***** Iteration #5 *****
Loss: 432.565617
Feature norm: 514.104758
Seconds required for this iteration: 0.074

***** Iteration #6 *****
Loss: 3

# Print classification report:

In [67]:
print(bio_classification_report(y_test, pred))

             precision    recall  f1-score   support

          B       0.87      0.67      0.76       652
         BB       0.81      0.60      0.69       547
         EB       0.77      0.61      0.68       266
         EI       0.83      0.53      0.65       171
          I       0.93      0.54      0.68       414
          O       0.92      0.99      0.95      8367

avg / total       0.91      0.91      0.90     10417



# Print Top likely and unlikely transitions that CRF learned:

In [68]:
top_trans, bottom_trans = crf.learned_transitions()

Top likely transitions:
B      -> I       79.619560
BB     -> B       59.729808
I      -> I       58.590410
I      -> EI      56.745375
B      -> EB      54.623753
O      -> O       43.473488
O      -> BB      31.168927
EB     -> O       26.905282
EI     -> O       26.555195
EI     -> BB      18.496297
EB     -> BB      17.585893
O      -> EB      11.661120
O      -> EI      11.302425
BB     -> O       9.638521
EI     -> B       -0.215197

Top unlikely transitions:
EB     -> EI      -15.159377
BB     -> I       -16.963591
BB     -> EI      -17.278464
B      -> EI      -17.288708
I      -> BB      -21.164356
EB     -> I       -21.793924
BB     -> EB      -22.065458
B      -> B       -22.754303
I      -> EB      -26.977708
EI     -> I       -29.416167
O      -> B       -29.775190
B      -> BB      -44.856615
B      -> O       -50.731111
I      -> O       -51.740098
O      -> I       -66.555687


# Print Top likely and unlikely state features that CRF learned:

In [69]:
top_stats, bottom_stats = crf.state_features()

Top positive:
74.848714 B      100_FRQ_TERM
71.263186 I      100_FRQ_TERM
44.337993 I      +1:word.lower=hook
42.984794 I      +1:word.stem=hook
41.628580 O      -1:word.lower=computer
40.942927 B      word[-3:]=ges
39.610473 B      word.stem=surf
38.654594 B      word.isstop=False
38.306210 I      -1:word.lower=easy
38.306210 I      -1:word.stem=easi
38.012759 B      +1:word.lower=have
37.927073 B      word[:2]=iw
37.764206 B      -1:word.stem=use
37.494674 O      -1:word.stem=book
37.185100 BB     +1:word.lower=vista
37.185100 BB     +1:word.stem=vista
37.021972 B      +1:word.stem=have
36.993776 I      +1:word.lower=resolution
36.598559 I      -1:word.lower=virus
36.597041 EB     -1:word.stem=applic

Top negative:
-30.937401 O      word.stem=surf
-31.379464 O      -1:word.stem=size
-31.465494 O      +1:word.lower=shipped
-32.160041 O      +1:word.stem=featur
-32.294557 B      word[-2:]=st
-32.800989 O      +1:word.stem=applic
-33.899546 O      +1:word.stem=connect
-34.574228 B      