In [1]:
# Implementation of Gradient boosting trees

import numpy as np
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import json
import pandas as pd
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, auc, roc_curve, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits import mplot3d
import boto3
from boto.s3.key import Key
import timeit
from io import StringIO
from sklearn.ensemble import GradientBoostingClassifier 
import seaborn as sns

bucket_name = 'masidorov.cs229.ner'
data_file_name = 'all-items-2sets.json'

#Usage: https://rcarneva.github.io/understanding-gradient-boosting-part-1.html


In [2]:
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

# See bucket API: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Bucket.Object

data_lines = []
obj = bucket.Object(data_file_name)
body = obj.get()['Body'].read().splitlines()
data_lines = body
print(len(data_lines))

INDEX_ID = 0
INDEX_TAGGED = 1
INDEX_TEXT = 2
INDEX_ANNOTATION = 3

# Annotation constants
ANNOTATION_LABEL = 0
ANNOTATION_TEXT = 1
ANNOTATION_POINTS = 2

ANNOTATION_POINTS_START = 1
ANNOTATION_POINTS_STOP = 2

# structure we do support:
# (tagged, text, annotations)
# annotations = [annotation]
# annotation := (label, text, start, stop)
contents = []

id = 0
for line in data_lines:
    dat = json.loads(line)
    content = dat["content"]
    tuple_annotations = []
    annotations = dat["annotation"]
    tagged = False
    if (annotations is not None):
        tagged = True
        for annotation in annotations:
            label = annotation["label"][0]
            points = annotation["points"]
            tuple_annotation = (label)
            
            tuple_points = []
            for point in points:
                text = point["text"]
                start = point["start"]
                stop = point["end"]
                tuple_points.append((text, start, stop))
            tuple_annotations.append((label,text, tuple_points))
    contents.append((id, tagged, content, tuple_annotations))
    id = id + 1
    
    
#DEDUPLICATION
content2tag = {}
for tuple in contents:
    tagged = tuple[INDEX_TAGGED]
    text = tuple[INDEX_TEXT]
    if text not in content2tag.keys():
        content2tag[text] = tagged
    if (not content2tag[text]) and tagged:
        content2tag[text] = tagged
        
visited = {}
dedupe_context = []
for tuple in contents:
    tagged = tuple[INDEX_TAGGED]
    text = tuple[INDEX_TEXT]
    
    if text in visited.keys():
        continue
    
    if (content2tag[text] == tagged):
        visited[text] = True
        dedupe_context.append(tuple)
        
print("Before dedupe=", len(contents), "After dedupe=", len(dedupe_context))
contents = dedupe_context

8163
Before dedupe= 8163 After dedupe= 3914


In [3]:
# Now we create 2 lists for analysis per each record 
# word list: [w1,w2,...]
# tagging: [tag1, tag2,...] we assume that we have 1 tag per word
# Method we iterate via contents, for each line:
# 1) Extract text and extract intervals
# 2) scan in text words and for each word try to find start match in annotation list

# CONSTANTS WE USE
TUPLE_ID = 0
TUPLE_TAGGED_ID = 1
TUPLE_WORD_LIST = 2
TUPLE_TAG_LIST = 3

# Extraction of the tagged entities
def find_tag(start, stop, annotations):
    for annotation in annotations:
        points = annotation[ANNOTATION_POINTS]
        #print("Annot=", points)
        point = points[0]
        astart = point[ANNOTATION_POINTS_START]
        astop = point[ANNOTATION_POINTS_STOP]
        
        if (start == astart):
            return annotation[ANNOTATION_LABEL]
        
    return ""
    

# This function is doing a data processing
def process_tuple(content):
    id = content[INDEX_ID]
    tagged = content[INDEX_TAGGED]
    text = content[INDEX_TEXT]
    
    words = []
    tags = []
    
    annotations = content[INDEX_ANNOTATION]
    start = 0
    stop = 0
    for i in range(len(text) + 1):
        if (i == len(text)) or text[i] == ' ':
            stop = i
            wlen = stop - start
            
            if wlen > 0:
                tag = find_tag(start, stop, annotations)
                words.append(text[start:stop])
                tags.append(tag)

            start = stop + 1
    
    return (id, tagged, words, tags)


# We investigate the structure of ptuple and ann
# price like "under $300"
# price like "$300 after discount"
# "item under discount" item "for $300"
def augment_tuple(ptuple):
    
    return ptuple

pcontents = [process_tuple(content) for content in contents]



print(len(pcontents))

3914


In [6]:
# "item under discount" item "for $300"
def augment_tuple(ptuple, id):
    tagged = ptuple[INDEX_TAGGED]
    price_found = ('Price' in ptuple[TUPLE_TAG_LIST])
    if price_found or not tagged:
        return [(id, tagged, ptuple[TUPLE_WORD_LIST], ptuple[TUPLE_TAG_LIST])]
    
    res = [(id, tagged, ptuple[TUPLE_WORD_LIST], ptuple[TUPLE_TAG_LIST])]
    id = id + 1
    for i in range(3):
        id = id + 1
        words = ptuple[TUPLE_WORD_LIST]
        tags = ptuple[TUPLE_TAG_LIST]
        words = words + ['under']
        tags = tags + ['']
        
        price = np.random.randint(10, 1000)
        words = words + ['$'+str(price)]
        tags = tags + ['Price']
        res = res + [(id, tagged, words, tags)]
        
    
    return res


# Create training and dev set
ROW_TAGGED = len( [x for x in pcontents if x[INDEX_TAGGED] == True])
tagged_set = [x for x in pcontents if x[INDEX_TAGGED] == True]
train_set_index= np.random.choice(ROW_TAGGED, size=int(ROW_TAGGED*0.95), replace=False)

dev_set_index = list(set(range(ROW_TAGGED)) - set(train_set_index))
train_set = [tagged_set[i] for i in train_set_index]
dev_set = [tagged_set[i] for i in dev_set_index]


train_pcontents = []
id = 1
for pcontent in train_set:
    tuples = augment_tuple(pcontent, id)
    id = id + len(tuples)
    train_pcontents = train_pcontents + tuples


dev_pcontents = []
for pcontent in dev_set:
    tuples = augment_tuple(pcontent, id)
    id = id + len(tuples)
    dev_pcontents = dev_pcontents + tuples
    


#print(len(pcontents),len(ppcontents))
train_set = train_pcontents
dev_set = dev_pcontents

print('Train set size=', len(train_set), ' Dev set size=', len(dev_set))

Train set size= 6561  Dev set size= 349


In [7]:
# Definition of the features extraction

class FeatureExtractor:
    def __init__(self, name):
        self.name = name
        
    # sentense tuple is the result of process_tuple
    # we extract features from tuple and return feature value in format
    # (true/false, FeatureName, FeatureValue)
    def extract(self, word_id, sentense_tupe, addition_mode):
        pass
    
    # Return list of features supported by current features extractor
    def features_list(self):
        return [self.name]

# Check if Hypen is inside
class FE_HyphenInside(FeatureExtractor):
    def __init__(self):
        super(FE_HyphenInside, self).__init__('HyphenInside')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        return (True, self.name, '-' in word)


# Check if Hypen is inside
class FE_IsNumber(FeatureExtractor):
    def __init__(self):
        super(FE_IsNumber, self).__init__('IsNumber')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        return (True, self.name, word.isdigit())

# Check started with dollar decimal/ended with dollar
class FE_StartedDollar(FeatureExtractor):
    def __init__(self):
        super(FE_StartedDollar, self).__init__('StartedDollar')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        dollar =  len(word) > 0 and '$' == word[0] 
        return (True, self.name, dollar)


# Check started digit
class FE_StartedDigit(FeatureExtractor):
    def __init__(self):
        super(FE_StartedDigit, self).__init__('StartedDigit')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        digit =  len(word) > 0 and '9' >= word[0] and '0' <= word[0] 
        return (True, self.name, digit)

# Check end digit
class FE_EndDigit(FeatureExtractor):
    def __init__(self):
        super(FE_EndDigit, self).__init__('EndDigit')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        ll = len(word)
        digit =  ll > 0 and '9' >= word[ll-1] and '0' <= word[ll-1] 
        return (True, self.name, digit)


# True if no letters included
class FE_DoesNotHaveLetters(FeatureExtractor):
    def __init__(self):
        super(FE_DoesNotHaveLetters, self).__init__('NoLetters')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        for c in word:
            if c >= 'a' and c <= 'z':
                return (True, self.name, False)
            if c >= 'A' and c <= 'Z':
                return (True, self.name, False)
            
        return (True, self.name, True)


# Word position
class FE_WordPos(FeatureExtractor):
    def __init__(self):
        super(FE_WordPos, self).__init__('WordPosition')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        return (True, self.name, word_id)


# and before
class FE_And_Pos_M1(FeatureExtractor):
    def __init__(self):
        super(FE_And_Pos_M1, self).__init__('And_Pos_M1')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        andBefore = False
        if word_id > 0:
            word = sentense_tupe[TUPLE_WORD_LIST][word_id - 1]
            andBefore = ('and' == word)                        
        return (True, self.name, andBefore)

# and after
class FE_And_Pos_P1(FeatureExtractor):
    def __init__(self):
        super(FE_And_Pos_P1, self).__init__('And_Pos_P1')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        andAfter = False
        if word_id + 1< len(sentense_tupe[TUPLE_WORD_LIST]):
            word = sentense_tupe[TUPLE_WORD_LIST][word_id + 1]
            andAfter = ('and' == word)                        
        return (True, self.name, andAfter)


# Represent the current word itself
class FE_W0(FeatureExtractor):
    def __init__(self):
        super(FE_W0, self).__init__('W0_')
        self.word2index = {}
        self.current_index = int(0)

    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        if word not in self.word2index.keys():
            if not addition_mode:
                return (False, self.name, 0)
            self.word2index[word] = self.current_index
            self.current_index = int(self.current_index + 1)
        return (True, self.name+str(self.word2index[word]), 1)
    
    def features_list(self):
        return [self.name + str(i) for i in range(self.current_index)]
    


# Class which contains feature extractors we would like to apply
class FeatureExtractionContainer:
    def __init__(self):
        self.feature_extractors = [FE_HyphenInside(), FE_IsNumber(), 
                                   FE_W0(), FE_StartedDollar(), FE_DoesNotHaveLetters(),
                                   FE_StartedDigit(), FE_EndDigit(), FE_WordPos(),
                                   FE_And_Pos_M1(), FE_And_Pos_P1()]

    # This is extraction from one specific tuple
    def extract_from_tuple(self, word_id, sentense_tuple, addition_mode):
        features = {}
        for fe in self.feature_extractors:
            fe_result = fe.extract(word_id, sentense_tuple, addition_mode)
            if fe_result[0]:
                features[fe_result[1]] = fe_result[2]
        return features
    
    # result is a list of tuples which will include
    # word_features_list := (tuple_id, IS_TAGGED, word, word_id, tag, features)
    def process_sentense(self, sentense_tuple, addition_mode):
        result = []
        tuple_id = sentense_tuple[TUPLE_ID]
        tagged = sentense_tuple[TUPLE_TAGGED_ID]
        
        
        for word_id in range(len(sentense_tuple[TUPLE_WORD_LIST])):
            features = self.extract_from_tuple(word_id, sentense_tuple, addition_mode)
            tag = sentense_tuple[TUPLE_TAG_LIST][word_id]
            result.append( (tuple_id, tagged, sentense_tuple[TUPLE_WORD_LIST][word_id], word_id, tag, features) )
            
        return result

    # result is a list of tuples which will include
    # word_features_list := (tuple_id, IS_TAGGED, word, word_id, tag, features)
    def process_sentenses(self, sentense_tuples, addition_mode=True):
        result = []
        for sentense in sentense_tuples:
            result = result + self.process_sentense(sentense, addition_mode)
            
        return result
        
    # This function takes list of tuples produced by
    # process_sentense and put it in nice pandas.DataFrame
    def features_pandalizer(self, word_features_list):
        features_vector = {}
        features_vector['TupleID'] = []
        features_vector['Tagged'] = []
        features_vector['Tag'] = []
        features_vector['word'] = []
        features_vector['WordID'] = []
        
        # Inject features
        for fe in self.feature_extractors:
            for fname in fe.features_list():
                features_vector[fname] = []
        
        # Phase of creating long lists
        for word_features in word_features_list:
            features_vector['TupleID'].append(word_features[0])
            features_vector['Tagged'].append(word_features[1])
            features_vector['Tag'].append(word_features[4])
            features_vector['word'].append(word_features[2])
            features_vector['WordID'].append(word_features[3])
            
            # working with features
            for fe in self.feature_extractors:
                for fname in fe.features_list():
                    if fname in word_features[5].keys():
                        #print('FName=', fname, ' Value=', word_features[5][fname])
                        features_vector[fname].append(int(word_features[5][fname]))
                    else:
                        features_vector[fname].append( 0)
                        
            #print(features_vector)
                        
                        
            
        
        df = pd.DataFrame(features_vector)
        return df
    
    
    
        
#DEBUGGING
print(pcontents[0])
fec= FeatureExtractionContainer()
#all_features= fec.process_sentenses(pcontents)
#print(all_features)
#df = fec.features_pandalizer(all_features)
#print(df.head())


(0, True, ['apple', 'watch'], ['Brand', 'Category'])


In [None]:
%%time
# do split traint/dev set
# count of rows which

fex = FE_W0()
fec= FeatureExtractionContainer()

#print(pcontents[101], fex.extract(0, pcontents[101], False))

#tid = 102
#print(pcontents[tid])

#all_features= fec.process_sentenses(pcontents[101:102], False)

#df = fec.features_pandalizer(all_features)
#print(df.head())

# Arrange trainig and test set



fec= FeatureExtractionContainer()
train_set_feature= fec.process_sentenses(train_set)
dev_set_feature= fec.process_sentenses(dev_set, False)



df_train = fec.features_pandalizer(train_set_feature)

df_dev = fec.features_pandalizer(dev_set_feature)



In [23]:
start_time = timeit.default_timer()

all_labels = ['Brand', 'Category', 'ModelName', 'Price']
cls_per_ner = {}

df_train_result = df_train.loc[:, ['TupleID', 'word','Tag']].copy()
df_train_result = df_train_result.reindex(columns=df_train_result.columns.tolist() + all_labels)

#GBM parameters


stage_preds = {}
final_preds = {}

depth = 3
lr = 0.5
estimators = [300,400,500,600,700,800]
est = 500

#all_labels = [ 'Category']
for label in all_labels:
    #df.head()
    print('Start label = ', label)
    common_args2 = {'max_depth': depth, 'n_estimators': est, 'subsample': 0.9, 'random_state': 2}
    model  = GradientBoostingClassifier(learning_rate=lr, **common_args2)

    df_train_copy = df_train.loc[:].copy()
    df_train_copy['Y'] = df_train_copy.apply(lambda row: (1 if row['Tag'] == label else 0), axis=1)

    df_train_copy.drop(['word', 'WordID', 'Tagged', 'TupleID', 'Tag'], axis=1, inplace=True)
    #df1.head()

    X_train = df_train_copy.drop('Y', axis=1)
    Y_train = df_train_copy['Y']

    #svc = SVC(kernel='rbf', gamma=0.7, probability=True)
    clf = model.fit(X_train, Y_train)
    cls_per_ner[label] = clf
          
    y_pred = clf.predict(X_train)
    y_pred_porbability = clf.predict_proba(X_train)
    df_train_result[label] = y_pred_porbability[:,1]
    cls_per_ner[label] = clf

    print("Accuracy:",metrics.accuracy_score(Y_train, y_pred), metrics.precision_score(Y_train, y_pred))
    print(confusion_matrix(Y_train, y_pred))

    print(classification_report(Y_train, y_pred))

elapsed = timeit.default_timer() - start_time
print('Elpased time=', elapsed)

csv_buffer = StringIO()
df_train_result.to_csv(csv_buffer)
s3.Object(bucket_name, 'train_gbt_result.csv').put(Body=csv_buffer.getvalue())


#DEV set

df_dev_result = df_dev.loc[:, ['TupleID', 'word','Tag']].copy()
df_dev_result = df_dev_result.reindex(columns=df_dev_result.columns.tolist() + all_labels)
          
for label in all_labels:
              # Training error
    df_dev_copy = df_dev.copy()
    df_dev_copy['Y'] = df_dev_copy.apply(lambda row: (1 if row['Tag'] == label else 0), axis=1)
    df_dev_copy.drop(['word', 'WordID', 'Tagged', 'TupleID', 'Tag'], axis=1, inplace=True)

    X_dev = df_dev_copy.drop('Y', axis=1)
    Y_dev = df_dev_copy['Y']

    clf = cls_per_ner[label]
    y_pred = clf.predict(X_dev)
    y_pred_porbability = clf.predict_proba(X_dev)
    df_dev_result[label] = y_pred_porbability[:,1]          

    print('==========================')
    print('Label=', label)
    print('==========================')
    print("Accuracy:",metrics.accuracy_score(Y_dev, y_pred), "Detemined brands=", np.sum(y_pred), np.sum(Y_dev))
    print("Accuracy:",metrics.accuracy_score(Y_dev, y_pred), metrics.recall_score(Y_dev, y_pred))

csv_buffer = StringIO()
df_dev_result.to_csv(csv_buffer)
s3.Object(bucket_name, 'dev_gbm_result.csv').put(Body=csv_buffer.getvalue())
          
elapsed = timeit.default_timer() - start_time
print('Elpased time=', elapsed)

Start label =  Brand
Accuracy: 0.9730615791760477 0.8962616822429906
[[22717   222]
 [  460  1918]]
             precision    recall  f1-score   support

          0       0.98      0.99      0.99     22939
          1       0.90      0.81      0.85      2378

avg / total       0.97      0.97      0.97     25317

Start label =  Category
Accuracy: 0.946676146462851 0.8826742407690628
[[19927   537]
 [  813  4040]]
             precision    recall  f1-score   support

          0       0.96      0.97      0.97     20464
          1       0.88      0.83      0.86      4853

avg / total       0.95      0.95      0.95     25317

Start label =  ModelName
Accuracy: 0.9645692617608721 0.8928571428571429
[[21345   369]
 [  528  3075]]
             precision    recall  f1-score   support

          0       0.98      0.98      0.98     21714
          1       0.89      0.85      0.87      3603

avg / total       0.96      0.96      0.96     25317

Start label =  Price
Accuracy: 0.9989730220800253

In [21]:
df_dev_result[:100]


Unnamed: 0,TupleID,word,Tag,Category
0,6571,apple,Brand,0.084158
1,6571,watch,Category,0.803335
2,6571,series,ModelName,0.105771
3,6571,3,ModelName,0.004829
4,6573,apple,Brand,0.084158
5,6573,watch,Category,0.803335
6,6573,series,ModelName,0.105771
7,6573,3,ModelName,0.004829
8,6573,under,,0.007069
9,6573,$431,Price,0.004829
