In [1]:
import numpy as np
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import json
import pandas as pd
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, auc, roc_curve, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits import mplot3d
import boto3
from boto.s3.key import Key
import timeit
from io import StringIO

bucket_name = 'masidorov.cs229.ner'

In [38]:
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

data_lines = []
for obj in bucket.objects.all():
    key = obj.key
    body = obj.get()['Body'].read().splitlines()
    data_lines = body
print(len(data_lines))

INDEX_ID = 0
INDEX_TAGGED = 1
INDEX_TEXT = 2
INDEX_ANNOTATION = 3

# Annotation constants
ANNOTATION_LABEL = 0
ANNOTATION_TEXT = 1
ANNOTATION_POINTS = 2

ANNOTATION_POINTS_START = 1
ANNOTATION_POINTS_STOP = 2

# structure we do support:
# (tagged, text, annotations)
# annotations = [annotation]
# annotation := (label, text, start, stop)
contents = []

id = 0
for line in data_lines:
    dat = json.loads(line)
    content = dat["content"]
    tuple_annotations = []
    annotations = dat["annotation"]
    tagged = False
    if (annotations is not None):
        tagged = True
        for annotation in annotations:
            label = annotation["label"][0]
            points = annotation["points"]
            tuple_annotation = (label)
            
            tuple_points = []
            for point in points:
                text = point["text"]
                start = point["start"]
                stop = point["end"]
                tuple_points.append((text, start, stop))
            tuple_annotations.append((label,text, tuple_points))
    contents.append((id, tagged, content, tuple_annotations))
    id = id + 1
    
    
#DEDUPLICATION
content2tag = {}
for tuple in contents:
    tagged = tuple[INDEX_TAGGED]
    text = tuple[INDEX_TEXT]
    if text not in content2tag.keys():
        content2tag[text] = tagged
    if (not content2tag[text]) and tagged:
        content2tag[text] = tagged
        
visited = {}
dedupe_context = []
for tuple in contents:
    tagged = tuple[INDEX_TAGGED]
    text = tuple[INDEX_TEXT]
    
    if text in visited.keys():
        continue
    
    if (content2tag[text] == tagged):
        visited[text] = True
        dedupe_context.append(tuple)
        
print("Before dedupe=", len(contents), "After dedupe=", len(dedupe_context))
contents = dedupe_context

8163
Before dedupe= 8163 After dedupe= 3914


In [39]:
# Now we create 2 lists for analysis per each record 
# word list: [w1,w2,...]
# tagging: [tag1, tag2,...] we assume that we have 1 tag per word
# Method we iterate via contents, for each line:
# 1) Extract text and extract intervals
# 2) scan in text words and for each word try to find start match in annotation list

# CONSTANTS WE USE
TUPLE_ID = 0
TUPLE_TAGGED_ID = 1
TUPLE_WORD_LIST = 2
TUPLE_TAG_LIST = 3

# Extraction of the tagged entities
def find_tag(start, stop, annotations):
    for annotation in annotations:
        points = annotation[ANNOTATION_POINTS]
        #print("Annot=", points)
        point = points[0]
        astart = point[ANNOTATION_POINTS_START]
        astop = point[ANNOTATION_POINTS_STOP]
        
        if (start == astart):
            return annotation[ANNOTATION_LABEL]
        
    return ""
    

# This function is doing a data processing
def process_tuple(content):
    id = content[INDEX_ID]
    tagged = content[INDEX_TAGGED]
    text = content[INDEX_TEXT]
    
    words = []
    tags = []
    
    annotations = content[INDEX_ANNOTATION]
    start = 0
    stop = 0
    for i in range(len(text) + 1):
        if (i == len(text)) or text[i] == ' ':
            stop = i
            wlen = stop - start
            
            if wlen > 0:
                tag = find_tag(start, stop, annotations)
                words.append(text[start:stop])
                tags.append(tag)

            start = stop + 1
    
    return (id, tagged, words, tags)


# We investigate the structure of ptuple and ann
# price like "under $300"
# price like "$300 after discount"
# "item under discount" item "for $300"
def augment_tuple(ptuple):
    
    return ptuple

pcontents = [process_tuple(content) for content in contents]



print(len(pcontents))

3914


In [40]:
# "item under discount" item "for $300"
def augment_tuple(ptuple, id):
    tagged = ptuple[INDEX_TAGGED]
    price_found = ('Price' in ptuple[TUPLE_TAG_LIST])
    if price_found or not tagged:
        return [(id, tagged, ptuple[TUPLE_WORD_LIST], ptuple[TUPLE_TAG_LIST])]
    
    res = [(id, tagged, ptuple[TUPLE_WORD_LIST], ptuple[TUPLE_TAG_LIST])]
    id = id + 1
    for i in range(3):
        id = id + 1
        words = ptuple[TUPLE_WORD_LIST]
        tags = ptuple[TUPLE_TAG_LIST]
        words = words + ['under']
        tags = tags + ['']
        
        price = np.random.randint(10, 1000)
        words = words + ['$'+str(price)]
        tags = tags + ['Price']
        res = res + [(id, tagged, words, tags)]
        
    
    return res


# Create training and dev set
ROW_TAGGED = len( [x for x in pcontents if x[INDEX_TAGGED] == True])
tagged_set = [x for x in pcontents if x[INDEX_TAGGED] == True]
train_set_index= np.random.choice(ROW_TAGGED, size=int(ROW_TAGGED*0.95), replace=False)

dev_set_index = list(set(range(ROW_TAGGED)) - set(train_set_index))
train_set = [tagged_set[i] for i in train_set_index]
dev_set = [tagged_set[i] for i in dev_set_index]


train_pcontents = []
id = 1
for pcontent in train_set:
    tuples = augment_tuple(pcontent, id)
    id = id + len(tuples)
    train_pcontents = train_pcontents + tuples


dev_pcontents = []
for pcontent in dev_set:
    tuples = augment_tuple(pcontent, id)
    id = id + len(tuples)
    dev_pcontents = dev_pcontents + tuples
    


#print(len(pcontents),len(ppcontents))
train_set = train_pcontents
dev_set = dev_pcontents

print('Train set size=', len(train_set), ' Dev set size=', len(dev_set))

Train set size= 6564  Dev set size= 346


In [41]:
# Definition of the features extraction

class FeatureExtractor:
    def __init__(self, name):
        self.name = name
        
    # sentense tuple is the result of process_tuple
    # we extract features from tuple and return feature value in format
    # (true/false, FeatureName, FeatureValue)
    def extract(self, word_id, sentense_tupe, addition_mode):
        pass
    
    # Return list of features supported by current features extractor
    def features_list(self):
        return [self.name]

# Check if Hypen is inside
class FE_HyphenInside(FeatureExtractor):
    def __init__(self):
        super(FE_HyphenInside, self).__init__('HyphenInside')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        return (True, self.name, '-' in word)


# Check if Hypen is inside
class FE_IsNumber(FeatureExtractor):
    def __init__(self):
        super(FE_IsNumber, self).__init__('IsNumber')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        return (True, self.name, word.isdigit())

# Check started with dollar decimal/ended with dollar
class FE_StartedDollar(FeatureExtractor):
    def __init__(self):
        super(FE_StartedDollar, self).__init__('StartedDollar')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        dollar =  len(word) > 0 and '$' == word[0] 
        return (True, self.name, dollar)


# Check started digit
class FE_StartedDigit(FeatureExtractor):
    def __init__(self):
        super(FE_StartedDigit, self).__init__('StartedDigit')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        digit =  len(word) > 0 and '9' >= word[0] and '0' <= word[0] 
        return (True, self.name, digit)

# Check end digit
class FE_EndDigit(FeatureExtractor):
    def __init__(self):
        super(FE_EndDigit, self).__init__('EndDigit')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        ll = len(word)
        digit =  ll > 0 and '9' >= word[ll-1] and '0' <= word[ll-1] 
        return (True, self.name, digit)


# True if no letters included
class FE_DoesNotHaveLetters(FeatureExtractor):
    def __init__(self):
        super(FE_DoesNotHaveLetters, self).__init__('NoLetters')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        for c in word:
            if c >= 'a' and c <= 'z':
                return (True, self.name, False)
            if c >= 'A' and c <= 'Z':
                return (True, self.name, False)
            
        return (True, self.name, True)


# Word position
class FE_WordPos(FeatureExtractor):
    def __init__(self):
        super(FE_WordPos, self).__init__('WordPosition')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        return (True, self.name, word_id)


# and before
class FE_And_Pos_M1(FeatureExtractor):
    def __init__(self):
        super(FE_And_Pos_M1, self).__init__('And_Pos_M1')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        andBefore = False
        if word_id > 0:
            word = sentense_tupe[TUPLE_WORD_LIST][word_id - 1]
            andBefore = ('and' == word)                        
        return (True, self.name, andBefore)

# and after
class FE_And_Pos_P1(FeatureExtractor):
    def __init__(self):
        super(FE_And_Pos_P1, self).__init__('And_Pos_P1')
        
    def extract(self, word_id, sentense_tupe, addition_mode):
        andAfter = False
        if word_id + 1< len(sentense_tupe[TUPLE_WORD_LIST]):
            word = sentense_tupe[TUPLE_WORD_LIST][word_id + 1]
            andAfter = ('and' == word)                        
        return (True, self.name, andAfter)


# Represent the current word itself
class FE_W0(FeatureExtractor):
    def __init__(self):
        super(FE_W0, self).__init__('W0_')
        self.word2index = {}
        self.current_index = int(0)

    def extract(self, word_id, sentense_tupe, addition_mode):
        word = sentense_tupe[TUPLE_WORD_LIST][word_id]
        if word not in self.word2index.keys():
            if not addition_mode:
                return (False, self.name, 0)
            self.word2index[word] = self.current_index
            self.current_index = int(self.current_index + 1)
        return (True, self.name+str(self.word2index[word]), 1)
    
    def features_list(self):
        return [self.name + str(i) for i in range(self.current_index)]
    


# Class which contains feature extractors we would like to apply
class FeatureExtractionContainer:
    def __init__(self):
        self.feature_extractors = [FE_HyphenInside(), FE_IsNumber(), 
                                   FE_W0(), FE_StartedDollar(), FE_DoesNotHaveLetters(),
                                   FE_StartedDigit(), FE_EndDigit(), FE_WordPos(),
                                   FE_And_Pos_M1(), FE_And_Pos_P1()]

    # This is extraction from one specific tuple
    def extract_from_tuple(self, word_id, sentense_tuple, addition_mode):
        features = {}
        for fe in self.feature_extractors:
            fe_result = fe.extract(word_id, sentense_tuple, addition_mode)
            if fe_result[0]:
                features[fe_result[1]] = fe_result[2]
        return features
    
    # result is a list of tuples which will include
    # word_features_list := (tuple_id, IS_TAGGED, word, word_id, tag, features)
    def process_sentense(self, sentense_tuple, addition_mode):
        result = []
        tuple_id = sentense_tuple[TUPLE_ID]
        tagged = sentense_tuple[TUPLE_TAGGED_ID]
        
        
        for word_id in range(len(sentense_tuple[TUPLE_WORD_LIST])):
            features = self.extract_from_tuple(word_id, sentense_tuple, addition_mode)
            tag = sentense_tuple[TUPLE_TAG_LIST][word_id]
            result.append( (tuple_id, tagged, sentense_tuple[TUPLE_WORD_LIST][word_id], word_id, tag, features) )
            
        return result

    # result is a list of tuples which will include
    # word_features_list := (tuple_id, IS_TAGGED, word, word_id, tag, features)
    def process_sentenses(self, sentense_tuples, addition_mode=True):
        result = []
        for sentense in sentense_tuples:
            result = result + self.process_sentense(sentense, addition_mode)
            
        return result
        
    # This function takes list of tuples produced by
    # process_sentense and put it in nice pandas.DataFrame
    def features_pandalizer(self, word_features_list):
        features_vector = {}
        features_vector['TupleID'] = []
        features_vector['Tagged'] = []
        features_vector['Tag'] = []
        features_vector['word'] = []
        features_vector['WordID'] = []
        
        # Inject features
        for fe in self.feature_extractors:
            for fname in fe.features_list():
                features_vector[fname] = []
        
        # Phase of creating long lists
        for word_features in word_features_list:
            features_vector['TupleID'].append(word_features[0])
            features_vector['Tagged'].append(word_features[1])
            features_vector['Tag'].append(word_features[4])
            features_vector['word'].append(word_features[2])
            features_vector['WordID'].append(word_features[3])
            
            # working with features
            for fe in self.feature_extractors:
                for fname in fe.features_list():
                    if fname in word_features[5].keys():
                        #print('FName=', fname, ' Value=', word_features[5][fname])
                        features_vector[fname].append(int(word_features[5][fname]))
                    else:
                        features_vector[fname].append( 0)
                        
            #print(features_vector)
                        
                        
            
        
        df = pd.DataFrame(features_vector)
        return df
    
    
    
        
#DEBUGGING
print(pcontents[0])
fec= FeatureExtractionContainer()
#all_features= fec.process_sentenses(pcontents)
#print(all_features)
#df = fec.features_pandalizer(all_features)
#print(df.head())


(0, True, ['apple', 'watch'], ['Brand', 'Category'])


In [42]:
# do split traint/dev set
# count of rows which
start_time = timeit.default_timer()

fex = FE_W0()
fec= FeatureExtractionContainer()

#print(pcontents[101], fex.extract(0, pcontents[101], False))

#tid = 102
#print(pcontents[tid])

#all_features= fec.process_sentenses(pcontents[101:102], False)

#df = fec.features_pandalizer(all_features)
#print(df.head())

# Arrange trainig and test set



fec= FeatureExtractionContainer()
train_set_feature= fec.process_sentenses(train_set)
dev_set_feature= fec.process_sentenses(dev_set, False)



df_train = fec.features_pandalizer(train_set_feature)

df_dev = fec.features_pandalizer(dev_set_feature)

elapsed = timeit.default_timer() - start_time
print('Elpased time=', elapsed)

Elpased time= 43.66325727599906


In [151]:
# Training set
start_time = timeit.default_timer()

all_labels = ['Brand', 'Category', 'ModelName', 'Price']
cls_per_ner = {}

df_train_result = df_train.loc[:, ['TupleID', 'word','Tag']].copy()
df_train_result = df_train_result.reindex(columns=df_train_result.columns.tolist() + all_labels)

for label in all_labels:
    #df.head()
    df_train_copy = df_train.loc[:].copy()
    df_train_copy['Y'] = df_train_copy.apply(lambda row: (1 if row['Tag'] == label else 0), axis=1)

    df_train_copy.drop(['word', 'WordID', 'Tagged', 'TupleID', 'Tag'], axis=1, inplace=True)
    #df1.head()

    X_train = df_train_copy.drop('Y', axis=1)
    Y_train = df_train_copy['Y']

    svc = SVC(kernel='rbf', gamma=0.7, probability=True)
    clf = svc.fit(X_train, Y_train)
    y_pred = clf.predict(X_train)
    y_pred_porbability = clf.predict_proba(X_train)
    df_train_result[label] = y_pred_porbability[:,1]
    cls_per_ner[label] = clf
    print("Accuracy:",metrics.accuracy_score(Y_train, y_pred), metrics.precision_score(Y_train, y_pred))
    print(confusion_matrix(Y_train, y_pred))

    print(classification_report(Y_train, y_pred))


#param_grid = {'C':[0.6, 0.8,1, 1.2, 2], 'gamma': [3, 2.5,2, 1.8, 1.6]}
#model_search = GridSearchCV(SVC(probability=True), param_grid, verbose=2)

elapsed = timeit.default_timer() - start_time
print('Elpased time=', elapsed)

csv_buffer = StringIO()
df_train_result.to_csv(csv_buffer)
s3.Object(bucket_name, 'train_svm_result.csv').put(Body=csv_buffer.getvalue())


#DEV set

df_dev_result = df_dev.loc[:, ['TupleID', 'word','Tag']].copy()
df_dev_result = df_dev_result.reindex(columns=df_dev_result.columns.tolist() + all_labels)

for label in all_labels:
    clf = cls_per_ner[label]

    df_dev_copy = df_dev.copy()
    df_dev_copy['Y'] = df_dev_copy.apply(lambda row: (1 if row['Tag'] == label else 0), axis=1)

    df_dev_copy.drop(['word', 'WordID', 'Tagged', 'TupleID', 'Tag'], axis=1, inplace=True)
    #df1.head()

    X_dev = df_dev_copy.drop('Y', axis=1)
    Y_dev = df_dev_copy['Y']

    y_pred = clf.predict(X_dev)
    y_pred_porbability = clf.predict_proba(X_dev)
    df_dev_result[label] = y_pred_porbability[:,1]

    print("Accuracy:",metrics.accuracy_score(Y_dev, y_pred), "Detemined brands=", np.sum(y_pred), np.sum(Y_dev))
    print("Accuracy:",metrics.accuracy_score(Y_dev, y_pred), metrics.recall_score(Y_dev, y_pred))

    print(confusion_matrix(Y_dev, y_pred, [0, 1]))

    print(classification_report(Y_dev, y_pred))

csv_buffer = StringIO()
df_dev_result.to_csv(csv_buffer)
s3.Object(bucket_name, 'dev_svm_result.csv').put(Body=csv_buffer.getvalue())


Accuracy: 0.9771061908345546 0.8974245115452931
[[22648   231]
 [  347  2021]]
             precision    recall  f1-score   support

          0       0.98      0.99      0.99     22879
          1       0.90      0.85      0.87      2368

avg / total       0.98      0.98      0.98     25247

Accuracy: 0.9519546876856656 0.887444041782136
[[19871   528]
 [  685  4163]]
             precision    recall  f1-score   support

          0       0.97      0.97      0.97     20399
          1       0.89      0.86      0.87      4848

avg / total       0.95      0.95      0.95     25247

Accuracy: 0.9676001109042659 0.9103568320278503
[[21291   309]
 [  509  3138]]
             precision    recall  f1-score   support

          0       0.98      0.99      0.98     21600
          1       0.91      0.86      0.88      3647

avg / total       0.97      0.97      0.97     25247

Accuracy: 0.9987721313423377 1.0
[[20314     0]
 [   31  4902]]
             precision    recall  f1-score   support

 

{'ResponseMetadata': {'RequestId': '6B9E8906846F0EED',
  'HostId': 'RES5OspEv7IT1VTBAv4Z4S2af3Z1YF0DlRymvJYqkabe9QnQOl6nIF62i9onol1qKH+6pmlIw1s=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'RES5OspEv7IT1VTBAv4Z4S2af3Z1YF0DlRymvJYqkabe9QnQOl6nIF62i9onol1qKH+6pmlIw1s=',
   'x-amz-request-id': '6B9E8906846F0EED',
   'date': 'Mon, 03 Dec 2018 13:55:19 GMT',
   'etag': '"ba8223d8277ace25d6545d12455e8662"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ba8223d8277ace25d6545d12455e8662"'}

In [152]:
#DEV set

df_dev_result = df_dev.loc[:, ['TupleID', 'word','Tag']].copy()
df_dev_result = df_dev_result.reindex(columns=df_dev_result.columns.tolist() + all_labels)

for label in all_labels:
    clf = cls_per_ner[label]

    df_dev_copy = df_dev.copy()
    df_dev_copy['Y'] = df_dev_copy.apply(lambda row: (1 if row['Tag'] == label else 0), axis=1)

    df_dev_copy.drop(['word', 'WordID', 'Tagged', 'TupleID', 'Tag'], axis=1, inplace=True)
    #df1.head()

    X_dev = df_dev_copy.drop('Y', axis=1)
    Y_dev = df_dev_copy['Y']

    y_pred = clf.predict(X_dev)
    y_pred_porbability = clf.predict_proba(X_dev)
    df_dev_result[label] = y_pred_porbability[:,1]

    print('==========================')
    print('Label=', label)
    print('==========================')
    print("Accuracy:",metrics.accuracy_score(Y_dev, y_pred), "Detemined brands=", np.sum(y_pred), np.sum(Y_dev))
    print("Accuracy:",metrics.accuracy_score(Y_dev, y_pred), metrics.recall_score(Y_dev, y_pred))

    print(confusion_matrix(Y_dev, y_pred, [0, 1]))

    print(classification_report(Y_dev, y_pred))

#csv_buffer = StringIO()
#df_dev_result.to_csv(csv_buffer)
#s3.Object(bucket_name, 'dev_svm_result.csv').put(Body=csv_buffer.getvalue())
    
#false_positive_rate, true_positive_rate, thresholds= metrics.roc_curve(Y_dev, y_pred, pos_label=1)
#roc_auc = auc(false_positive_rate, true_positive_rate)

#plt.title('Receiver Operating Characteristic')
#plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
#plt.plot([0,1],[0,1],'r--')
#plt.xlim([-0.1,1.2])
#plt.ylim([-0.1,1.2])
#plt.ylabel('True Positive Rate')
#plt.xlabel('False Positive Rate')
#plt.show()

Label= Brand
Accuracy: 0.9311355311355312 Detemined brands= 110 144
Accuracy: 0.9311355311355312 0.5555555555555556
[[1191   30]
 [  64   80]]
             precision    recall  f1-score   support

          0       0.95      0.98      0.96      1221
          1       0.73      0.56      0.63       144

avg / total       0.93      0.93      0.93      1365

Label= Category
Accuracy: 0.8388278388278388 Detemined brands= 285 249
Accuracy: 0.8388278388278388 0.6305220883534136
[[988 128]
 [ 92 157]]
             precision    recall  f1-score   support

          0       0.91      0.89      0.90      1116
          1       0.55      0.63      0.59       249

avg / total       0.85      0.84      0.84      1365

Label= ModelName
Accuracy: 0.8952380952380953 Detemined brands= 127 176
Accuracy: 0.8952380952380953 0.45454545454545453
[[1142   47]
 [  96   80]]
             precision    recall  f1-score   support

          0       0.92      0.96      0.94      1189
          1       0.63      0.

In [91]:
#df_train_result[label] = y_pred_porbability[:,1]
#len(y_pred_porbability[:,1])
#df_train_copy = df_train[:300].copy()
#len(df_train_copy['word'])
#df_train_result[label] = y_pred_porbability[:,1]
#df_train_copy['Price'] = y_pred_porbability[:,1]
df_train_copy['word']

0      0.007623
1      0.007611
2      0.007606
3      0.007623
4      0.007611
5      0.007606
6      0.007320
7      0.007627
8      0.007623
9      0.007611
10     0.007606
11     0.007320
12     0.007619
13     0.007623
14     0.007611
15     0.007606
16     0.007320
17     0.007628
18     0.007617
19     0.007604
20     0.007625
21     0.007612
22     0.007617
23     0.007604
24     0.007625
25     0.007612
26     0.007622
27     0.007606
28     0.007617
29     0.007604
         ...   
270    0.007620
271    0.007617
272    0.996960
273    0.007625
274    0.007624
275    0.007622
276    0.007603
277    0.007641
278    0.007623
279    0.007641
280    0.007623
281    0.004581
282    0.007614
283    0.007641
284    0.007623
285    0.004581
286    0.007625
287    0.007641
288    0.007623
289    0.004581
290    0.007625
291    0.996969
292    0.007630
293    0.996969
294    0.007630
295    0.004581
296    0.007616
297    0.996969
298    0.007630
299    0.004581
Name: word, Length: 300,

In [127]:
df_train_result['Price'] = y_pred_porbability[:,1]

In [148]:
df_dev_result

Unnamed: 0,TupleID,word,Tag,Brand,Category,ModelName,Price
0,6565,sony,Brand,0.100981,0.023523,0.031562,0.003912
1,6565,xb50bs,ModelName,0.015399,0.185921,0.051500,0.003376
2,6565,extra,,0.005847,0.296129,0.079183,0.005387
3,6565,bass,,0.004986,0.955561,0.010156,0.000840
4,6567,sony,Brand,0.100981,0.023523,0.031562,0.003912
5,6567,xb50bs,ModelName,0.015399,0.185921,0.051500,0.003376
6,6567,extra,,0.005847,0.296129,0.079183,0.005387
7,6567,bass,,0.004986,0.955561,0.010156,0.000840
8,6567,under,,0.005847,0.004300,0.005471,0.002627
9,6567,$858,Price,0.006237,0.005815,0.007337,0.977309
