In [1]:
import glob
import pandas as pd
import numpy as np
import json
import string, re
from gensim.models import Word2Vec
import logging
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, Flatten
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, f1_score, recall_score
from sklearn.model_selection import train_test_split


UNK_INDEX = 0
UNK_TOKEN = 'UNK'

def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

def add_new_word(new_word, new_vector, new_index, embedding_matrix, word2id):
    embedding_matrix = np.insert(embedding_matrix, [new_index], [new_vector], axis=0)
    
    word2id = {word: (index+1) if index >= new_index else index for word, index in word2id.items()}
    word2id[new_word] = new_index
    return embedding_matrix, word2id

def get_int_data(token_text, word2id):
    x = []
    unk_count = 0
    for item in token_text:
        temp=[]
        x.append(temp)
        for word in item:
            if word in word2id:
                temp.append(word2id.get(word))
            else:
                temp.append(UNK_INDEX)
                unk_count += 1
    print('Data created. Percentage of unknown words: %.3f' % (unk_count))
    return np.array(x)

annotations=glob.glob('data/OPP-115/annotations/*')

annotations_list=[]
for file in annotations:
    annotations_list.append(pd.read_csv(file, header=None))
    
annot=pd.concat(annotations_list, ignore_index=True)

annot.columns = ['annotation_ID', 'batch_ID', 'annotator_ID', 'policy_ID', 'segment_ID','category_name',
            'attribute_value_pairs','date','policy_URL']

annot['dict'] = annot.attribute_value_pairs.apply(lambda x: json.loads(x))

annot.head()

for x in annot['dict']:
    for key, value in x.items():
        value.pop('endIndexInSegment', None)
        value.pop('startIndexInSegment', None)
        
text=[]
category=[]
subcat=[]
label=[]
counter=0
for i,x in enumerate(annot['dict']):
    for key, value in x.items():
        subcat.append(key)
        if value.get('selectedText')==None:
            text.append('noSelectedText')
        else:
            text.append(value.get('selectedText'))  
        category.append(annot['category_name'][i])
        for k, v in value.items():
            if k=='value':
                label.append(v)

d=list(zip(text, category, subcat, label))

data=pd.DataFrame(d, columns=['text', 'category', 'subcategory', 'label'])

data['cat_sub'] = data['category'] +'-'+ data['subcategory']

data['cat_sub_lab'] = data['category'] +'-'+ data['subcategory'] +'-'+ data['label']

data['text'] = data['text'].apply(lambda x : remove_punct(x.lower()))

data = data[data['text']!='null']
data = data[data['text']!='noselectedtext']
data = data[data['text']!='']
data = data[data['text']!=' ']
data.reset_index(inplace=True, drop=True)
ds = data.drop_duplicates()

ds['tokenized'] = ds['text'].apply(lambda x : re.split(' ', x))

w2v = Word2Vec.load('word2vec.model')

word_vectors = w2v.wv

word2id = {k: v.index for k, v in word_vectors.vocab.items()}

x=get_int_data(ds.tokenized, word2id)

ds['enumerated_text']=x

embedding_matrix = word_vectors.vectors
unk_vector = embedding_matrix.mean(0)
embedding_matrix, word2id = add_new_word(UNK_TOKEN, unk_vector, UNK_INDEX, embedding_matrix, word2id)

max_length = max(ds.enumerated_text.apply(lambda x: len(x)))

padded_docs = pad_sequences(ds.enumerated_text, maxlen=max_length, padding='post', value=(len(embedding_matrix)-1))

Using TensorFlow backend.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Data created. Percentage of unknown words: 7006.000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [162]:
import pandas as pd

# category models
df = pd.read_excel("category_models.xlsx", header=None)
print(len(df))
category_models = { df[0][i] : df[1][i] for i in range(len(df)) }
print(len(category_models))

# subcategory models
df = pd.read_excel("subcategory_models_full.xlsx", header=None)
print(len(df))
subcategory_models = { df[0][i]: (df[1][i], df[2][i], df[3][i]) for i in range(len(df))}
print(subcategory_models)

10
10
36
{'Data Retention-Personal Information Type': ('data_retention_personal_information_type.h5', 'data_retention_personal_information_type_value.h5', 'data_retention_personal_information_type_value_encode.sav'), 'Data Retention-Retention Period': ('data_retention_rentention_period.h5', 'data_retention_rentention_period_value.h5', 'data_retention_rentention_period_value_encode.sav'), 'Data Retention-Retention Purpose': ('data_retention_retention_purpose.h5', 'data_retention_retention_purpose_value.h5', 'data_retention_retention_purpose_value_encode.sav'), 'Data Security-Security Measure': (nan, 'data_security_security_measure_value.h5', 'data_security_security_measure_value_encode.sav'), 'Do Not Track-Do Not Track policy': (nan, 'do_not_track_do_not_track_policy_value.h5', 'do_not_track_do_not_track_policy_value_encode.sav'), 'First Party Collection/Use-Action First-Party': ('first_party_collection_use_action_first_party.h5', 'first_party_collection_collection_mode_value.h5', 'firs

In [107]:
def get_int_input_text(token_text, word2id):
    temp = []
    unk_count = 0
    for word in token_text:
        if word in word2id:
            temp.append(word2id.get(word))
        else:
            temp.append(UNK_INDEX)
            unk_count += 1
    print('Data created. Percentage of unknown words: %.3f' % (unk_count))
    return np.array(temp)

In [108]:
def input_text(word2id, padded_docs, embedding_matrix):
    text = input('Enter text: ')
    text = remove_punct(text.lower())
    token_text=re.split(' ', text)
    print(len(token_text))
    if len(token_text) > padded_docs.shape[1]:
        print(f'Please give a shorter text. Please give {len(token_text)-padded_docs.shape[1]} less words.')
    else:
        enumerate_text = get_int_input_text(token_text, word2id)
        padded_text = pad_sequences([enumerate_text], maxlen=padded_docs.shape[1], padding='post', value=(len(embedding_matrix)-1))
    return padded_text

In [109]:
# def predict(enumerate_text, category_dict_model, subcategory_dict_model, value_dict_model):
#     labels = pd.DataFrame(columns=['category', 'subcategory', 'value'])
#     for key_cat, value_cat in category_dict_model:
#         model_cat = # load model which is in value_cat
#         pred_cat = model_cat.predict_classes(enumerate_text)
#         if pred_cat==1:
#             for key_subcat, value_subcat in subcategory_dict_model:
#                 model_subcat = # load model is in value_subcat
#                 pred_subcat = model_subcat.predict_classes(enumerate_text)
#                 if pred_subcat==1:
#                     model_value = # load model where value_dict_model[key_subcat]
#                     pred_value = model_value.predict_classes(enumerate_text)
#                     # need to check what pred_value is
#                     labels['category'].append(key_cat)
#                     labels['subcategory'].append(key_subcat)
#                     labels['value'].append(pred_value)
#     print(f'This text has: {labels}')

In [110]:
# value models
# df = pd.read_excel("value_models.xlsx", header=None)
# values_models = { df[0][i] : df[1][i] for i in range(len(df)) }

In [111]:
strings = input_text(word2id, padded_docs, embedding_matrix)

Enter text:  this information is collected on our website for first party use


11
Data created. Percentage of unknown words: 0.000


this information is collected on our website for first party use

In [112]:
strings.shape

(1, 289)

In [126]:
strings

array([[  25,    5,   30,   84,   19,   10,   52,   17,  525,   70,   16,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932, 3932,
        3932, 3932, 3932, 3932, 3932, 

In [169]:
#### from keras.models import load_model
import pickle

def predict(enumerate_text, category_dict_model, subcategory_dict_model):
    labels = pd.DataFrame(columns=['category', 'subcategory', 'value'])    
    
    for key_cat, value_cat in category_dict_model.items():
        
        model_cat = load_model(value_cat) # load model which is in value_cat
        pred_cat = model_cat.predict_classes(enumerate_text)
        print(pred_cat)
        
        if pred_cat==1:
            
            for key_subcat, value_subcat in subcategory_dict_model.items():
                print(key_subcat)
                print(value_subcat[0])
                print(next(v for k,v in subcategory_dict_model.items() if key_cat in k))
                
                
#                 model_subcat = load_model(value_subcat[0]) # load model is in value_subcat
#                 pred_subcat = model_subcat.predict_classes(enumerate_text)
                
#                 if pred_subcat==1:
                    
#                     print(list(value_subcat.values()))
                    
#                     model_value = load_model(list(value_subcat.values())[0][1]) # load model is in value_value
#                     pred_value = model_value.predict_classes(enumerate_text)
                    
#                     loaded_encode = pickle.load(open(list(value_subcat.values())[0][2], 'rb')) # filename of encode file
#                     result = loaded_encode.inverse_transform(pred_value[0:1])
                    
#                     # need to check what pred_value is
#                     labels['category'].append(key_cat)
#                     labels['subcategory'].append(key_subcat)
#                     labels['value'].append(result)
                    
    print(f'This text has: {labels}')

In [None]:
predict(strings, category_models, subcategory_models)

[[0]]
[[1]]
Data Retention-Personal Information Type
data_retention_personal_information_type.h5
(nan, 'other_other_type_value.h5', 'other_other_type_value_encode.sav')
Data Retention-Retention Period
data_retention_rentention_period.h5
(nan, 'other_other_type_value.h5', 'other_other_type_value_encode.sav')
Data Retention-Retention Purpose
data_retention_retention_purpose.h5
(nan, 'other_other_type_value.h5', 'other_other_type_value_encode.sav')
Data Security-Security Measure
nan
(nan, 'other_other_type_value.h5', 'other_other_type_value_encode.sav')
Do Not Track-Do Not Track policy
nan
(nan, 'other_other_type_value.h5', 'other_other_type_value_encode.sav')
First Party Collection/Use-Action First-Party
first_party_collection_use_action_first_party.h5
(nan, 'other_other_type_value.h5', 'other_other_type_value_encode.sav')
First Party Collection/Use-Choice Scope
first_party_collection_use_choice_scope.h5
(nan, 'other_other_type_value.h5', 'other_other_type_value_encode.sav')
First Party 