In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
from src.tosclassifier import *

In [None]:
# Instantiate ToS_Classifier (presumably supplied by the star import from
# src.tosclassifier) with the tosdr.org service API path.
# NOTE(review): the string carries no URL scheme — confirm the class adds one.
tos = ToS_Classifier('tosdr.org/api/1/service')

In [None]:
# Ask the classifier object for its DataFrame. Later cells read the columns
# quoteText, point, services, title, case and document from this frame.
df = tos.create_df()

In [None]:
# Keep only the rows that actually carry quote text. Rows are dropna's default
# axis, so it is not spelled out.
df = df.dropna(subset=['quoteText'])

In [None]:
# Binary label derived from the 'point' rating: 'blocker'/'bad' -> 1 and
# 'neutral'/'good' -> 0. Any other rating value passes through unchanged.
rating_to_flag = {'blocker': 1, 'bad': 1, 'neutral': 0, 'good': 0}
df['label'] = df['point'].replace(rating_to_flag)

In [None]:
# Cast every column to str so all values are plain text from here on.
# NOTE(review): this also turns the numeric 'label' column built above into the
# strings '1'/'0', and any missing values left in other columns into the text
# 'nan' — confirm that is intended.
df = df.astype(str)

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) as a sanity check
# after the cleaning steps above.
df.info()

In [None]:
# Distinct values of the 'services' column.
companies = set(df['services'])

In [None]:
# Fit a default TF-IDF vectoriser on the quote text. Later cells reuse only its
# analyzer (the callable returned by build_analyzer) to normalise company names.
tfiddy = TfidfVectorizer()
tfiddy.fit(df['quoteText'])
springcleaning = tfiddy.build_analyzer()

# Run the analyzer on the first remaining quote, selected by POSITION.
# Rows were removed with dropna earlier, so the index label 0 is not guaranteed
# to exist any more and df['quoteText'][0] could raise KeyError.
after = springcleaning(df['quoteText'].iloc[0])

In [None]:
# Display what the analyzer returned for the sample quote.
after

In [None]:
# Split each service name on '.', run the analyzer on the part before the first
# dot, then flatten the per-company token lists into one list.
dirty_companies = list(map(lambda name: name.split('.'), companies))
clean_companies = [springcleaning(parts[0]) for parts in dirty_companies]

cleanest_companies = []
for tokens in clean_companies:
    cleanest_companies.extend(tokens)

In [None]:
# Export the quote text together with its title, without the index column.
df[['quoteText', 'title']].to_csv(
    'data/tos_summary.csv',
    index=False,
)

In [None]:
# Write the quoteText/title pair to data/tos_str.csv, without the index column.
quote_and_title = df[['quoteText', 'title']]
quote_and_title.to_csv('data/tos_str.csv', index=False)

In [None]:
# Persist the full (all-string) frame, without the index column.
df.to_csv('data/tos_df.csv', index=False)  

In [None]:
# Model inputs are the quote texts; the target is the 'case' column.
# NOTE(review): the 'label' column built earlier is not used here — confirm
# that 'case' is the intended target.
X, y = df['quoteText'], df['case']

In [None]:
# Hold out 25% of the rows. The fixed random_state makes the split, and every
# number computed from it further down, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Checkpoint selection. DistilBERT is the active choice:
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# To use full BERT instead, replace the three assignments above with
# ppb.BertModel, ppb.BertTokenizer and 'bert-base-uncased'.

# Instantiate the tokenizer and the model from the chosen pretrained weights.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Encode every training quote to a list of token ids, special tokens included.
# truncation=True makes the 144-token cap explicit: passing max_length alone
# leaves truncation to a version-dependent fallback.
tokenized = X_train.apply(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=144,
        truncation=True,
    )
)

In [None]:
# Right-pad every id list with zeros up to the length of the longest list so
# they stack into one rectangular array. max_len is 0 when there are no lists.
max_len = max((len(ids) for ids in tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [None]:
# Display the zero-padded id array built in the previous cell.
padded

In [None]:
# 1 wherever `padded` holds a non-zero id, 0 wherever it holds the padding
# value 0.
attention_mask = (padded != 0).astype(int)
attention_mask

In [None]:
# Number of rows sent through the model per forward pass. Feeding the whole
# training set as a single batch can exhaust memory.
EMBED_BATCH_SIZE = 32

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Inference only: put the model in eval mode and disable gradient tracking.
model.eval()
hidden_chunks = []
with torch.no_grad():
    for start in range(0, input_ids.shape[0], EMBED_BATCH_SIZE):
        stop = start + EMBED_BATCH_SIZE
        batch_output = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # Element 0 of the model output is the last hidden state of the batch.
        hidden_chunks.append(batch_output[0])

# Kept as a 1-tuple so `last_hidden_states[0]` still yields the hidden-state
# tensor for all rows, exactly as the single-batch call did.
last_hidden_states = (torch.cat(hidden_chunks, dim=0),)

In [None]:
# Element 0 of the model output holds the per-token hidden states. Keep only
# sequence position 0 for every row (presumably the [CLS] token added by
# add_special_tokens=True) and convert it to a NumPy array.
token_embeddings = last_hidden_states[0]
features = token_embeddings[:, 0, :].numpy()

In [None]:
# `features` was computed from X_train, so the matching targets are y_train.
labels = y_train

In [None]:
# Split the embedded training rows again (default test share) to get a
# held-out set for the classifier. The fixed random_state keeps the reported
# score reproducible from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [None]:
# Fit a logistic-regression classifier with default settings on the training
# features.
# NOTE(review): warnings are silenced in the import cell, so a convergence
# warning from this fit would not be shown — TODO confirm the solver converges.
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
# Score the fitted classifier on the held-out features and labels.
lr_clf.score(test_features, test_labels)

In [None]:
# Fetch the company names and raw data from the service endpoint, then
# normalise the names: split on '.', run the analyzer on the part before the
# first dot, and flatten the resulting token lists.
companies, data_list = gather_data('tosdr.org/api/1/service')

dirty_companies = list(map(lambda name: name.split('.'), companies))
clean_companies = [springcleaning(parts[0]) for parts in dirty_companies]

cleanest_companies = []
for tokens in clean_companies:
    cleanest_companies.extend(tokens)

In [None]:
# Second vectoriser: the cleaned company-name tokens are passed as stop words.
# This rebinds tfiddy and springcleaning from the earlier TF-IDF cell.
# NOTE(review): no visible cell creates a 'document' column on df — confirm
# that create_df() provides it.
tfiddy = TfidfVectorizer(stop_words=cleanest_companies)
tfiddy.fit(df['document'])
springcleaning = tfiddy.build_analyzer()

In [25]:
# Execute the project script src/tossummarizer in this kernel via the %run
# magic; its printed statistics and model summary appear as the cell output.
%run src/tossummarizer

Unique encoder tokens: 119
Unique decoder tokens: 46
Max encoder seq length: 8566
Max decoder seq length: 142
Avg encoder seq length: 290.8672131147541
Avg decoder seq length: 72.76639344262296
[<tf.Tensor 'encoder_17/transpose_1:0' shape=(None, None, 256) dtype=float32>, <tf.Tensor 'encoder_17/while:4' shape=(None, 256) dtype=float32>, <tf.Tensor 'encoder_17/while:5' shape=(None, 256) dtype=float32>]
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 119)    0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 46)     0                                            
___________________________________________________________________________________

In [11]:
from tensorflow.keras.layers import Attention

In [12]:
# tf.keras.layers.Attention takes its inputs as [query, value]. For a decoder
# attending over encoder states the decoder output is the query and the encoder
# output is the value, so the decoder tensor goes first.
# NOTE(review): encoder_out / decoder_out are not defined in any visible cell
# (the recorded run raised NameError) — they must be created before this cell.
attn_layer = Attention(name='attention_layer')([decoder_out, encoder_out])

NameError: name 'encoder_out' is not defined