In [1]:
################################################################################### 
# 
# This notebook uses augmented BERT to process doc similarity for a given document
#
###################################################################################

In [2]:
#################################  SELECTIONS   ###########################################
make = "FIAT"                                  # "" for no make selected
###########################################################################################

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np

In [5]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
stopword = nltk.corpus.stopwords.words('english')
stopword = stopword + ['bulletin','Bulletin', 'BULLETIN']
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gianluigi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gianluigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/gianluigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [8]:
# import boto3
# import s3fs

In [9]:
import re
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses
from torch.utils.data import DataLoader

In [11]:
colnames = ["BULNO", "BULREP", "ID", "BULDTE", "COMPNAME", "MAKETXT", "MODELTXT", "YEARTXT", "DATEA", "SUMMARY"]

df0 = pd.read_csv('Datasets/FLAT_TSBS.csv', sep='\t|\r|\n', encoding='iso-8859-1', header=None, names=colnames)

In [12]:
df0['KEY'] = range(0, 0+len(df0)) 

In [13]:
df1 = df0[["KEY","BULNO", "BULREP", "ID", "BULDTE", "COMPNAME", "MAKETXT", "MODELTXT", "YEARTXT", "DATEA", "SUMMARY"]]

In [14]:
df1 = df1.drop("BULREP", axis=1)            # Dropping column with many null values

In [15]:
data_types_dict = {"KEY":str,"BULNO":str, "ID":str, "COMPNAME":str,"BULDTE":str, "COMPNAME":str, 
                   "MAKETXT":str, "MODELTXT":str, "YEARTXT":str, "DATEA":str, "SUMMARY":str}
df1 = df1.astype(data_types_dict) 

In [16]:
print("Total Number of Bulletins in the Dataset:", len(df1))

Total Number of Bulletins in the Dataset: 3192528


In [17]:
df2 = df1.drop_duplicates(subset="SUMMARY")     # Dropping bulletins with identical summary

In [18]:
print("Total Number of Bulletins in the Dataset (no duplicates):", len(df2))

Total Number of Bulletins in the Dataset (no duplicates): 139778


In [19]:
if len(make) > 0:
    df2 =df2[df2["MAKETXT"] == make]
else:
    print("No Make Selected")

In [20]:

print("Make Selected:", make)

print("Total Number of Bulletins in Evaluation:", len(df2))


Make Selected: FORD
Total Number of Bulletins in Evaluation: 9507


In [21]:
df3 = df2.copy()          # keeping a copy of the dataset before preprocessing data

In [22]:
# Pre-process data

In [23]:
Column_Name = 'SUMMARY'
Clean_Column_Name = 'SUMMARY_CLEAN'
df3[Clean_Column_Name] =df3[Column_Name]

In [24]:
def remove_punctuation(text):
    string.punctuation = string.punctuation.replace('-', '')
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

In [25]:
df3[Clean_Column_Name] = df3[Clean_Column_Name].apply(lambda x: remove_punctuation(x))

In [26]:
def remove_numbers(text):
    no_num = [word for word in text if not word.isdigit()]
    words_no_num = ''.join(no_num)
    return words_no_num

In [27]:
df3[Clean_Column_Name] = df3[Clean_Column_Name].apply(lambda x: remove_numbers(x))

In [28]:
def tokenize(text):
    split=re.split("\W+",text) 
    return split

In [29]:
df3[Clean_Column_Name]=df3[Clean_Column_Name].apply(lambda x: tokenize(x.lower()))

In [30]:
def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text

In [31]:
df3[Clean_Column_Name] = df3[Clean_Column_Name].apply(lambda x: remove_stopwords(x))

In [32]:
def lemmatize(text):
    lem = WordNetLemmatizer()
    lemma = [lem.lemmatize(word) for word in text] 
    lemmatized_text = " ".join(lemma)
    return lemmatized_text

In [33]:
df3[Clean_Column_Name] = df3[Clean_Column_Name].apply(lambda x: lemmatize(x))

In [34]:
df = df3.copy()

In [35]:
df3.to_csv('ServiceFiles/data_processed.csv', index=False)

In [36]:
# Setup the environment and import processed data

In [37]:
df = pd.read_csv('ServiceFiles/data_processed.csv')
df['SUMMARY_CLEAN'] = df['SUMMARY_CLEAN'].astype(str)

In [38]:
# Select make (Opt.), base document, and number of documents to analyze

In [39]:
if len(make) > 0:
    df =df[df["MAKETXT"] == make]
else:
    print("No Make Selected")

In [40]:
documents_df = df["SUMMARY_CLEAN"].values

In [41]:
# Train model

In [42]:
print('Train doc vectors')

Train doc vectors


In [43]:
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(documents_df)]

model = Doc2Vec(vector_size=100,epochs=10, alpha=0.025, min_count=1)
  
model.build_vocab(tagged_data)

model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)

In [44]:
document_embeddings=np.zeros((documents_df.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model.docvecs[i]
    
    
pairwise_similarities = cosine_similarity(document_embeddings)


In [45]:
doc2vec = pd.DataFrame(document_embeddings)

In [46]:
doc2vec.to_csv('ServiceFiles/doc2vec.csv', index=False)

In [47]:
print('Done with doc vectors')

Done with doc vectors


In [48]:
# %run bert_embedding.ipynb

In [49]:
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [50]:
df = pd.read_csv('ServiceFiles/data_processed.csv')
doc2vec = pd.read_csv('ServiceFiles/doc2vec.csv')

In [51]:
csm = cosine_similarity(doc2vec)

my_dict = {}
for i in range(len(csm)):
    for j, elem in enumerate(csm[i]):
        if(elem) and (j, i) not in my_dict:
            my_dict[(i,j)] = elem
            

In [52]:
len(my_dict)

45196278

In [53]:
dd = pd.DataFrame.from_dict(my_dict,orient='index')
dd.index = pd.MultiIndex.from_tuples(dd.index)

In [54]:
dd.to_csv('ServiceFiles/doc2vec_csm_values.csv', index=False)

In [55]:
csm_sample = dd   # employ to use less vectors

In [56]:
csm_sample.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 45196278 entries, (0, 0) to (9506, 9506)
Data columns (total 1 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       float64
dtypes: float64(1)
memory usage: 517.9 MB


In [57]:

train_examples = []
for idx, summary in zip(csm_sample, df): 
    
    train_examples.append(InputExample(texts=[str(df['SUMMARY_CLEAN'].to_frame().loc[csm_sample.index.get_level_values(0).values]), str(df['SUMMARY_CLEAN'].to_frame().loc[csm_sample.index.get_level_values(1).values])], label = csm_sample.values))

In [58]:
print('Begin training BERT')


#Define the model. Either from scratch of by loading a pre-trained model
model = SentenceTransformer('nli-bert-large')

#Define your train dataset, the dataloader and the train loss
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
train_loss = losses.BatchAllTripletLoss(model)

#Tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=100)

Begin training BERT


Iteration: 100%|██████████| 1/1 [01:49<00:00, 109.54s/it]
Iteration: 100%|██████████| 1/1 [01:34<00:00, 94.26s/it]
Iteration: 100%|██████████| 1/1 [01:41<00:00, 101.59s/it]
Iteration: 100%|██████████| 1/1 [01:47<00:00, 107.28s/it]
Iteration: 100%|██████████| 1/1 [01:41<00:00, 101.34s/it]
Iteration: 100%|██████████| 1/1 [01:35<00:00, 95.56s/it]
Iteration: 100%|██████████| 1/1 [01:44<00:00, 104.34s/it]
Iteration: 100%|██████████| 1/1 [01:34<00:00, 94.43s/it]
Iteration: 100%|██████████| 1/1 [01:40<00:00, 100.66s/it]
Iteration: 100%|██████████| 1/1 [01:25<00:00, 85.38s/it]
Epoch: 100%|██████████| 10/10 [16:37<00:00, 99.73s/it]


In [59]:
augmented_embeddings = pd.DataFrame(model.encode(df['SUMMARY_CLEAN']))

In [60]:
augmented_embeddings.to_csv('ServiceFiles/bert_tuned_embedding.csv',index=False)

In [61]:
print('End training BERT')

End training BERT


In [62]:
documents_df = df["SUMMARY_CLEAN"].values

In [63]:
# Train model

In [64]:
embeddings = pd.read_csv('ServiceFiles/bert_tuned_embedding.csv')

In [65]:
# Generate similarity matrix

In [66]:
pairwise_similarities=cosine_similarity(embeddings)

In [67]:
pairwise_similarity_matrix = pd.DataFrame(pairwise_similarities, columns=df.iloc[:,0], index=df.iloc[:,0] )

In [68]:
# Show results

In [69]:
dd1 = df
sample_docs = dd1['KEY'].to_numpy()

In [73]:
# When changing reference document (without changing car make) start from here
base_document_bulno = 'SB-18-030-15REVA'                              

In [74]:

base_document_row = df.query("BULNO ==  @base_document_bulno ")

In [75]:
base_document_key = base_document_row.iloc[0,0]

IndexError: single positional indexer is out-of-bounds

In [None]:


report_all =  pd.DataFrame(columns=["Key","Bulletin Number", "Make" , "Model", "Year", "Summary"])

results = pd.DataFrame(pairwise_similarity_matrix.loc[:,base_document_key]
                       .sort_values(axis=0, ascending=False))

results10 = results.iloc[0:11,:]

results10 = results10.reset_index()

bul_list = results10['KEY'].to_numpy()

myList = []
    

for comparison_doc in bul_list:
    Key = df.loc[df["KEY"] == comparison_doc].KEY.to_string(index=False)
    Bulletin_Number = df.loc[df["KEY"] == comparison_doc].BULNO.to_string(index=False)
    Make = df.loc[df["KEY"] == comparison_doc].MAKETXT.to_string(index=False)
    Model = df.loc[df["KEY"] == comparison_doc].MODELTXT.to_string(index=False)
    Year = df.loc[df["KEY"] == comparison_doc].YEARTXT.to_string(index=False)
    Summary = df.loc[df["KEY"] == comparison_doc].SUMMARY.to_string(index=False)
    myList.append([Key, Bulletin_Number,Make,Model, Year, Summary])
    
report = pd.DataFrame(myList, columns=["Key","Bulletin Number", "Make" , "Model", "Year", "Summary"])
    

In [None]:
report

In [None]:
results10

results['Order']= np.arange(len(results))
import matplotlib.pyplot as plt

xpoints = results.iloc[:,1]
ypoints = results.iloc[:,0]

plt.axhline(y=0.8, color='r', linestyle='-')

plt.plot(xpoints, ypoints)
plt.show()