# Installing BERTopic

We start by installing BERTopic from PyPI:

In [None]:
%%capture
!pip install bertopic

In [None]:
# Mount Google Drive so that files under /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!nvidia-smi

In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# Load the labelled PII training set; the first CSV column is used as the row index.
data = pd.read_csv('/content/train_pii_data.csv',index_col=0 )
# The `text` column is the document collection handed to BERTopic below.
docs = data["text"]


In [None]:
from sklearn import preprocessing
# Encode the values of `encryption_class` as integer ids and store them in a new `labels` column.
le = preprocessing.LabelEncoder()
data['labels'] = le.fit_transform(data['encryption_class'])
# Display the frame to check the added column.
data

In [None]:
# Integer class ids; passed as `y` to BERTopic.fit_transform in the supervised section.
targets = data["labels"]
# Distinct class names as returned by Series.unique().
# NOTE(review): unique() keeps first-appearance order while LabelEncoder numbers classes in
# sorted order, so target_names[i] is presumably NOT guaranteed to be the name of label i —
# confirm before indexing this list by label id.
target_names = list(data['encryption_class'].unique())
# Per-document class names, used for the topics-per-class view.
classes = data['encryption_class']

Each document can be put into one of the following categories:

In [None]:
target_names

# **(semi)-Supervised modeling**


## Basic Model
Before we start with semi-supervised modeling, let us first take a look at the output of the basic model.

In [None]:
from umap import UMAP


# Dimensionality reduction: one output dimension per distinct class name.
umap_model = UMAP(
    n_neighbors=30,
    n_components=len(target_names),
    min_dist=0.0,
    metric='cosine',
)

# Baseline run without labels, asking BERTopic to end up with 9 topics.
topic_model = BERTopic(
    umap_model=umap_model,
    language="multilingual",
    verbose=True,
    low_memory=True,
    calculate_probabilities=False,
    nr_topics=9,
)
# Only the per-document topic ids are kept; the second return value is discarded.
topics, _ = topic_model.fit_transform(docs)

In [None]:
topic_model.get_topic_info()

The topics that were created mostly make sense. There are some clearly defined topics such as "nasa, orbit, spacecraft, moon" but also some topics that seem mostly derived from other topics. We can visualize this by extracting the topic representations per class and see if our unsupervised model closely resembles this. 

**NOTE**: You can **hover** over the bars to see the representation per class!!

In [None]:
# Compute the topic representation for each class, then plot it.
topics_per_class = topic_model.topics_per_class(docs, topics, classes=classes)
topic_model.visualize_topics_per_class(topics_per_class)


The results do seem promising. Topics like "nasa, space, etc" seem to be clearly related to sci.space, but some topics were created that span many categories. For example, we expect the topic "bike, bikes, etc"  to only appear in rec.motorcycles.  

##  PII WITH LABELS

## Supervised

Finally, we are going to be using all labels. These labels help BERTopic understand where most clusters can be found. However, this does not mean that it will only find the 20 clusters that we have defined. If there are sub-clusters to be found, then there is a good chance BERTopic will find them! 

In [None]:
# !pip install flair

In [None]:
#topic_model = BERTopic(verbose=True)
from umap import UMAP
from hdbscan import HDBSCAN
# from sentence_transformers import SentenceTransformer

# sentence_model = SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens", device="cuda")

# Explicit dimensionality-reduction and clustering models for the labelled run.
umap_model = UMAP(
    n_neighbors=16,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
)
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# NOTE(review): per the BERTopic documentation, min_topic_size is presumably not used when a
# custom hdbscan_model is supplied, so min_cluster_size=10 above would be the effective
# setting rather than 16 — confirm which value is intended.
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="multilingual",
    verbose=True,
    low_memory=True,
    calculate_probabilities=True,
    min_topic_size=16,
)
# The integer class ids are passed as `y` to guide the fit.
topics, prob = topic_model.fit_transform(docs, y=targets)

In [None]:
# umap_model.plot.output_notebook()

In [None]:
topic_model.get_topic_info()

In [None]:
# Reduce the number of topics to 8; the call's two return values are kept as the updated
# per-document topic ids and probabilities.
# NOTE(review): this positional (docs, topics, probabilities) signature presumably matches
# older BERTopic releases only — pin the installed version or confirm against the current API.
new_topics, new_probs = topic_model.reduce_topics(docs, topics, prob , nr_topics=8)

In [None]:
# Pair every document with its topic id AFTER the reduction step. The pre-reduction
# `topics` list no longer corresponds to the topic ids reported by
# topic_model.get_topic_info(), so the reduced assignments (`new_topics`) are used.
sample_df = pd.DataFrame({'text':docs,'labels':new_topics})

In [None]:
sample_df.head(50)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

# cv = CountVectorizer(ngram_range=(2, 3), stop_words="english")
# topic_model.update_topics(docs, topics, vectorizer_model=cv)


In [None]:
# topic_model.get_topic_info()

In [None]:
topic_model.get_topic_info()

Not only do we see a nice separation of the topics, there are also significantly fewer outliers, which shows that BERTopic has improved in connecting the documents to topics. 

Let's see the results by again visualizing the topic representation per class:

In [None]:
topic_model.visualize_topics()

In [None]:
# topics_per_class = topic_model.topics_per_class(docs, topics, classes=classes)
# topic_model.visualize_topics_per_class(topics_per_class)

# PII IDENTIFICATION

In [None]:
# Print the term list returned by get_topic() for every topic id in the fitted model.
all_topics = topic_model.get_topic_info().Topic
for topic in all_topics:
    print('topic_cluster: {}'.format(topic),topic_model.get_topic(topic=topic))


In [None]:
# Readable name for each topic id (-1 through 7).
topic_dict = {
    -1: 'admin_info',
    0: 'person_info',
    1: 'org_info',
    2: 'contact_info',
    3: 'other_info',
    4: 'sales_info',
    5: 'flags',
    6: 'address',
    7: 'id',
}


In [None]:
# Probe the fitted model with a sample string.
try_text = 'social security  12321-12-1223'
predictions = topic_model.find_topics(try_text)

# Take the first entry of the first returned sequence as the matched topic id
# (presumably the most similar topic — confirm against the find_topics docs).
topic_cluster = predictions[0][0]
#print(topic_dict.get(5))
print(topic_cluster)
# Show the terms of the matched topic.
topic_model.get_topic(topic=topic_cluster)


In [None]:
#topic_dict.get(0)

In [None]:
# Second probe: display the raw find_topics() result for a single word.
try_text = 'name'
topic_model.find_topics(try_text)


In [None]:
topic_model.get_topic(5)

In [None]:
# SAVE and LOAD the model to use

# topic_model.save('pii_bert_topic_finetuned.bin')

## GET OUTPUT RESULTS

In [None]:
import pandas as pd
from bertopic import BERTopic
import torch

# NOTE(review): map_location is assigned here but not used by any active code in this notebook.
map_location=torch.device('cpu')

#model = torch.load(,map_location=map_location)


# Location of the saved fine-tuned topic model under the mounted Google Drive.
model_path = "/content/drive/MyDrive/finetunded_topic_models/pii_model/pii_bert_topic_finetuned.bin"



def load_model(model_path):
    """Load a saved BERTopic model from ``model_path`` and return it.

    NOTE(review): the saved model is presumably a pickle-style serialization;
    only load files that come from a trusted source.
    """
    return BERTopic.load(model_path)

model = load_model(model_path)

  defaults = yaml.load(f)


In [None]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,170,-1_flag_street_code_hpid
1,0,158,0_reseller_company_customer_sales
2,1,78,1_id_db_owner_id_ez_case_id
3,2,71,2_date_2017_delivery_scheduled
4,3,67,3_email_phone_fax_call
5,4,62,4_competitor2_competitor1_reserve10_reserve6
6,5,62,5_prm_modified_contract_modified_by
7,6,50,6_id_ez_db_database_customer
8,7,48,7_job_office_department_prefectures


In [None]:
# Print the term list returned by get_topic() for every topic id in the loaded model.
all_topics = model.get_topic_info().Topic
for topic in all_topics:
    print('topic_cluster: {}'.format(topic),model.get_topic(topic=topic))


topic_cluster: -1 [('flag', 0.07063912520673252), ('street', 0.04358462308510711), ('code', 0.03529247805863117), ('hpid', 0.03216709842992705), ('eclipse', 0.02576309027027677), ('ship', 0.02392832280886343), ('111', 0.02337059940702498), ('lease', 0.02337059940702498), ('千葉市', 0.02337059940702498), ('postal', 0.02337059940702498)]
topic_cluster: 0 [('reseller', 0.05827588527924397), ('company', 0.04208813936834287), ('customer', 0.03480984460160604), ('sales', 0.0324453705365454), ('revenue', 0.03239432912283779), ('owned', 0.03239432912283779), ('isr', 0.0292806933862585), ('osr', 0.028345037982483064), ('discount', 0.027912921411643213), ('teleseller', 0.022598826218519042)]
topic_cluster: 1 [('id', 0.1313311766816959), ('db', 0.0884676665299225), ('owner_id', 0.034380041714163236), ('ez_case_id', 0.02757124979865106), ('ic17355', 0.02757124979865106), ('id2', 0.02757124979865106), ('location_id', 0.02757124979865106), ('zip_cd', 0.02757124979865106), ('extend6', 0.0275712497986510

In [None]:
# for first finetuned model

def get_pii_info(model,text,top_n=False):
    """Classify a text snippet as PII / non-PII with a fitted topic model.

    Parameters
    ----------
    model : object
        Topic model exposing ``find_topics(text)`` (returning a pair of
        parallel sequences: topic ids and their scores) and
        ``get_topic(topic=...)`` (returning ``(term, weight)`` pairs).
    text : str
        The text to classify.
    top_n : bool, default False
        When truthy, also return every candidate topic with its score
        under the ``'all_info'`` key.

    Returns
    -------
    dict
        ``'is_pii'``           -- ``'True'`` or ``'False'`` (string); ``''`` if the
                                  topic id is in neither list.
        ``'label'``            -- readable label of the best topic, or ``None``
                                  if the id has no label.
        ``'confidence_score'`` -- score of the best topic.
        ``'related_info'``     -- ``{term: weight}`` for the best topic.
        ``'all_info'``         -- ``{topic_id: score}`` for all candidates
                                  (only when ``top_n`` is truthy).
    """
    # Readable label per topic id. The earlier literal listed key 6 twice
    # ('other_info', then 'id'); Python keeps the last value, so the effective
    # label for topic 6 was always 'id' and only that entry is kept.
    topic_dict = {-1:'address',0:'product_info',1:'id',2:'date_info',3:'contact_info',4:'order_info',5:'admin_info',6:'id',7:'org_info'}
    # Topic ids grouped by whether they are treated as PII.
    is_pii = {'True':[-1,1,3,5,6],'False':[0,2,4,7]}

    # The model returns two parallel sequences: candidate topic ids and scores.
    predictions = model.find_topics(text)
    candidate_topics, candidate_scores = predictions[0], predictions[1]
    topic_cluster = candidate_topics[0]
    confidence_score = candidate_scores[0]

    # Name of the PII group containing the best topic ('' when it is in neither).
    pii = ''.join(key for key, clusters in is_pii.items() if topic_cluster in clusters)

    result = {
        'is_pii': pii,
        'label': topic_dict.get(topic_cluster),
        'confidence_score': confidence_score,
        'related_info': dict(model.get_topic(topic=topic_cluster)),
    }

    # Optionally expose every candidate. The two sequences must be zipped into
    # (topic_id, score) pairs; calling dict() on the raw pair of lists raises
    # ValueError whenever more or fewer than two candidates are returned.
    if top_n:
        result['all_info'] = dict(zip(candidate_topics, candidate_scores))

    return result



In [None]:
#input_text = ' my IP address is 182.212.12.13'
#input_text = ' HP had sell of 1.4 million dollars in Japan '
#input_text = 'My name is Dipanjan'
#input_text = 'kolkata'
#input_text = ' asadq@kkr.in'
#input_text = 'hp elitebook 5700'
#input_text = 'social security number 12321-12-1223'

#input_text = 'mac id is 123:1230:asc'
# Sample input to classify; swap in one of the commented examples above to try other cases.
input_text = ' xzy is  1231138101'

#model = load_model(model_path)
# Classify the sample without the extra candidate list and display the result dict.
result = get_pii_info(model,input_text,top_n=False)
result

{'confidence_score': 0.38603105995530035,
 'is_pii': 'True',
 'label': 'id',
 'related_info': {'db': 0.0884676665299225,
  'extend2': 0.02757124979865106,
  'extend6': 0.02757124979865106,
  'ez_case_id': 0.02757124979865106,
  'ic17355': 0.02757124979865106,
  'id': 0.1313311766816959,
  'id2': 0.02757124979865106,
  'location_id': 0.02757124979865106,
  'owner_id': 0.034380041714163236,
  'zip_cd': 0.02757124979865106}}

In [None]:
# !cd /content

In [None]:
# def get_pii_info(model,text,top_n=False):
    
#     result = {}
#     # Classifying the lables based on cluster
#     topic_dict = {-1:'address',0:'product_info',1:'id',2:'date_info',3:'contact_info',4:'order_info',5:'admin_info',6:'other_info',6:'id',7:'org_info'}
#     is_pii = {'True':[-1,1,3,5,6],'False':[0,2,4,7]}

#     # Calling the model to get predictions
#     predictions = model.find_topics(text)
#     topic_cluster = predictions[0][0]
#     confidence_score = predictions[1][0]

#     # Mapping the topic cluster to is_pii dict
#     pii = [key for key,value in is_pii.items() if topic_cluster in value]
#     pii = ''.join(map(str, pii))

#     #print(pii)

#     # mapping the topic cluster to the model    
#     result['is_pii'] = pii
#     result['label'] = topic_dict.get(topic_cluster)
#     result['confidence_score'] = confidence_score
#     result['related_info'] = dict(model.get_topic(topic=topic_cluster))
    
#     # to enable other top candidates
#     if top_n == True:
#         result['all_info'] = dict(predictions)

#     return  result



In [None]:
# Scratch text left in a code cell; kept as a comment because it is not valid
# Python and would raise a SyntaxError on "Run All".
# resserc1 123145123