## Prediction pipeline 

In [2]:
# !pip install pandas
# !pip install numpy
# !pip install torch
# !pip install scikit-learn
# !pip install transformers
# !pip install spacy
# ! pip install en_core_web_sm-3.1.0-py3-none-any.whl
# !pip install sentence_transformers

In [3]:
import pandas as pd
import numpy as np
import torch
import re
import pickle
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertForSequenceClassification, BertTokenizer
import spacy
import en_core_web_sm
import string
nlp = spacy.load("en_core_web_sm")
from sklearn.metrics.pairwise import cosine_distances


### text preprocess

In [4]:
def count_intnt_entits(text):
    if str(text).isnumeric():
        return 0,0    
    try:
        doc = nlp(str(text))
        intents = [token.text for token in doc if token.pos_ == 'VERB']
        entities = [token.text for token in doc if token.pos_ in {'NOUN', 'PROPN', 'ADJ', 'NUM', 'ADV'}]
    except:
        print(text)
        raise
    return len(intents), len(entities)

def extract_ner_entities(sentence):
    doc = nlp(str(sentence))
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return entities

def length_entities(list_entities):
    if (list_entities==np.nan or list_entities==None or list_entities==''):
        return 0
    else:
        return len(list_entities)
    
def filter_named_entities(text):
    # Process the text using Spacy
    doc = nlp(str(text))
    # Filter out named entities (ORG, PERSON, and GPE tags)
    filtered_words = [token.text for token in doc if token.ent_type_ not in ['ORG', 'PERSON', 'GPE', "LOC", "FAC"]]
    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)
    return filtered_text

def text_preprocess(col):
    df = pd.DataFrame({ 'text': col })
    df = df.drop_duplicates()
    df['text'] = df['text'].str.replace('\d+', '')
    df[['no_of_intents', 'no_of_entities']] = df.apply(lambda x: pd.Series(count_intnt_entits(x['text'])), axis=1)  

    df['ner_enities'] = ''
    df.loc[df['text']!='', 'ner_enities'] = df.loc[df['text']!='', 'text'].apply(extract_ner_entities)
    df['len_ner_enities'] = df['ner_enities'].apply(length_entities)
    df3 = df[df['len_ner_enities']>0]
    df3['text'] = df3['text'].apply(filter_named_entities)
    df6 = pd.concat([df[df['len_ner_enities']==0], df3], axis = 0)
    df6 = df6.drop(['no_of_intents','no_of_entities','ner_enities','len_ner_enities'], axis=1)

    df6['text'] = df6['text'].str.strip()
    
    return df6['text'].to_list()

def clean_text(text_list):
    # Clean the text
    text_list = text_preprocess(text_list)
    #text_list = [text for text in text_list if text.strip() and not set(text).issubset(set(string.punctuation + string.whitespace))]
    text_list1 = []
    for text in text_list:
        if isinstance(text, str):
            if text.strip() and not set(text).issubset(set(string.punctuation + string.whitespace)):
                text_list1.append(text)
            
    text_list = text_list1
    
    text_list = [x.lower() for x in text_list]
    # Define a translation table to replace punctuation and special characters with empty string
    translator = str.maketrans(string.punctuation + "_", " " * len(string.punctuation + "_"))
    # Loop through each text in the list and clean it
    cleaned_list = []
    for text in text_list:
        # Replace punctuation and special characters with empty string
        cleaned_text = text.translate(translator)
        # Remove any remaining special characters, punctuation, or whitespaces
        cleaned_text = ' '.join(cleaned_text.split())
        cleaned_list.append(cleaned_text)
    
    return cleaned_list

## labelling unseen data for matrix after prediction

In [5]:
import sentence_transformers

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [8]:
df_unseen = pd.read_parquet('web_new.parquet')

In [9]:
df_unseen.head(5)

Unnamed: 0,web_page_detail_event_key,web_page_detail_event_key_columns,client_key,implementation_key,participant_key,platform_id,client_id,implementation_hierarchy_key,person_id,person_internal_id,...,page_process_detail,iva_used_indicator,event_position,action_type,load_timestamp,source_system_code,mobile_app_device_type,page_load_elapsed_time,mobile_app_type,session_create_date_cst_timezone_partition
891191,905A1CB23A6B143336DDD260CB49F2E6_3349#2023-04-...,web_session_id#session_create_timestamp_cst_ti...,13349,334933493jxgemzn5qhzvwzg1avc42clfkdt05vcylcnk1...,participant_key,3349,3349,33493349,person_id,384470017,...,Home,0,,Button Click,2023-04-26 05:57:46,2n,,,Unused,2023-04-25
735128,A6B3302CB400D34A0CE098C1BC34F24A_1215#2023-04-...,web_session_id#session_create_timestamp_cst_ti...,11215,121512153jxgemzn5qhzvwzg1avc42clfkdt05vcylcnk1...,participant_key,1215,1215,12151215,person_id,351441044,...,GMC,0,,PAGEVIEW,2023-04-26 05:57:46,2n,,143.0,Unused,2023-04-25
775969,3B6B469B5451D94D833D5294AEFBF7E0_1122#2023-04-...,web_session_id#session_create_timestamp_cst_ti...,11122,112211223jxgemzn5qhzvwzg1avc42clfkdt05vcylcnk1...,participant_key,1122,1122,11221122,person_id,152200078,...,Home,0,,PAGEVIEW,2023-04-26 05:57:46,2n,,663.0,Unused,2023-04-25
818964,00722DFC4D4500D18D5B379173AA7541.42327_LR_PU_C...,web_session_id#session_create_timestamp_cst_ti...,12346,234623463jxgemzn5qhzvwzg1avc42clfkdt05vcylcnk1...,participant_key,2346,2346,23462346,person_id,344960009,...,Single Sign On Pass,0,,Internal Link Click,2023-04-26 05:57:46,2o,,1.0,Unused,2023-04-25
766529,3F7E6BF225A2C44E74C0D5825F244C7B_16707#2023-04...,web_session_id#session_create_timestamp_cst_ti...,116707,16707167073jxgemzn5qhzvwzg1avc42clfkdt05vcylcn...,participant_key,16707,16707,1670716707,person_id,51100057,...,Mobile App Dashboard,0,,On Screen Actions,2023-04-26 05:57:46,2a,"iOS16.1.1iPhone13,2",,Unused,2023-04-25


In [10]:
df_unseen = df_unseen[['business_page_name']]

In [11]:
df_unseen.head()

Unnamed: 0,business_page_name
891191,homepage
735128,gmc
775969,homepage
818964,SSO
766529,Common: Home Screen


In [14]:
df_unseen.shape

(17345, 1)

In [12]:
# df_search_unseen_cleaned = df_search_unseen.copy()

In [16]:
# df_search_unseen_cleaned = df_search_unseen.copy()
# df_search_unseen_cleaned['cleaned_search_text'] = clean_text(df_search_unseen_cleaned['search_text'].tolist())
import time

# Start the timer
start_time = time.time()

cleaned_text_list = clean_text(df_unseen['business_page_name'].to_list())

elapsed_time = time.time() - start_time
print(elapsed_time)



10.409914255142212


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
df_unseen_cleaned = pd.DataFrame()
df_unseen_cleaned['business_page_name'] = cleaned_text_list

In [20]:
df_unseen_cleaned.head()

Unnamed: 0,business_page_name
0,homepage
1,sso
2,common home screen
3,hmportalenrollerrorpage
4,common shortcut cards screen


In [21]:
words_3 = ['grand parents','elder','elder women',
 'silver generation','elder',
 'retiree','Pensioner','Mature adults','Octogenarians','Nonagenarians','Centenarians',
'elderly people',
 'senior assistance',
 'grey generation',
 'silver generation',
 'senior health',
 'elderly companion',
 'senior citizen',
 'elder support',
 'elderly',
 'senior members',
 'elder population',
 'elderly residents',
 'senior assistance',
 'grey generation',
 'elder statesmen',
 'elderly',
 'elderly people',
 'elderly residents',
 'elder',
 'elder women',
 'senior','senior citizen',
 'elder generation',
 'gerontology',
 'elderly population',
 'senior members',
 'retirees',
 'elderly population',
 'eldercare',
 'geriatric',
 'elder statesmen',
 'retirees',
 'elder population',
'eldercae', 'eldercarr', 'eldermann',
'eldercre','eldery','elderman','elders','eldercrae']
words_4 = list(set([word.lower() for word in words_3]))
len(words_4)

37

In [22]:
# words_3 = ['day care', 'creche', 'childcare','daycare','nurseryschool','after school care',
#            'pre school', 'child', 'baby', 'minor care','minorcare','offspring',
#            'daynursery', 'infantschool','pre K', 'childcarer', 'childcarers','girl child',
#  'infantcare', 'play school', 'playgroup','boy child','Adolescent','Little one','Young one',
#  'nursery', 'nursery school', 'preschool', 'after schoolcare', 'day nursery', 'infant school', 
#  'infant care', 'playschool', 'play group', 'kindergarten', 'childminding', 'babysitting',
#            'babysitter', 
#  'nanny care', 'children supervision', 'toddler care', 'prekindergarten',
#            'Junior care','Sprout care',
# 'child minding', 'baby sitting', 'baby sitter', 'children',
#  'nannycare', 'childrensupervision','child supervision', 'childsupervision', 
#            'toddlercare','foster', 'childs','childrens',
#            'stepchild','step daughter', 'step son', 'grandchildren','grandchild',
#           'stepchildren','childbirth',"children's", 'childhood',
#           'childsupport','childcarereimbursement','mychildren',
#            'mychild','childcareplus','childcarea','stepchildren',
#            'childbirthing','dependentchild',
#           'childcard',
#           'childplus','childbonding']
# words_4 = list(set([word.lower() for word in words_3]))

# len(words_4)


In [27]:
df_unseen_cleaned = df_unseen_cleaned.rename(
    columns = {'business_page_name':'text'})

In [28]:
df_unseen_cleaned.head()

Unnamed: 0,text
0,homepage
1,sso
2,common home screen
3,hmportalenrollerrorpage
4,common shortcut cards screen


In [29]:
mask = (df_unseen_cleaned['text'].str.contains(
    r'\b(' + '|'.join(words_4) + r')\b', case=False, na=False))

df_unseen_cleaned['category'] = ''
df_unseen_cleaned.loc[mask, 'category'] = 'Elder care'
# df_combined_web_iva_search.loc[df_combined_web_iva_search['category'] == '', 'category'] = 'Other'

df_unseen_cleaned.head(5)

  


Unnamed: 0,text,category
0,homepage,
1,sso,
2,common home screen,
3,hmportalenrollerrorpage,
4,common shortcut cards screen,


In [30]:
## get text which are similar to phrases in synonnyms list for texts other than which are filtered above
# 86 threshold for cc model
# def find_similar_sentences(df, sentences, phrases, threshold=0.86, category_name = 'Child care'):
#     # encode the phrases using the model
#     phrase_embeddings = model.encode(phrases, convert_to_tensor=True)
    
#     # initialize an empty list to store the similar sentences
#     similar_sentences = []
    
#     # iterate over the sentences
#     for sentence in sentences:
#         # encode the sentence using the model
#         sentence_embedding = model.encode(sentence, convert_to_tensor=True)
#         # reshape the sentence embedding to a 2D array
#         sentence_embedding = sentence_embedding.reshape(1, -1)
        
#         # calculate the cosine similarity between the sentence embedding and each phrase embedding
#         cosine_scores = 1 - cosine_distances(sentence_embedding, phrase_embeddings)
        
#         # convert the cosine similarity scores to a list
#         scores_list = cosine_scores.tolist()[0]
        
#         # iterate over the phrases and similarity scores and append the sentence to the list if it meets the threshold for at least one phrase
#         for phrase, score in zip(phrases, scores_list):
#             if score >= threshold:
#                 similar_sentences.append(sentence)
#                 break
    
#     # convert the list of similar sentences to a set to remove duplicates
#     similar_sentences = set(similar_sentences)
    
#     # create a new dataframe containing only the rows with text that is in the set of similar sentences
#     similar_df = df[df['text'].isin(similar_sentences)]
#     similar_df['category']=category_name
#     return similar_df


In [31]:
def find_similar_sentences(df, sentences, phrases, threshold=0.90, category_name = 'Elder care'):
    # encode the phrases using the model
    phrase_embeddings = model.encode(phrases, convert_to_tensor=True)
    
    # initialize an empty list to store the similar sentences
    similar_sentences = []
    
    # iterate over the sentences
    for sentence in sentences:
        # encode the sentence using the model
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        # reshape the sentence embedding to a 2D array
        sentence_embedding = sentence_embedding.reshape(1, -1)
        
        # calculate the cosine similarity between the sentence embedding and each phrase embedding
        cosine_scores = 1 - cosine_distances(sentence_embedding, phrase_embeddings)
        
        # convert the cosine similarity scores to a list
        scores_list = cosine_scores.tolist()[0]
        
        # iterate over the phrases and similarity scores and append the sentence to the list if it meets the threshold for at least one phrase
        for phrase, score in zip(phrases, scores_list):
            if score >= threshold:
                similar_sentences.append(sentence)
                break
    
    # convert the list of similar sentences to a set to remove duplicates
    similar_sentences = set(similar_sentences)
    
    # create a new dataframe containing only the rows with text that is in the set of similar sentences
    similar_df = df[df['text'].isin(similar_sentences)]
    similar_df['category']=category_name
    return similar_df


In [32]:

df_unseen_cleaned = df_unseen_cleaned.dropna(subset=['text'])

In [33]:
similar_df_unseen = find_similar_sentences(df_unseen_cleaned, 
                                    df_unseen_cleaned[df_unseen_cleaned['category']=='']['text'].to_list(), 
                                    words_4)

In [34]:
similar_df_unseen

Unnamed: 0,text,category


In [35]:
unseen_EC_df = pd.concat([df_unseen_cleaned[df_unseen_cleaned['category']=='Elder care'], 
                       similar_df_unseen]).sample(frac=1)

In [36]:
unseen_EC_df.shape

(0, 2)

In [37]:
unseen_EC_df.drop_duplicates(inplace=True)

Unnamed: 0,text,category


In [38]:
unseen_EC_df.shape

(0, 2)

In [39]:
# filter out the rows with similar text from the original DataFrame
non_similar_unseen_df = df_unseen_cleaned[~df_unseen_cleaned['text'].isin(unseen_EC_df['text'])]

# sample twice as many rows from the non-similar DataFrame as there are in the similar DataFrame
# non_similar_df = non_similar_df.sample(n=only_EC_df.shape[0]*2)
non_similar_unseen_df['category'] = 'Other'

In [40]:
df_unseen_concatenated = pd.concat([unseen_EC_df, non_similar_unseen_df]).sample(frac=1).reset_index(drop=True)
# df_concatenated = df_concatenated
# df_concatenated.drop(columns=['input', 'search_text', 'page_name'], inplace=True)

In [41]:
pd.set_option('display.max_colwidth', None)

In [42]:
df_unseen_concatenated.shape

(581, 2)

In [44]:
df_unseen_concatenated[(df_unseen_concatenated.text.str.contains('elder')) & (df_unseen_concatenated.category=='Other')]

Unnamed: 0,text,category


In [45]:
pd.reset_option('display.max_colwidth')

In [46]:
df_unseen_concatenated.to_excel('labelled_unseen_data.xlsx')

## prediction on labelled unseen data

In [48]:
import torch.nn as nn
# Load the saved model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.load_state_dict(torch.load("Bert_EC_model/ec_model_10_epoch.pth"))

# Load the input Excel file
df = pd.read_excel('labelled_unseen_data.xlsx')
cleaned_text_list = df['text'].to_list()
category_list = df['category'].to_list()

# Make predictions for each text in the Excel file
predictions = []
for i in range(len(cleaned_text_list)):
    text = cleaned_text_list[i]
    category = category_list[i]
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    attention_mask = torch.tensor([[int(token_id > 0) for token_id in input_ids[0]]])
    logits = model(input_ids, attention_mask).logits
    probs = nn.functional.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probs, dim=-1)
    if predicted_label == 1:
        predictions.append((text, 'Elder care', category, probs[0][1].item()))
    else:
        predictions.append((text, 'Other', category, probs[0][0].item()))

# Save the predictions to a new file
df_pred = pd.DataFrame(predictions, columns=['text', 'prediction', 'category', 'probability'])
df_pred.to_excel('bert_labelled_ec_web_pred_1.xlsx', index=False)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Elder care prediction

In [11]:
# list_of_texts = ["HmCstmElderCarePlusLandingPageOpen", "senior-citizen?care expense reimbursement", 
#                  "Grey generation care home", "elderly*care plus","golden(agers care service required",
#                  "aging care home", "retirees care reimbirsement", "third age population care community home",
#                  "third age community care home","age related care home","care for old","care for older people",
#                  "senior assistance required","Daycare@expense reimbursement","adult daycare required",
#                  "baby care licensed","olders care home required", "oldsters care home","geriatric care home",
#                  "contentPage 2023 Eldercare!!!!!!!!!!!!!****@_Subsidy","Elder statesmen care",
#                  "Elder women care","Silver generation care",
#                  "contentPage {}[]/\|?><,.;:!@#+\t\n\r\f\v 2023 Elder care Subsidy","gerontology care",
#                  "Elderly Care Plus Information", "care for older people","age related care" ]

# len(list_of_texts)

28

### Child care prediction

In [61]:
# list_of_texts_cc = ["!@#$%^&*()_+"," ","_","","","HmCstmChildCarePlusLandingPageOpen", 
#                     "foster care expense reimbursement"
#                     ,"pre school",
#                  "foster care reimbursement","require nursery school for child","want after school care",
#                 "infant care plus","want info about day nursery",
#                  "day care reimbirsement", "kindergarten facility required","looking for infant school",
#                 "Day care reimbursement", "Daycare reimbursement", "Daycare expense reimbursement", 
#                     "baby care licensed",
#                   "@#^&*contentPage 2023 children care", "!!!!!!!!!!!!!****@_looking for creche",
#                  "want childcarer","require info about playgroup","want someone for child minding",
#                  "looking for baby sitter", "{}[]/\|?><,.;:!@#+\t\n\r\f\v women for nanny care", 
#                     "someone require for child supervision",
#                  "want someone for toddler care"]
# len(list_of_texts_cc2)

29