In [2]:
# !pip install pandas-profiling

In [3]:
# !pip install spacy

In [4]:
# !pip install sentence_transformers --index-url=https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple --trusted-host=artifactory.alight.com

In [5]:
# !pip install en_core_web_sm-3.1.0-py3-none-any.whl

In [2]:
import spacy
import en_core_web_sm

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

nlp = spacy.load("en_core_web_sm")

In [3]:
df_combined_web_iva_search = pd.read_csv("s3://adl-core-sagemaker-studio/external/IVA/combined_new_adult-child_outer_new.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
df_combined_web_iva_search[['input','search_text','page_name']].head()

Unnamed: 0,input,search_text,page_name
0,step child a dependent?,,
1,how do i change my dependent daycare deduction,,
2,Child last name change,,
3,i am trying to add my children as beneficiarie...,,
4,how much timeoff do i get after the birth of m...,,


In [5]:
df_iva = pd.read_csv("s3://adl-core-sagemaker-studio/external/Deepali/IVA_cleaned_labelled(session_id_added).csv")

In [7]:
# df_iva.columns

In [8]:
df_iva1 = df_iva.drop(['Unnamed: 0','entry_id','client_id','person_internal_id','next_unit_hit',
             'previous_unit_hit','response_text','session_id'], axis=1)

In [9]:
df_iva1.head()

Unnamed: 0,input_orig,labels,input_cleaned,input_cleaned_dl,unit_name
0,Open enrollment,Enrollment,open enrol,open enrollment,Annual Enrollment Clarifier
1,enroll in hra,HRA,enrol hra,enroll in hra,Health Reimbursement Account Clarifier
2,TIRE DISCOUNT,Discounts Issue,tire discount,tire discount,Discounts Clarifier
3,eligible,HSA related,elig,eligible,Health Savings Account (HSA) Eligible Expenses
4,Need to update my mail address,General Acount issue,need updat mail address,need to update my mail address,Manage Address


In [10]:
df_iva2 = df_iva1.drop(['input_cleaned','input_cleaned_dl'], axis=1)

In [11]:
df_iva2.head()

Unnamed: 0,input_orig,labels,unit_name
0,Open enrollment,Enrollment,Annual Enrollment Clarifier
1,enroll in hra,HRA,Health Reimbursement Account Clarifier
2,TIRE DISCOUNT,Discounts Issue,Discounts Clarifier
3,eligible,HSA related,Health Savings Account (HSA) Eligible Expenses
4,Need to update my mail address,General Acount issue,Manage Address


In [12]:
df_iva3 = pd.read_csv("s3://adl-core-sagemaker-studio/external/Deepali/IVA_cleaned_labelled_merged.csv")

In [13]:
df_iva3.head()

Unnamed: 0,client_id,person_internal_id,session_id,input_orig,response_text,unit_name
0,245,342020022,13134207,does my fsa carry over,This information isn't available yet.,Flexible Spending Account (FSA) Rollover
1,936,12351089,13134208,hi | leave/time off,Hi there. What would you like help with today?...,Hello | Paid Time Off Clarifier
2,936,32861032,13134215,How much does covid pay cover,For possible impacts to your benefits related ...,Natural Disaster
3,5888,379001056,13134216,I want to roll my future builder into an ira |...,It looks like you're asking about your FutureB...,401k/403b/457 Clarifier | I Don't Know
4,1040,147720041,13134224,cancel dental,There are 2 ways to cancel your benefits cover...,Cancel Coverage


In [14]:
df_iva3 = df_iva3.rename(columns={'input_orig':'input_orig_1'})

In [15]:
df_iva3 = df_iva3.drop(['client_id','person_internal_id','session_id','response_text','unit_name'], axis=1)

In [17]:
# df_iva3['input_orig_1'].head(10).to_list()

In [18]:
# Split the 'col' column using '|' delimiter
df_iva3['input_orig_1'] = df_iva3['input_orig_1'].str.split('|')

In [19]:
# Reset the index
df_iva3 = df_iva3.reset_index(drop=True)

In [20]:
df_concat = pd.concat([df_combined_web_iva_search, df_iva2, df_iva3], axis=1)

In [21]:
df_concat2 = df_concat[['input','input_orig_1','input_orig','search_text','page_name','labels']]

In [22]:
# select the columns to stack
cols_to_stack = ['input', 'input_orig_1', 'input_orig']

# stack the columns using melt
stacked = pd.melt(df_concat2, id_vars=['search_text', 'page_name', 'labels'], 
                  value_vars=cols_to_stack, var_name='stacked_cols', value_name='stacked_input')

In [41]:
stacked.head()

Unnamed: 0,search_text,page_name,labels,stacked_cols,stacked_input
0,,,Enrollment,input,step child a dependent?
1,,,HRA,input,how do i change my dependent daycare deduction
2,,,Discounts Issue,input,Child last name change
3,,,HSA related,input,i am trying to add my children as beneficiarie...
4,,,General Acount issue,input,how much timeoff do i get after the birth of m...


In [42]:
# use applymap to check the type of each element in the DataFrame
list_cols = stacked.applymap(lambda x: isinstance(x, list)).any()

# print the list columns
print(list_cols[list_cols == True])

stacked_input    True
dtype: bool


In [48]:
stacked['stacked_input'][8]

'benefits for eligibility for children'

In [44]:
stacked.shape

(6990198, 5)

In [49]:
stacked.dropna(subset=['search_text','page_name','stacked_input'], how='all', inplace=True)

In [52]:
stacked['stacked_input'] = stacked['stacked_input'].apply(str)
stacked.drop_duplicates(inplace=True)


In [53]:
stacked.head()

Unnamed: 0,search_text,page_name,labels,stacked_cols,stacked_input,stacked_input1
0,,,Enrollment,input,step child a dependent?,step child a dependent?
1,,,HRA,input,how do i change my dependent daycare deduction,how do i change my dependent daycare deduction
2,,,Discounts Issue,input,Child last name change,Child last name change
3,,,HSA related,input,i am trying to add my children as beneficiarie...,i am trying to add my children as beneficiarie...
4,,,General Acount issue,input,how much timeoff do i get after the birth of m...,how much timeoff do i get after the birth of m...


In [54]:
stacked = stacked.drop(['stacked_cols','stacked_input1'], axis=1)

In [55]:
stacked.shape

(3544788, 4)

In [56]:
import string
import nltk
nltk.data.path.append('../../nltk_data')
from nltk.corpus import stopwords
# nltk.download('stopwords')


In [57]:
def clean_text(df, text_cols):
    # Create a new dataframe to hold the cleaned text columns
    cleaned_df = pd.DataFrame()
    
    # Define the list of stopwords
    stop_words = set(stopwords.words('english'))
    
    # Clean each text column and add it to the cleaned dataframe
    for text_col in text_cols:
        text_list = df[text_col].tolist()
        text_list = [str(text) for text in text_list]
        text_list = [text if text.strip() and not
                     set(text).issubset(set(string.punctuation + string.whitespace)) else '' 
                     for text in text_list]
        text_list = [x.lower() for x in text_list]
        translator = str.maketrans(string.punctuation + string.digits + "_", " " * len(
            string.punctuation + string.digits + "_"))
        cleaned_list = []
        for text in text_list:
            cleaned_text = text.translate(translator)
            cleaned_text = ' '.join(cleaned_text.split())
            cleaned_text = ' '.join([word for word in cleaned_text.split() if word not in stop_words])
            cleaned_list.append(cleaned_text)
        cleaned_df[text_col] = cleaned_list
    
    # Add the non-text columns to the cleaned dataframe
    for col in df.columns:
        if col not in text_cols:
            cleaned_df[col] = df[col]
    
    return cleaned_df


In [58]:
df_concat3 = clean_text(stacked, text_cols=['stacked_input','search_text','page_name'])
# cleaned_stacked, removed_rows = clean_text(stacked, text_cols=['stacked_input','search_text','page_name'])

In [59]:
df_concat3.head(5)

Unnamed: 0,stacked_input,search_text,page_name,labels
0,step child dependent,,,Enrollment
1,change dependent daycare deduction,,,HRA
2,child last name change,,,Discounts Issue
3,trying add children beneficiaries life ins,,,HSA related
4,much timeoff get birth child,,,General Acount issue


In [60]:
df_concat4 = df_concat3.replace('nan', '')
df_concat4.head()

Unnamed: 0,stacked_input,search_text,page_name,labels
0,step child dependent,,,Enrollment
1,change dependent daycare deduction,,,HRA
2,child last name change,,,Discounts Issue
3,trying add children beneficiaries life ins,,,HSA related
4,much timeoff get birth child,,,General Acount issue


In [61]:
df_concat4['text'] = df_concat4[['stacked_input', 'search_text', 'page_name']].apply(lambda x: ' '.join([str(i) for i in x if not pd.isna(i)]), axis=1)

In [62]:
df_concat4.drop(['stacked_input','search_text','page_name'], inplace=True, axis=1)

In [63]:
df_concat4.head()

Unnamed: 0,labels,text
0,Enrollment,step child dependent
1,HRA,change dependent daycare deduction
2,Discounts Issue,child last name change
3,HSA related,trying add children beneficiaries life ins
4,General Acount issue,much timeoff get birth child


In [64]:
df_concat4.drop_duplicates(subset=['text'], inplace=True)

In [65]:
df_concat4.shape

(2723446, 2)

In [66]:
# Get the value counts of the 'labels' column
label_counts = df_concat4['labels'].value_counts()
print(sum(label_counts>500))
# Filter the dataframe to only include rows where the label count is greater than 10000
df_concat5 = df_concat4[df_concat4['labels'].isin(label_counts[label_counts > 10000].index)]

# Get the shape of the resulting filtered dataframe
df_concat5_shape = df_concat5.shape

120


In [67]:
df_concat5_shape

(844284, 2)

In [68]:
df_concat5['labels'].value_counts()

Other                    233157
Health Benefits          107610
Savings Plan              51320
Login Issue               44497
Dependent issue           40079
Clarifier Issue           39129
General Account issue     35468
Insurance Card            32086
Enrollment                29221
Loan related              23082
HSA related               23059
Pension Issue             22212
Payment Issue             16794
Holiday/Leave Issue       16012
Benefciary Issue          13953
Dental Plan               13088
Coverage Issue            12940
Tax related               12334
IVA help                  12187
Retirement                12042
W-2 Form                  11705
General Acount issue      11335
Claims                    10527
Life Insurance            10415
Direct Deposit            10032
Name: labels, dtype: int64

In [69]:
df_concat4[df_concat4['text']=='nan '].value_counts()

Series([], dtype: int64)

### finding the texts which contain exact phrases from synonyms list

In [3]:
words_3 = ['grand parents','elder','elder women',
 'silver generation','elder',
 'retiree','Pensioner','Mature adults','Octogenarians','Nonagenarians','Centenarians',
'elderly people',
 'senior assistance',
 'grey generation',
 'silver generation',
 'senior health',
 'elderly companion',
 'senior citizen',
 'elder support',
 'elderly',
 'senior members',
 'elder population',
 'elderly residents',
 'senior assistance',
 'grey generation',
 'elder statesmen',
 'elderly',
 'elderly people',
 'elderly residents',
 'elder',
 'elder women',
 'senior','senior citizen',
 'elder generation',
 'gerontology',
 'elderly population',
 'senior members',
 'retirees',
 'elderly population',
 'eldercare',
 'geriatric',
 'elder statesmen',
 'retirees',
 'elder population',
'eldercae', 'eldercarr', 'eldermann',
'eldercre','eldery','elderman','elders','eldercrae']
words_4 = list(set([word.lower() for word in words_3]))
len(words_4)

37

In [72]:
mask = (df_concat4['text'].str.contains(r'\b(' + '|'.join(words_4) + r')\b', case=False, na=False))

df_concat4['category'] = ''
df_concat4.loc[mask, 'category'] = 'Elder care'
# df_combined_web_iva_search.loc[df_combined_web_iva_search['category'] == '', 'category'] = 'Other'

df_concat4.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,labels,text,category
0,Enrollment,step child dependent,
1,HRA,change dependent daycare deduction,
2,Discounts Issue,child last name change,
3,HSA related,trying add children beneficiaries life ins,
4,General Acount issue,much timeoff get birth child,


In [73]:
# df_concat4[df_concat4['category']=='Elder care'].sample(20)

In [74]:
df_concat4[df_concat4['category']=='Elder care'].shape

(45791, 3)

In [4]:
df_concat4.to_parquet('ec_df_concat4.pqt')

NameError: name 'df_concat4' is not defined

In [5]:
df_concat4 = pd.read_parquet('ec_df_concat4.pqt')

In [6]:
df_concat4['category'].value_counts()

              2677655
Elder care      45791
Name: category, dtype: int64

### get text which are similar to phrases in synonnyms list for texts other than which are filtered above

In [7]:
def find_similar_sentences(df, sentences, phrases, threshold=0.90, category_name = 'Elder care'):
    # encode the phrases using the model
    phrase_embeddings = model.encode(phrases, convert_to_tensor=True)
    
    # initialize an empty list to store the similar sentences
    similar_sentences = []
    similar_phrases = []
    
    # iterate over the sentences
    for sentence in sentences:
        # encode the sentence using the model
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        # reshape the sentence embedding to a 2D array
        sentence_embedding = sentence_embedding.reshape(1, -1)
        
        # calculate the cosine similarity between the sentence embedding and each phrase embedding
        cosine_scores = 1 - cosine_distances(sentence_embedding, phrase_embeddings)
        
        # convert the cosine similarity scores to a list
        scores_list = cosine_scores.tolist()[0]
        
        # iterate over the phrases and similarity scores and append the sentence to the list if it meets the threshold for at least one phrase
        for phrase, score in zip(phrases, scores_list):
            if score >= threshold:
                similar_sentences.append(sentence)
                similar_phrases.append(phrase)
                break
    
    # convert the list of similar sentences to a set to remove duplicates
    similar_sentences = set(similar_sentences)
    
    # create a new dataframe containing only the rows with text that is in the set of similar sentences
    similar_df = df[df['text'].isin(similar_sentences)]
    similar_df['synonym_phrase'] = similar_phrases
    similar_df['category']=category_name
    return similar_df


In [8]:
# def find_similar_sentences(df, sentences, phrases, threshold=0.90, category_name = 'Elder care'):
#     # encode the phrases using the model
#     phrase_embeddings = model.encode(phrases, convert_to_tensor=True)
    
#     # initialize an empty list to store the similar sentences
#     similar_sentences = []
    
#     # iterate over the sentences
#     for sentence in sentences:
#         # encode the sentence using the model
#         sentence_embedding = model.encode(sentence, convert_to_tensor=True)
#         # reshape the sentence embedding to a 2D array
#         sentence_embedding = sentence_embedding.reshape(1, -1)
        
#         # calculate the cosine similarity between the sentence embedding and each phrase embedding
#         cosine_scores = 1 - cosine_distances(sentence_embedding, phrase_embeddings)
        
#         # convert the cosine similarity scores to a list
#         scores_list = cosine_scores.tolist()[0]
        
#         # iterate over the phrases and similarity scores and append the sentence to the list if it meets the threshold for at least one phrase
#         for phrase, score in zip(phrases, scores_list):
#             if score >= threshold:
#                 similar_sentences.append(sentence)
#                 break
    
#     # convert the list of similar sentences to a set to remove duplicates
#     similar_sentences = set(similar_sentences)
    
#     # create a new dataframe containing only the rows with text that is in the set of similar sentences
#     similar_df = df[df['text'].isin(similar_sentences)]
#     similar_df['category']=category_name
#     return similar_df


In [9]:
# df_concat4 = df_concat4.dropna(subset=['text'])

In [None]:
import time
start_time = time.time()


similar_df = find_similar_sentences(
    df_concat4,
    df_concat4[df_concat4['category']==''].sample(500000, random_state=123)['text'].to_list(),
    words_4,
    threshold=0.90,
)

elapsed_time = time.time() - start_time
print(elapsed_time)

In [None]:
similar_df

In [None]:
similar_df.to_csv('ec_similar_df_90%_outof_500000_samples.csv')

In [None]:
only_EC_df = pd.concat([df_concat4[df_concat4['category']=='Elder care'], 
                       similar_df]).sample(frac=1)

In [None]:
only_EC_df.shape

(1662, 4)

In [None]:
only_EC_df.drop_duplicates(inplace=True)

In [None]:
only_EC_df.shape

In [None]:
non_similar_df = df_concat4[~df_concat4['text'].isin(only_EC_df['text'])]

non_similar_df['category'] = 'Other'

In [None]:
df_concatenated = pd.concat([only_EC_df, non_similar_df]).sample(frac=1).reset_index(drop=True)

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
df_concatenated.shape

In [None]:
df_concatenated[(df_concatenated.text.str.contains('elder')) & (df_concatenated.category=='Other')]

In [None]:
pd.reset_option('display.max_colwidth')

In [None]:
# df_concatenated[df_concatenated.category=='Elder care'].to_excel('fasttext_only_elder_care_training_data.xlsx')

In [None]:
## number og labels in label col where value of category col is Other
df_concatenated[df_concatenated['category'] == 'Other']['labels'].nunique()

In [None]:
df_concatenated2 = df_concatenated.drop('session_start_cst', axis=1)

In [None]:
df_concatenated2['labels'].isnull().sum()

In [None]:
df_concatenated2['category'].isnull().sum()

In [None]:
df_concatenated2['text'].isnull().sum()

NameError: name 'df_concatenated2' is not defined

In [None]:
df_concatenated2['labels'] = df_concatenated2['labels'].fillna('empty')

In [141]:
## get 5000 rows including all labels where category is Other in same proportion as original 

from sklearn.model_selection import StratifiedKFold

# Define the number of folds to use for cross-validation
n_splits = 5

# Create a StratifiedKFold object to generate the cross-validation folds
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Define an empty DataFrame to store the sampled data
Other_sample_df = pd.DataFrame()

# Split the DataFrame into training and testing sets using cross-validation
for train_index, test_index in skf.split(df_concatenated2[df_concatenated2['category'] == 'Other'], 
                                         df_concatenated2[df_concatenated2['category'] == 'Other']['labels']):
    # Obtain a random sample of 5000 rows from the training set
    train_df = df_concatenated2.iloc[train_index]
    train_df_other = train_df[train_df['category'] == 'Other']
    train_df_other_sample = train_df_other.sample(n=5000//n_splits, random_state=42)
    Other_sample_df = pd.concat([Other_sample_df, train_df_other_sample])
    
# Print the value counts of the label column in the original DataFrame and the sample
print('Original dataset:')
print(df_concatenated2[df_concatenated2['category'] == 'Other']['labels'].value_counts(normalize=True))
print('\nSampled dataset:')
print(Other_sample_df['labels'].value_counts(normalize=True))


Original dataset:
Other                 0.227158
Health Benefits       0.101629
Savings Plan          0.042729
Clarifier Issue       0.036718
Dependent issue       0.036023
                        ...   
RMSA                  0.000009
Rebalancing Issue     0.000009
Financial Advisors    0.000009
Facilities Issue      0.000009
Hysterectomy          0.000009
Name: labels, Length: 166, dtype: float64

Sampled dataset:
Other              0.2336
Health Benefits    0.1056
Savings Plan       0.0374
Dependent issue    0.0366
Clarifier Issue    0.0354
                    ...  
Care Issue         0.0002
Plan               0.0002
Brokerage          0.0002
HR related         0.0002
Deferrals          0.0002
Name: labels, Length: 130, dtype: float64




In [145]:
Other_sample_df.head()

Unnamed: 0,labels,text,category
73147,Health Benefits,need find child care resources birth child,Other
92005,Loan related,coverage ending child,Other
93949,Other,checking child care plus wondering reason denied hmcstmchildcarepluslandingpageopen,Other
18560,Form 1095 Issue,child colleague scholarship,Other
51422,Rollovers Clarifier,child goes state college covered insurance,Other


In [147]:
EC_model_train_df = pd.concat([df_concatenated2[df_concatenated2.category=='Elder care'], 
                              Other_sample_df]).sample(frac=1).reset_index(drop=True)#.to_excel('fasttext_EC_model_training_data.xlsx')

In [149]:
EC_model_train_df.head()

Unnamed: 0,labels,text,category
0,Other,hi life event need add children insurance,Other
1,Other,elder care elderly care contentpage health care fsa day care dcap page,Elder care
2,Savings Plan,hi lisa accidently added two sons dependent children removed redo add fix proceed,Other
3,Loan related,service offered hearst help elderly family number,Elder care
4,Other,much plan cost employee children plan,Other


In [150]:
EC_model_train_df.category.value_counts(normalize=True)

Other         0.752899
Elder care    0.247101
Name: category, dtype: float64

In [151]:
EC_model_train_df.shape

(6641, 3)

### text preprocess

In [152]:
import re
import numpy as np

def count_intnt_entits(text):
    doc = nlp(text)
    intents = [token.text for token in doc if token.pos_ == 'VERB']
    entities = [token.text for token in doc if token.pos_ in {'NOUN', 'PROPN', 'ADJ', 'NUM', 'ADV'}]
    return len(intents), len(entities)

def extract_ner_entities(sentence):
    doc = nlp(sentence)
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return entities

def length_entities(list_entities):
    if (list_entities==np.nan or list_entities==None or list_entities==''):
        return 0
    else:
        return len(list_entities)
    
def filter_named_entities(text):
    # Process the text using Spacy
    doc = nlp(text)
    # Filter out named entities (ORG, PERSON, and GPE tags)
    filtered_words = [token.text for token in doc if token.ent_type_ not in ['ORG', 'PERSON', 'GPE', "LOC", "FAC"]]
    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)
    return filtered_text

In [153]:
import re
    
# list_1 = ['ira','RMD','HRdirect','livechat','what is my hsa','P45','Payslip?',
#     'sps','F80.2','ub','What is YSA','Paystub please','Sh','mfv','C-128','ax','no is hsa','FormL564','HIS','cif','GreT','YSACard',
#     'Heli','RxPCN','403(b)','Hsa yes or no','ypr','Gv','ONA?','What is UHC?','HC-2','uo','what is 4DX?','osh','what is my hsa?',
#     'sPRAVATO','sdr','RMD’s','coverage?How','This is for my hsa','pto?','A&DD','childcareplus','fs','mbi','Is that my lowesbenefit.com',
#     'hra yes','mri?']
# list_2 = [word.lower() for word in list_1]

def text_preprocess(dataframe):
    dataframe = dataframe.drop_duplicates()
    dataframe[['no_of_intents', 'no_of_entities']] = dataframe.apply(lambda x: pd.Series(count_intnt_entits(x['text'])), axis=1)  

    dataframe['ner_enities'] = ''
    dataframe.loc[dataframe['text']!='', 'ner_enities'] = dataframe.loc[dataframe['text']!='', 'text'].apply(extract_ner_entities)
    dataframe['len_ner_enities'] = dataframe['ner_enities'].apply(length_entities)
    dataframe3 = dataframe[dataframe['len_ner_enities']>0]
    dataframe3['text'] = dataframe3['text'].apply(filter_named_entities)
    dataframe6 = pd.concat([dataframe[dataframe['len_ner_enities']==0], dataframe3], axis = 0)
    dataframe6 = dataframe6.drop(['no_of_intents','no_of_entities','ner_enities','len_ner_enities'], axis=1)

    dataframe6['text'] = dataframe6['text'].str.strip()
    
    return dataframe6

In [154]:
df_combined_ec_model_data_2 = text_preprocess(EC_model_train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[['no_of_intents', 'no_of_entities']] = dataframe.apply(lambda x: pd.Series(count_intnt_entits(x['text'])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[['no_of_intents', 'no_of_entities']] = dataframe.apply(lambda x: pd.Series(count_intnt_entits(x['text'])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

In [155]:
df_combined_ec_model_data_2.shape

(6539, 3)

In [156]:
df_combined_ec_model_data_2.columns

Index(['labels', 'text', 'category'], dtype='object')

In [157]:
df_combined_ec_model_data_2['category'].value_counts()

Other         4898
Elder care    1641
Name: category, dtype: int64

In [158]:
df_combined_ec_model_data_2.to_excel('final_ec_model_data_v6.xlsx')

In [175]:
df_unseen = pd.read_excel('unseen_data.xlsx')

In [176]:
df_unseen.head()

Unnamed: 0,input
0,TGT
1,WelcomeUserFollowUp
2,Wellness Program Incentive Credit or Rewards
3,Medical Plan Credit
4,WelcomeUser


In [10]:
import pandas as pd
df_combined_ec_model_data_2 = pd.read_excel('final_ec_model_data_v6.xlsx')

In [11]:
df_combined_ec_model_data_2.isnull().sum()

Unnamed: 0     0
labels         0
text          12
category       0
dtype: int64

In [12]:
df_combined_ec_model_data_2 = df_combined_ec_model_data_2.dropna(subset=['text'])

In [18]:
data = df_combined_ec_model_data_2[['text', 'category']].rename(columns={'category':'label'})
dataset = list(data.itertuples(index=False, name=None))

dataset[:1]

[('hi life event need add children insurance', 'Other')]

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.nn.utils.rnn import pad_sequence

class TextClassifier(nn.Module):
    def __init__(self, num_labels):
        super(TextClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
tokenized_dataset = []
for text, label in dataset:
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    attention_mask = [1] * len(input_ids)
    tokenized_dataset.append((input_ids, attention_mask, label))

# Convert the tokenized dataset into PyTorch tensors
input_ids = pad_sequence([torch.tensor(x[0]) for x in tokenized_dataset], batch_first=True)
attention_mask = pad_sequence([torch.tensor(x[1]) for x in tokenized_dataset], batch_first=True)
labels = torch.tensor([1 if x[2] == "Elder care" else 0 for x in tokenized_dataset])

# Define the training parameters
batch_size = 2
num_epochs = 10
learning_rate = 1e-5

# Create the DataLoader
dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the model, optimizer, and loss function
model = TextClassifier(num_labels=2)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Train the model
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
    print("Epoch {}/{} complete. Loss: {}".format(epoch+1, num_epochs, loss.item()))

In [20]:
# # Save the model
# torch.save(model.state_dict(), "Bert_EC_model/ec_model_10_epoch.pth")

In [20]:
# # Use the trained model to make predictions
# input_text = "My grandmother needs elder care services."
# input_ids = torch.tensor([tokenizer.encode(input_text, add_special_tokens=True)])
# attention_mask = torch.tensor([[int(token_id > 0) for token_id in input_ids[0]]])
# logits = model(input_ids, attention_mask)
# probs = nn.functional.softmax(logits, dim=-1)
# predicted_label = torch.argmax(probs, dim=-1)

# # Print the predicted label
# if predicted_label == 1:
#     print("Elder care")
# else:
#     print("Other")

Elder care


In [33]:
def count_intnt_entits(text):
    if str(text).isnumeric():
        return 0,0    
    try:
        doc = nlp(str(text))
        intents = [token.text for token in doc if token.pos_ == 'VERB']
        entities = [token.text for token in doc if token.pos_ in {'NOUN', 'PROPN', 'ADJ', 'NUM', 'ADV'}]
    except:
        print(text)
        raise
    return len(intents), len(entities)

def extract_ner_entities(sentence):
    doc = nlp(str(sentence))
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return entities

def length_entities(list_entities):
    if (list_entities==np.nan or list_entities==None or list_entities==''):
        return 0
    else:
        return len(list_entities)
    
def filter_named_entities(text):
    # Process the text using Spacy
    doc = nlp(str(text))
    # Filter out named entities (ORG, PERSON, and GPE tags)
    filtered_words = [token.text for token in doc if token.ent_type_ not in ['ORG', 'PERSON', 'GPE', "LOC", "FAC"]]
    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)
    return filtered_text

def text_preprocess(col):
    df = pd.DataFrame({ 'text': col })
    df = df.drop_duplicates()
    df['text'] = df['text'].str.replace('\d+', '')
    df[['no_of_intents', 'no_of_entities']] = df.apply(lambda x: pd.Series(count_intnt_entits(x['text'])), axis=1)  

    df['ner_enities'] = ''
    df.loc[df['text']!='', 'ner_enities'] = df.loc[df['text']!='', 'text'].apply(extract_ner_entities)
    df['len_ner_enities'] = df['ner_enities'].apply(length_entities)
    df3 = df[df['len_ner_enities']>0]
    df3['text'] = df3['text'].apply(filter_named_entities)
    df6 = pd.concat([df[df['len_ner_enities']==0], df3], axis = 0)
    df6 = df6.drop(['no_of_intents','no_of_entities','ner_enities','len_ner_enities'], axis=1)

    df6['text'] = df6['text'].str.strip()
    
    return df6['text'].to_list()

def clean_text(text_list):
    # Clean the text
    text_list = text_preprocess(text_list)
    #text_list = [text for text in text_list if text.strip() and not set(text).issubset(set(string.punctuation + string.whitespace))]
    text_list1 = []
    for text in text_list:
        if isinstance(text, str):
            if text.strip() and not set(text).issubset(set(string.punctuation + string.whitespace)):
                text_list1.append(text)
            
    text_list = text_list1
    
    text_list = [x.lower() for x in text_list]
    # Define a translation table to replace punctuation and special characters with empty string
    translator = str.maketrans(string.punctuation + "_", " " * len(string.punctuation + "_"))
    # Loop through each text in the list and clean it
    cleaned_list = []
    for text in text_list:
        # Replace punctuation and special characters with empty string
        cleaned_text = text.translate(translator)
        # Remove any remaining special characters, punctuation, or whitespaces
        cleaned_text = ' '.join(cleaned_text.split())
        cleaned_list.append(cleaned_text)
    
    return cleaned_list

In [78]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Load the saved model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.load_state_dict(torch.load("Bert_EC_model/ec_model_10_epoch.pth"))

# Load the input Excel file
df = pd.read_excel('unseen_data.xlsx')
cleaned_text_list = clean_text(df['text'].to_list())
# Make predictions for each text in the Excel file
predictions = []
for text in cleaned_text_list:
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    attention_mask = torch.tensor([[int(token_id > 0) for token_id in input_ids[0]]])
    logits = model(input_ids, attention_mask).logits
    probs = nn.functional.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probs, dim=-1)
    if predicted_label == 1:
        predictions.append((text, 'Elder care', probs[0][1].item()))
    else:
        predictions.append((text, 'Other', probs[0][0].item()))


# Save the predictions to a new file
df_pred = pd.DataFrame(predictions, columns=['text', 'prediction', 'probability'])
df_pred.to_excel('bert_ec_pred.xlsx', index=False)

In [81]:
list_of_texts = ["senior-citizen?care expense reimbursement", 
                 "elderly*care plus",
                 "aging care home", "retirees care reimbirsement", 
                 "senior assistance required","Daycare@expense reimbursement",
                 "baby care licensed", "oldsters care home","geriatric care home",
                 "contentPage 2023 Eldercare!!!!!!!!!!!!!****@_Subsidy","Elder statesmen care",
                 "Elder women care","Silver generation care",
                 "contentPage {}[]/\|?><,.;:!@#+\t\n\r\f\v 2023 Elder care Subsidy","gerontology care",
                 "Elderly Care Plus Information"]

import torch.nn.functional as F

cleaned_text_list = clean_text(list_of_texts)

predictions = []
for text in cleaned_text_list:
    # Skip empty input strings
    if not text:
        continue
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    attention_mask = torch.tensor([[int(token_id > 0) for token_id in input_ids[0]]])
    token_type_ids = torch.tensor([[0] * len(input_ids[0])]) # all tokens belong to the same segment in our case
    logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).logits

    probs = F.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probs, dim=-1)
    if predicted_label == 1:
        predictions.append((text, 'Elder care', probs[0][1].item()))
    else:
        predictions.append((text, 'Other', probs[0][0].item()))

df_pred = pd.DataFrame(predictions, columns=['text', 'prediction', 'probability'])
df_pred


  df['text'] = df['text'].str.replace('\d+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['text'] = df3['text'].apply(filter_named_entities)


Unnamed: 0,text,prediction,probability
0,senior citizen care expense reimbursement,Elder care,0.999973
1,elderly care plus,Elder care,0.999985
2,aging care home,Elder care,0.999983
3,retirees care reimbirsement,Elder care,0.999841
4,senior assistance required,Elder care,0.999977
5,daycare expense reimbursement,Other,0.999982
6,baby care licensed,Other,0.999976
7,oldsters care home,Other,0.991685
8,contentpage eldercare subsidy,Elder care,0.999982
9,elder statesmen care,Elder care,0.999982


In [68]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,category
0,0,i need a phone number for alight,Other
1,1,could pension plan selection be changed from j...,Other
2,2,where do i add my bank deposit info,Other
3,3,virgin pluse,Other
4,4,why att sent me enroll sheet,Other


In [70]:
# Load the saved model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.load_state_dict(torch.load("Bert_EC_model/ec_model_10_epoch.pth"))

# Load the input Excel file
df = pd.read_excel('labelled_unseen_data.xlsx')
cleaned_text_list = df['text'].to_list()
category_list = df['category'].to_list()

# Make predictions for each text in the Excel file
predictions = []
for i in range(len(cleaned_text_list)):
    text = cleaned_text_list[i]
    category = category_list[i]
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    attention_mask = torch.tensor([[int(token_id > 0) for token_id in input_ids[0]]])
    logits = model(input_ids, attention_mask).logits
    probs = nn.functional.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probs, dim=-1)
    if predicted_label == 1:
        predictions.append((text, 'Elder care', category, probs[0][1].item()))
    else:
        predictions.append((text, 'Other', category, probs[0][0].item()))

# Save the predictions to a new file
df_pred = pd.DataFrame(predictions, columns=['text', 'prediction', 'category', 'probability'])
df_pred.to_excel('bert_labelled_ec_pred.xlsx', index=False)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

AttributeError: 'SequenceClassifierOutput' object has no attribute 'softmax'

### convert saved model .bin and config.json to .pkl compatible files

In [None]:
# convert saved model .bin and config.json to .pkl compatible files
from transformers import AutoConfig, AutoModelForSequenceClassification
import torch
import pickle

# Load the model configuration from the config.json file
config = AutoConfig.from_pretrained('EC_model_outer_combined_texts_data_v6', num_labels=2)

# Load the model from the binary file using the configuration
model = AutoModelForSequenceClassification.from_pretrained('EC_model_outer_combined_texts_data_v6', config=config)

# Save the model and configuration as a pickle file
with open('EC_model_outer_combined_texts_data_v6/EC_model.pkl', 'wb') as f:
    pickle.dump((config, model.state_dict()), f)

## tagging the unseen_data for analysisng the predictions

### finding the texts which contain exact phrases from synonyms list

In [41]:
words_3 = ['older parent','older people','grand parents','elder','old parents','elder women',
 'silver generation','aged people', 'older women','older men','old age home','elder',
 'aged',
 'elderly people',
 'senior assistance',
 'aging-in-place',
 'aged population',
 'golden agers',
 'aging in place',
 'grey generation',
 'silver generation',
 'senior health',
 'aged population',
 'elderly companion',
 'golden agers',
 'senior citizen',
 'elder support',
 'elderly',
 'senior members',
 'elder population',
 'elderly residents',
 'senior assistance',
 'oldsters',
 'grey generation',
 'aging population',
 'elder statesmen',
 'elderly',
 'elderly people',
 'aging',
 'elderly residents',
 'elder',
 'elder women',
 'senior',
 'elder generation',
 'gerontology',
 'elderly population',
 'senior members',
 'retirees',
 'elderly population',
 'eldercare',
 'geriatric',
 'elder statesmen',
 'age related',
 'retirees',
 'third age population',
 'aging population',
 'elder population',
 'oldsters',
 'third age population','eldercae', 'eldercarr', 'eldermann', 
'aged home','eldercre','eldery','elderman','elders','eldercrae',]
words_4 = list(set([word.lower() for word in words_3]))
len(words_4)

49

In [42]:
df_cleaned = pd.DataFrame()
df_cleaned['text'] =cleaned_text_list
df_cleaned.head(5)

Unnamed: 0,text
0,welcomeuserfollowup
1,medical plan credit
2,welcomeuser
3,general purpose loans
4,chiropractor visits


In [43]:
mask = (df_cleaned['text'].str.contains(r'\b(' + '|'.join(words_4) + r')\b', case=False, na=False))

df_cleaned['category'] = ''
df_cleaned.loc[mask, 'category'] = 'Elder care'
# df_combined_web_iva_search.loc[df_combined_web_iva_search['category'] == '', 'category'] = 'Other'

df_cleaned.head(5)

  mask = (df_cleaned['text'].str.contains(r'\b(' + '|'.join(words_4) + r')\b', case=False, na=False))


Unnamed: 0,text,category
0,welcomeuserfollowup,
1,medical plan credit,
2,welcomeuser,
3,general purpose loans,
4,chiropractor visits,


In [45]:
df_cleaned[df_cleaned['category']=='Elder care'].sample(1)

Unnamed: 0,text,category
5666,ongoing elderly care,Elder care


In [46]:
df_cleaned[df_cleaned['category']=='Elder care'].shape

(13, 2)

### get text which are similar to phrases in synonnyms list for texts other than which are filtered above

In [50]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [51]:
def find_similar_sentences(df, sentences, phrases, threshold=0.95, category_name = 'Elder care'):
    # encode the phrases using the model
    phrase_embeddings = model.encode(phrases, convert_to_tensor=True)
    
    # initialize an empty list to store the similar sentences
    similar_sentences = []
    
    # iterate over the sentences
    for sentence in sentences:
        # encode the sentence using the model
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        # reshape the sentence embedding to a 2D array
        sentence_embedding = sentence_embedding.reshape(1, -1)
        
        # calculate the cosine similarity between the sentence embedding and each phrase embedding
        cosine_scores = 1 - cosine_distances(sentence_embedding, phrase_embeddings)
        
        # convert the cosine similarity scores to a list
        scores_list = cosine_scores.tolist()[0]
        
        # iterate over the phrases and similarity scores and append the sentence to the list if it meets the threshold for at least one phrase
        for phrase, score in zip(phrases, scores_list):
            if score >= threshold:
                similar_sentences.append(sentence)
                break
    
    # convert the list of similar sentences to a set to remove duplicates
    similar_sentences = set(similar_sentences)
    
    # create a new dataframe containing only the rows with text that is in the set of similar sentences
    similar_df = df[df['text'].isin(similar_sentences)]
    similar_df['category']=category_name
    return similar_df

In [52]:
similar_df_unseen = find_similar_sentences(df_cleaned, 
                                    df_cleaned[df_cleaned['category']=='']['text'].to_list(), 
                                    words_4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_df['category']=category_name


In [53]:
similar_df_unseen

Unnamed: 0,text,category
7095,retiremen,Elder care


In [54]:
unseen_EC_df = pd.concat([df_cleaned[df_cleaned['category']=='Elder care'], 
                       similar_df_unseen]).sample(frac=1)

In [55]:
unseen_EC_df.shape

(14, 2)

In [56]:
unseen_EC_df.drop_duplicates(inplace=True)

In [57]:
unseen_EC_df.shape

(14, 2)

In [58]:
# filter out the rows with similar text from the original DataFrame
non_similar_unseen_df = df_cleaned[~df_cleaned['text'].isin(unseen_EC_df['text'])]

# sample twice as many rows from the non-similar DataFrame as there are in the similar DataFrame
# non_similar_df = non_similar_df.sample(n=only_EC_df.shape[0]*2)
non_similar_unseen_df['category'] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_similar_unseen_df['category'] = 'Other'


In [59]:
df_unseen_concatenated = pd.concat([unseen_EC_df, non_similar_unseen_df]).sample(frac=1).reset_index(drop=True)
# df_concatenated = df_concatenated
# df_concatenated.drop(columns=['input', 'search_text', 'page_name'], inplace=True)

In [60]:
pd.set_option('display.max_colwidth', None)

In [61]:
df_unseen_concatenated.shape

(12584, 2)

In [62]:
df_unseen_concatenated[(df_unseen_concatenated.text.str.contains('elder')) & (df_unseen_concatenated.category=='Other')]

Unnamed: 0,text,category


In [63]:
pd.reset_option('display.max_colwidth')

In [64]:
df_unseen_concatenated.to_excel('labelled_unseen_data.xlsx')