In [1]:
import pandas as pd
import re 
import string
from tqdm import tqdm


# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap import UMAP

# Text preprocessiong
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

# Remove stopwords
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrewmartinson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/andrewmartinson/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andrewmartinson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Loads the CFPB Complaints Database

df=pd.read_csv('complaints.csv', low_memory=False)
df['Date_received_dt'] = pd.to_datetime(df['Date received'], format='%Y/%m/%d')

In [3]:
#create a smaller dataframe based on dates being used for analysis
df_date = df[(df['Date received'] >= '2015-01-01') & (df['Date received'] <= '2016-12-31')]

In [4]:
#Code written by Yuan Fang prior to project fork
#Identify the 5 banks or financial services companies with the most complaints for comparison
#Don't run if comparing the same banks over a period of time, only run for the first time period
top5=df_date.groupby('Company')['Complaint ID'].count().reset_index(name='count') \
            .sort_values(['count'], ascending=False) \
            .head(5)


#create list of top 5 companies for use in analysis
top_5_list = list(top5['Company'])
df_date = None

In [5]:
#Code written by Yuan Fang prior to project fork
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing text = text.lower()'''
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[x*\/]', '', text) #remove those dates with the format of XX/XX OR XXXX/XX/XX
    text = re.sub('x', '', text)#remove \n
    text = re.sub('\n', '', text)#remove \n
    text = re.sub("[\(\[].*?[\)\]]", "", text) #remove bracketed text that appears like a list
    return text

round1 = lambda x: clean_text(x)

In [6]:
def get_date_df(start_date, end_date):
    df_date = None
    df_date = df[(df['Date received'] >= start_date ) & (df['Date received'] <= end_date)]
    return df_date
   

In [7]:
#Cycle through the bank data for comparison purposes
def extract_topics_UMAP(list_of_companies, df_date, year_label, umap_model):

    for company in tqdm(list_of_companies):
        df_bank = None
        data_clean = None
        topic_model = None
        topics = None
        probabilities = None
        topic_model = None


        df_bank = df_date[df_date['Company'] == company]

        #get rid of rows with null values in the customer narrative
        df_bank = df_bank.dropna(subset=['Consumer complaint narrative'])

        # Text cleaning - lowercase, remove punctuations and remove words with numbers.
        data_clean = pd.DataFrame(df_bank['Consumer complaint narrative'].apply(round1)) 

        # Remove stopwords
        # stopwords removal approach sourced from 
        # https://medium.com/grabngoinfo/topic-modeling-with-deep-learning-using-python-bertopic-cf91f5676504
        data_clean['narrative_without_stopwords'] = data_clean['Consumer complaint narrative'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in stopwords]))# Lemmatization
        data_clean['narrative_lemmatized'] = data_clean['narrative_without_stopwords'].apply(lambda x: ' '.join([wn.lemmatize(w) for w in x.split() if w not in stopwords]))# Take a look at the data

        #Must reset index to otherwise BERTopic will throw errors
        data_clean.reset_index(drop=True, inplace=True)

        # Initiate BERTopic
        #topic_model = BERTopic(language="english", calculate_probabilities=True).fit(data_clean['narrative_without_stopwords'])
        topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True).fit(data_clean['narrative_without_stopwords'])
        
        
        #get the topics from the model
        tm_df = topic_model.get_topic_info()

        #save the topics into a csv file
        filenme = company+'UMAP_topics_'+year_label
        savepath = filenme+'.csv'
        tm_df.to_csv(path_or_buf=savepath)

        #save the model for later use
        topic_model.save(company+'_'+year_label+'_UMAP_dimension_reduction')


        

In [8]:
start_date = '2015-01-01'
end_date = '2016-12-31'
year_label = 'n_neighbors15_UMAP_2015_2016'

# Initiate UMAP for dimensionality reduction
# Experimented with out values for n_neighbors, but results were not as good
umap_model = None
umap_model = UMAP(n_neighbors=15,
                  n_components=5, 
                  min_dist=0.0,
                  metric='cosine', 
                  random_state=42)

df_date = get_date_df(start_date, end_date)


# Get the UMAP BERTopics
extract_topics_UMAP(top_5_list, df_date, year_label, umap_model)

  self._set_arrayXarray(i, j, x)
 20%|████████▊                                   | 1/5 [05:17<21:09, 317.33s/it]


KeyboardInterrupt: 

In [10]:
start_date = '2018-01-01'
end_date = '2019-12-31'
year_label = 'n_neighbors15_UMAP_2018-2019'

df_date = get_date_df(start_date, end_date)

# Get the UMAP BERTopics
extract_topics_UMAP(top_5_list, df_date, year_label, umap_model)

  0%|                                                     | 0/5 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)
 20%|████████▏                                | 1/5 [23:02<1:32:11, 1382.97s/it]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)
 40%|████████████████▍                        | 2/5 [42:14<1:02:20, 1246.78s/it]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)
100%|██████████████████████████████████████████| 5/5 [1:10:00<00:00, 840.18s/it]


In [66]:
#Load the models for further evaluation
loaded_model = BERTopic.load('BANK OF AMERICA, NATIONAL ASSOCIATION_n_neighbors15_UMAP_2018-2019_UMAP_dimension_reduction')

In [67]:
#.get_topic_info() shows the topics the model extracted based on the training set
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2623,-1_xxxx_account_bank_america
1,0,765,0_card_xxxx_charges_claim
2,1,762,1_loan_mortgage_xxxx_home
3,2,227,2_check_hold_funds_deposited
4,3,216,3_account_closed_open_bank
5,4,145,4_fraud_account_bank_xxxx
6,5,99,5_overdraft_fees_fee_charged
7,6,94,6_identity_theft_report_credit
8,7,87,7_closed_credit_card_account
9,8,81,8_reporting_credit_report_creditor


In [68]:
#.get_topic(x) shows the top 10 terms associated with a topic from the topic list
loaded_model.get_topic(2)

[('check', 0.07572798411348867),
 ('hold', 0.05727821800631701),
 ('funds', 0.04453700598985412),
 ('deposited', 0.04268783578897484),
 ('deposit', 0.03684374997001089),
 ('available', 0.02708846098733461),
 ('bank', 0.024463523010972182),
 ('money', 0.022004166844711227),
 ('account', 0.019376350810847424),
 ('atm', 0.019148221485355037)]

In [69]:
#.get_representative_docs(topic=X) will display a sampling of representative text for the topic
loaded_model.get_representative_docs(topic=2)

['deposited XXXX check checking account Bank America bank put business day hold deposit make least XXXX dollars available business days believe violation epedited funds availability act entire check hold total days including weekends good standings bank',
 'deposited check company I’ve worked amount Bank America account option deposit due possible received placed hold seven business money name',
 'reference Epedited Funds Availability Act Friday deposited check Bank America Branch check amount drawn Retirement Bank XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX cleared end check called Bank America Morning asked check showing yet told wont get anything days XXXXXXXX process end asked reference Epedited Funds Availability Act cleared XXXX XXXX told Act called back later speak another Customer Service Representative said Bank America doesnt participate said one Law']