In [10]:
import pandas as pd

import re 
import string

import pandas as pd

from tqdm import tqdm


# Topic model
from bertopic import BERTopic


# Text preprocessing resources (NLTK)
import nltk

# Fetch the corpora used below: the stopword list plus the two WordNet packages.
for pkg in ('stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(pkg)

# WordNet-based lemmatizer
wn = nltk.WordNetLemmatizer()

# English stopword list (loaded here; the filtering itself happens later)
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrewmartinson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/andrewmartinson/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andrewmartinson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the CFPB Complaints Database export.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv('complaints.csv', low_memory=False)

# Parsed copy of the received date for datetime-based work.
# NOTE(review): the format now uses dashes. The rest of the notebook compares
# 'Date received' against strings such as '2015-01-01', which presumes the column
# holds 'YYYY-MM-DD' text -- confirm against the CSV. The previous '%Y/%m/%d'
# only worked on pandas versions that were lenient about the separator;
# strict format parsing rejects it for dash-separated dates.
df['Date_received_dt'] = pd.to_datetime(df['Date received'], format='%Y-%m-%d')

In [3]:
# Keep only the complaints inside the analysis window (both bounds inclusive).
# The comparison is done on the raw 'Date received' strings.
df_date = df.loc[
    ('2015-01-01' <= df['Date received']) & (df['Date received'] <= '2016-12-31')
]

In [4]:
#code written by Yuan Fang prior to project fork
# Rank companies by number of complaints in the current window and keep the top 5.
# Run this only for the first time period: later periods reuse the same company
# list so that the same banks are compared over time.
top5 = (
    df_date
    .groupby('Company')['Complaint ID']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
    .head(5)
)

# Company names only, in ranked order, for the per-company analysis
top_5_list = [company_name for company_name in top5['Company']]

# Drop the reference to the filtered frame; it is rebuilt per time period later
df_date = None

In [5]:
# Display the five companies with the most complaints together with their
# complaint counts (these are the counts used in the final report).
top5

Unnamed: 0,Company,count
1195,"EQUIFAX, INC.",27976
1278,Experian Information Solutions Inc.,24777
3320,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",23152
3613,WELLS FARGO & COMPANY,20482
436,"BANK OF AMERICA, NATIONAL ASSOCIATION",19617


In [6]:
#code written by Yuan Fang prior to project fork
def clean_text(text):
    '''Normalise one complaint narrative for topic modelling.

    Steps, in order:
      1. lower-case the text;
      2. drop text enclosed in square brackets;
      3. drop redaction placeholders, i.e. stand-alone runs of two or more "x"
         (e.g. "XXXX", or the parts of "XX/XX/2015");
      4. drop punctuation;
      5. drop words that contain a digit;
      6. turn newlines / repeated whitespace into single spaces and trim.

    Parameters
    ----------
    text : str
        Raw narrative text.

    Returns
    -------
    str
        The cleaned narrative.
    '''
    # The lower-casing announced by the original docstring had been absorbed into
    # the docstring itself and never ran.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Remove only whole redaction tokens. This runs before punctuation is stripped
    # so that "xx/xx/xxxx" is still split into separate tokens. Deleting every "x"
    # (as before) also damaged ordinary words such as "tax" or "experian".
    text = re.sub(r'\bx{2,}\b', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Replace line breaks with a space (not '') so words on adjacent lines are
    # not glued together, and tidy the gaps left by the removals above.
    text = re.sub(r'\s+', ' ', text).strip()
    # The former trailing "(...)"/"[...]" removal ran after all punctuation had
    # already been stripped, so it could never match; it has been dropped.
    return text

# First cleaning pass, applied element-wise to the narrative column.
round1 = clean_text

In [7]:
def get_date_df(start_date, end_date, complaints=None):
    '''Return the complaints received between two dates (both bounds inclusive).

    The 'Date received' column is compared directly against ``start_date`` and
    ``end_date``; no date parsing happens here.
    NOTE(review): with string dates this is a lexicographic comparison, which
    matches chronological order only for zero-padded 'YYYY-MM-DD' values --
    confirm the column format.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive bounds, in the same format as the 'Date received' column.
    complaints : pandas.DataFrame, optional
        Frame to filter. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``complaints`` whose 'Date received' lies within the bounds.
    '''
    if complaints is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        complaints = df
    received = complaints['Date received']
    return complaints[(received >= start_date) & (received <= end_date)]

In [8]:
#Cycle through the bank data for comparison purposes
def extract_topics(list_of_companies, df_date, year_label):
    '''Fit and save one BERTopic model per company.

    For every company in ``list_of_companies`` the matching rows of ``df_date``
    are selected, rows without a complaint narrative are dropped, each narrative
    is cleaned with ``round1`` and stripped of the words in the notebook-level
    ``stopwords`` list, and an out-of-the-box BERTopic model is fitted on the
    result.

    Two artefacts are written per company, relative to the working directory:
      * ``<company>_topics_<year_label>.csv`` -- the ``get_topic_info()`` table
      * ``<company><year_label>BERT_model``  -- the saved model

    Parameters
    ----------
    list_of_companies : iterable of str
        Values of the 'Company' column to process.
    df_date : pandas.DataFrame
        Complaints to analyse; must contain the 'Company' and
        'Consumer complaint narrative' columns.
    year_label : str
        Label embedded in the output file names.

    Returns
    -------
    None
    '''
    # Build the lookup once: set membership is constant-time, whereas the
    # stopword list would be scanned for every single word.
    stopword_set = set(stopwords)

    for company in tqdm(list_of_companies):
        # Narratives of this company, without the rows that have none
        narratives = df_date.loc[
            df_date['Company'] == company, 'Consumer complaint narrative'
        ].dropna()

        # Nothing to model: report it and move on instead of failing inside
        # BERTopic and losing the remaining companies.
        if narratives.empty:
            tqdm.write('No complaint narratives for ' + company + ' (' + year_label + '); skipped.')
            continue

        # Text cleaning via round1, then stopword removal.
        # stopwords removal approach sourced from
        # https://medium.com/grabngoinfo/topic-modeling-with-deep-learning-using-python-bertopic-cf91f5676504
        # A plain list is handed to BERTopic, so no index reset is needed.
        documents = [
            ' '.join(w for w in narrative.split() if w.lower() not in stopword_set)
            for narrative in narratives.apply(round1)
        ]
        # NOTE(review): the previous version also lemmatized every narrative but
        # never used the result -- the model was, and still is, fitted on the
        # stopword-free text. The unused pass was removed; if lemmatized input
        # was intended, lemmatize ``documents`` here.

        # Initiate BERTopic
        #Straight out of the box model
        topic_model = BERTopic(language="english", calculate_probabilities=True).fit(documents)

        #get the topics from the model
        tm_df = topic_model.get_topic_info()

        #save the topics into a csv file
        savepath = company + '_topics_' + year_label + '.csv'
        tm_df.to_csv(path_or_buf=savepath)

        #save the model for later use
        topic_model.save(company + year_label + 'BERT_model')
    
    

In [11]:
# Analysis window for the first period (inclusive bounds)
start_date, end_date = '2015-01-01', '2016-12-31'
# File-name label built from the two years of the window
year_label = f'{start_date[:4]}-{end_date[:4]}'

# Complaints received inside the window
df_date = get_date_df(start_date, end_date)

# Fit and save one BERTopic model per company for this period
extract_topics(top_5_list, df_date, year_label)



  self._set_arrayXarray(i, j, x)
 20%|████████▊                                   | 1/5 [05:02<20:11, 302.91s/it]


KeyboardInterrupt: 

In [9]:
# Analysis window for the second period (inclusive bounds)
start_date, end_date = '2018-01-01', '2019-12-31'
# File-name label built from the two years of the window
year_label = f'{start_date[:4]}-{end_date[:4]}'

# Complaints received inside the window
df_date = get_date_df(start_date, end_date)

# Fit and save one regular BERTopic model per company for this period
extract_topics(top_5_list, df_date, year_label)


  0%|                                                     | 0/5 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)
 20%|████████▏                                | 1/5 [22:14<1:28:59, 1335.00s/it]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)
 40%|████████████████▍                        | 2/5 [40:52<1:00:21, 1207.31s/it]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)
100%|██████████████████████████████████████████| 5/5 [1:08:06<00:00, 817.24s/it]
