In [16]:
import re
import warnings
from tqdm import tqdm
import spacy
import pandas as pd
from Analysing_and_Cleaning import df


In [17]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words

warnings.filterwarnings('ignore')

# Enable the progress bar for pandas
tqdm.pandas()

# change the display properties of pandas to max
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Prepare the text for classification

In [19]:
# Load English language model in spaCy
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    text = text.lower()  # Convert to lower case
    text = re.sub(r'^\[[\w\s]\]+$', ' ', text)  # Remove text in square brackets
    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
    text = text.replace('\n', '')  # Remove newline characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # Remove special characters and numbers
    text = re.sub(r'(.)\1+|\b\w\b', r'\1', text)  # Remove repeated characters and single letters
    
    # Tokenize and filter by POS tags using spaCy
    doc = nlp(text)
    filtered_tokens = [token for token in doc if token.pos_ in {'PRON', 'NOUN', 'VERB', 'ADJ'}]
    
    # Lemmatize the text using spaCy
    lemmatized_text = ' '.join([token.lemma_ for token in filtered_tokens])
    
    return lemmatized_text

# Preprocess text columns
df['complaint_what_happened_lemmatized'] = df['complaint_what_happened'].apply(lambda x: preprocess_text(x))
# Remove the 'x' 'xx' 'xxx' and 'xxxx' from the text columns
df['complaint_what_happened_lemmatized'] = df['complaint_what_happened_lemmatized'].str.replace(r'x{1,4}', '', regex=True)

# Function to remove stopwords
def remove_stopwords(text):
    # Load spaCy stopwords
    stopwords = spacy.lang.en.stop_words.STOP_WORDS # type: ignore
    # Tokenize the text using spaCy
    doc = nlp(text)
    # Filter out tokens that are not stopwords
    filtered_text = ' '.join([token.text for token in doc if token.text.lower() not in stopwords])
    return filtered_text

# Apply remove_stopwords function to create a new column with text without stopwords
df['complaint_what_happened_without_stopwords'] = df['complaint_what_happened_lemmatized'].apply(remove_stopwords)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20930 entries, 1 to 78312
Data columns (total 6 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   product                                    20930 non-null  object
 1   complaint_what_happened                    20930 non-null  object
 2   category                                   20930 non-null  object
 3   category_encoded                           20930 non-null  int64 
 4   complaint_what_happened_lemmatized         20930 non-null  object
 5   complaint_what_happened_without_stopwords  20930 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.1+ MB


In [20]:
df.head()

Unnamed: 0,product,complaint_what_happened,category,category_encoded,complaint_what_happened_lemmatized,complaint_what_happened_without_stopwords
1,Debt collection,Good morning my name is XXXX XXXX and I appreciate it if you could help me put a stop to Chase Bank cardmember services. \nIn 2018 I wrote to Chase asking for debt verification and what they sent me a statement which is not acceptable. I am asking the bank to validate the debt. Instead I been receiving mail every month from them attempting to collect a debt. \nI have a right to know this information as a consumer. \n\nChase account # XXXX XXXX XXXX XXXX Thanks in advance for your help.,Credit Reporting and Debt Collection,0,morning my name apreciate it you help I put stop chase bank cardmember service write chase ask debt verification what they send I statement which aceptable ask bank validate debt receive mail month they atempte colect debt have right know information consumer chase thank advance your help,morning apreciate help stop chase bank cardmember service write chase ask debt verification send statement aceptable ask bank validate debt receive mail month atempte colect debt right know information consumer chase thank advance help
2,Credit card or prepaid card,I upgraded my XXXX XXXX card in XX/XX/2018 and was told by the agent who did the upgrade my anniversary date would not change. It turned the agent was giving me the wrong information in order to upgrade the account. XXXX changed my anniversary date from XX/XX/XXXX to XX/XX/XXXX without my consent! XXXX has the recording of the agent who was misled me.,Credit Cards and Prepaid Cards,1,upgrade my card tell agent who upgrade my aniversary date change it turn agent give I wrong information order upgrade acount change my aniversary date my consent have recording agent who mislead I,upgrade card tell agent upgrade aniversary date change turn agent wrong information order upgrade acount change aniversary date consent recording agent mislead
10,"Credit reporting, credit repair services, or other personal consumer reports","Chase Card was reported on XX/XX/2019. However, fraudulent application have been submitted my identity without my consent to fraudulently obtain services. Do not extend credit without verifying the identity of the applicant.",Credit Reporting and Debt Collection,0,card report fraudulent aplication submit my identity my consent obtain service etend credit verify identity aplicant,card report fraudulent aplication submit identity consent obtain service etend credit verify identity aplicant
11,"Credit reporting, credit repair services, or other personal consumer reports","On XX/XX/2018, while trying to book a XXXX XXXX ticket, I came across an offer for {$300.00} to be applied towards the ticket if I applied for a rewards card. I put in my information for the offer and within less than a minute, was notified via the screen that a decision could not be made. I immediately contacted XXXX and was referred to Chase Bank. I then immediately contacted Chase bank within no more than 10minutes of getting the notification on the screen and I was told by the Chase representative I spoke with that my application was denied but she could not state why. I asked for more information about the XXXX offer and she explained that even if I had been approved, the credit offer only gets applied after the first account statement and could not be used to purchase the ticket. I then explicitly told her I was glad I got denied and I was ABSOLUTELY no longer interested in the account. I asked that the application be withdrawn and the representative obliged. This all happened no later than 10mins after putting in the application on XX/XX/2018. Notwithstanding my explicit request not to proceed with the application and contrary to what I was told by the Chase representative, Chase did in fact go ahead to open a credit account in my name on XX/XX/2018. This is now being reported in my Credit Report and Chase has refused to correct this information on my credit report even though they went ahead to process an application which I did not consent to and out of their error.",Credit Reporting and Debt Collection,0,try bok ticket come acro ofer aplie ticket aplie reward card put my information ofer minute notify scren that decision make contact refer chase bank contact chase bank more minute get notification scren tell representative speak my aplication deny she state ask more information ofer she eplain have aprove credit ofer aplie first acount statement use purchase ticket tell she glad deny interested acount ask aplication withdraw representative oblige this hapene min put aplication my eplicit request proce aplication contrary what tell chase representative chase fact go open credit acount my name this report my credit report chase refuse corect information my credit report they go proce aplication which consent their eror,try bok ticket come acro ofer aplie ticket aplie reward card information ofer minute notify scren decision contact refer chase bank contact chase bank minute notification scren tell representative speak aplication deny state ask information ofer eplain aprove credit ofer aplie acount statement use purchase ticket tell glad deny interested acount ask aplication withdraw representative oblige hapene min aplication eplicit request proce aplication contrary tell chase representative chase fact open credit acount report credit report chase refuse corect information credit report proce aplication consent eror
14,Checking or savings account,my grand son give me check for {$1600.00} i deposit it into my chase account after fund clear my chase bank closed my account never paid me my money they said they need to speek with my grand son check was clear money was taking by my chase bank refuse to pay me my money my grand son called chase 2 times they told him i should call not him to verify the check owner he is out the country most the time date happen XX/XX/2018 check number XXXX claim number is XXXX with chase,Bank Account or Service,2,my grand son give I check deposit it my chase acount fund clear my chase bank close my acount pay I my money they say they ne spek my grand son check clear money take my chase bank refuse pay I my money my grand son cale chase time they tell he cal he verify check owner he country time date check number claim number chase,grand son check deposit chase acount fund clear chase bank close acount pay money ne spek grand son check clear money chase bank refuse pay money grand son cale chase time tell cal verify check owner country time date check number claim number chase


In [21]:
# Specify the file path where you want to save the modified DataFrame as a CSV file
# output_file = '/home/users/elicina/Master-Thesis/Dataset/Cleaned_Dataset.csv'

# Save the modified DataFrame to a CSV file 
# df.to_csv(output_file, index=False)