### Die wichtigen Packages importieren und die CSV-Datei einlesen.

In [4]:
import re
import pandas as pd

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation


# Only the first 4200 complaints are analysed. `nrows` stops parsing after that
# many rows, whereas the previous `skipfooter=551757` had to read the whole file
# with the slow python engine and silently depended on the file's exact length.
df = pd.read_csv('consumer_complaints.csv', nrows=4200)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Die 'issue' Spalte analysieren.

In [5]:
# Grab the 'issue' column as a Series and print its (truncated) preview.
complaints = df.loc[:, 'issue']
print(complaints)

0       Loan modification,collection,foreclosure
1       Loan servicing, payments, escrow account
2         Incorrect information on credit report
3                             Repaying your loan
4             False statements or representation
                          ...                   
4195             Disclosure verification of debt
4196       Identity theft / Fraud / Embezzlement
4197    Loan modification,collection,foreclosure
4198             Disclosure verification of debt
4199                                       Other
Name: issue, Length: 4200, dtype: object


### Hier werden die Kategorien der Spalte 'issue' mit ihrer Häufigkeit angezeigt.

In [6]:
# Number of complaints per distinct issue category, most frequent first.
df['issue'].value_counts(sort=True, ascending=False)

Loan modification,collection,foreclosure    808
Loan servicing, payments, escrow account    488
Incorrect information on credit report      377
Cont'd attempts collect debt not owed       327
Account opening, closing, or management     218
                                           ... 
Wrong amount charged or received              2
Charged fees or interest I didn't expect      2
Convenience checks                            1
Cash advance                                  1
Cash advance fee                              1
Name: issue, Length: 64, dtype: int64

### Bereinigung des Beschwerdetextes

#### Schritt 1: Kleinbuchstabenumwandlung
#### Schritt 2: Tokenisierung
#### Schritt 3: Entfernung von Stoppwörtern
#### Schritt 4: Stemming
#### Schritt 5: Lemmatisierung 

In [7]:
# Keep an untouched copy of the raw text so this cell can be re-run safely.
# The cleaned tokens overwrite df['issue'] (later cells read that column), so
# without the copy a second run would operate on token lists instead of strings
# and fail.
if 'issue_raw' not in df.columns:
    df['issue_raw'] = df['issue']

stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")
lemma = WordNetLemmatizer()


def clean_issue(text):
    """Turn one raw issue string into a list of cleaned tokens.

    Steps, in order: lowercase, tokenize, drop English stop words,
    Snowball-stem each token, then WordNet-lemmatize each stemmed token.

    Parameters
    ----------
    text : str
        Raw text of a single complaint's 'issue' field.

    Returns
    -------
    list of str
        The cleaned tokens. Punctuation tokens are not removed.
    """
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    # NOTE(review): the lemmatizer runs on already-stemmed tokens; the printed
    # vocabulary contains 'lea', presumably a product of this double reduction.
    # Consider keeping only one of the two steps.
    return [lemma.lemmatize(token) for token in tokens]


df['issue'] = df['issue_raw'].apply(clean_issue)

### Die bereinigten Beschwerden werden zu einer einzigen Token-Liste zusammengeführt, aus der anschließend der Wortschatz aufgebaut wird.

In [8]:
# One space-joined string per complaint; the sklearn vectorizers further down
# take this list as their input documents.
results = [' '.join(tokens) for tokens in df['issue']]

# Flat token list over all complaints, used to build the vocabulary.
# (The former loop that first filled `complaints` row by row was dead code:
# its result was overwritten before ever being read.)
complaints = word_tokenize(' '.join(results))

### Der Wortschatz wurde aufgebaut, wobei jedes einzelne bereinigte Wort aus den Beschwerden nur einmal im Vokabular erscheint.

In [9]:
# dict keys are unique and keep insertion order, so this yields every token
# exactly once, in order of first appearance, without rescanning a growing
# list for each token (which was quadratic in the worst case).
voc = list(dict.fromkeys(complaints))
print(voc)

['loan', 'modif', ',', 'collect', 'foreclosur', 'servic', 'payment', 'escrow', 'account', 'incorrect', 'inform', 'credit', 'report', 'repay', 'fals', 'statement', 'represent', 'applic', 'process', 'delay', 'line', 'increase/decreas', 'deposit', 'withdraw', 'cont', "'d", 'attempt', 'debt', 'owe', 'decis', '/', 'underwrit', 'origin', 'mortgag', 'broker', 'communic', 'tactic', 'late', 'fee', 'improp', 'contact', 'share', 'info', 'bill', 'disput', 'making/receiv', 'send', 'money', 'reward', 'manag', 'lea', 'settlement', 'cost', 'taking/threaten', 'illeg', 'action', 'disclosur', 'verif', 'ident', 'theft', 'fraud', 'embezzl', 'custom', 'relat', 'forbear', 'workout', 'plan', 'closing/cancel', 'use', 'open', 'close', 'compani', "'s", 'investig', 'apr', 'interest', 'rate', 'problem', 'unabl', 'pay', 'monitor', 'protect', 'balanc', 'transfer', 'get', 'report/credit', 'score', 'wrong', 'amount', 'charg', 'receiv', 'term', 'chang', 'scam', 'card', 'determin', 'take', 'transact', 'issu', 'caus', 'f

### Einrichten des Wörterbuchs für die Bag-of-Words-Methode, das die Häufigkeit jedes Worts in einer Beschwerde zählt, und Aufbau des BoW-Modells.

In [10]:
def CalcBow(voc, complaints):
    """Count how often each vocabulary word occurs in one complaint.

    Parameters
    ----------
    voc : iterable of str
        Vocabulary words; they become the keys of the result, in this order.
    complaints : sequence
        Tokens of a single complaint; anything with a ``.count(word)`` method.

    Returns
    -------
    dict
        Maps every word in ``voc`` to its number of occurrences
        (0 if the word is absent).
    """
    bag = {}
    for term in voc:
        bag[term] = complaints.count(term)
    return bag
# One count dictionary per complaint; stacking them gives the document-term
# table (rows = complaints, columns = vocabulary words).
bow_rows = [CalcBow(voc, tokens) for tokens in df['issue']]
dataframe_Bow = pd.DataFrame(bow_rows)
print(dataframe_Bow)

      loan  modif  ,  collect  foreclosur  servic  payment  escrow  account  \
0        1      1  2        1           1       0        0       0        0   
1        1      0  2        0           0       1        1       1        1   
2        0      0  0        0           0       0        0       0        0   
3        1      0  0        0           0       0        0       0        0   
4        0      0  0        0           0       0        0       0        0   
...    ...    ... ..      ...         ...     ...      ...     ...      ...   
4195     0      0  0        0           0       0        0       0        0   
4196     0      0  0        0           0       0        0       0        0   
4197     1      1  2        1           1       0        0       0        0   
4198     0      0  0        0           0       0        0       0        0   
4199     0      0  0        0           0       0        0       0        0   

      incorrect  ...  unsolicit  issuanc  advertis 

### Erstellung des BoW-Modells mit sklearn

In [11]:
# Bag-of-words counts via scikit-learn: fit on the joined complaint strings and
# expose the matrix as a DataFrame with one column per learned term.
vectorizer1 = CountVectorizer()
data = pd.DataFrame(
    vectorizer1.fit_transform(results).toarray(),
    columns=vectorizer1.get_feature_names_out(),
)
print(data.head())

   account  action  advanc  advertis  amount  applic  apr  atm  attempt  \
0        0       0       0         0       0       0    0    0        0   
1        1       0       0         0       0       0    0    0        0   
2        0       0       0         0       0       0    0    0        0   
3        0       0       0         0       0       0    0    0        0   
4        0       0       0         0       0       0    0    0        0   

   avail  ...  transact  transfer  unabl  underwrit  unsolicit  use  verif  \
0      0  ...         0         0      0          0          0    0      0   
1      0  ...         0         0      0          0          0    0      0   
2      0  ...         0         0      0          0          0    0      0   
3      0  ...         0         0      0          0          0    0      0   
4      0  ...         0         0      0          0          0    0      0   

   withdraw  workout  wrong  
0         0        0      0  
1         0        0

### TF-IDF

In [12]:
# TF-IDF weighting of the same documents. `mod` keeps the matrix returned by
# fit_transform because the LSA cell consumes it directly.
vectorizer2 = TfidfVectorizer(min_df=1)
mod = vectorizer2.fit_transform(results)

tfidf_terms = vectorizer2.get_feature_names_out()
data_IDF = pd.DataFrame(data=mod.toarray(), columns=tfidf_terms)
print(data_IDF.head())

    account  action  advanc  advertis  amount  applic  apr  atm  attempt  \
0  0.000000     0.0     0.0       0.0     0.0     0.0  0.0  0.0      0.0   
1  0.425306     0.0     0.0       0.0     0.0     0.0  0.0  0.0      0.0   
2  0.000000     0.0     0.0       0.0     0.0     0.0  0.0  0.0      0.0   
3  0.000000     0.0     0.0       0.0     0.0     0.0  0.0  0.0      0.0   
4  0.000000     0.0     0.0       0.0     0.0     0.0  0.0  0.0      0.0   

   avail  ...  transact  transfer  unabl  underwrit  unsolicit  use  verif  \
0    0.0  ...       0.0       0.0    0.0        0.0        0.0  0.0    0.0   
1    0.0  ...       0.0       0.0    0.0        0.0        0.0  0.0    0.0   
2    0.0  ...       0.0       0.0    0.0        0.0        0.0  0.0    0.0   
3    0.0  ...       0.0       0.0    0.0        0.0        0.0  0.0    0.0   
4    0.0  ...       0.0       0.0    0.0        0.0        0.0  0.0    0.0   

   withdraw  workout  wrong  
0       0.0      0.0    0.0  
1       0.0   

# Semantische Analyse
### LDA - Latent Dirichlet Allocation

In [13]:
# Fit a 7-topic LDA model on the bag-of-words counts; `lda` holds one row of
# topic weights per complaint.
lda_mod1 = LatentDirichletAllocation(
    n_components=7,
    learning_method='online',
    random_state=42,
)
lda = lda_mod1.fit_transform(data)

# Show the topic mixture of the first complaint only, scaled to percent.
print("Topic: ")
for i in range(len(lda[0])):
    topic = lda[0][i]
    print("Topic: ", i, ": ", topic * 100, "%")

Topic: 
Topic:  0 :  2.857142875724331 %
Topic:  1 :  2.8598540844835294 %
Topic:  2 :  2.8571429007493934 %
Topic:  3 :  2.8582703805523084 %
Topic:  4 :  2.857142909067227 %
Topic:  5 :  2.8595130062591916 %
Topic:  6 :  82.85093384316401 %


### Die Schlüsselwörter für jedes Thema.

In [14]:
voca1 = vectorizer1.get_feature_names_out()
# Take the topic count from the fitted model instead of repeating the literal 7,
# so this cell stays in step with the n_components given to the LDA model.
num_topics = lda_mod1.components_.shape[0]

for i, comp in enumerate(lda_mod1.components_):
    # Pair every vocabulary term with its weight in this topic and keep the
    # ten highest-weighted terms.
    sd_words = sorted(zip(voca1, comp), key=lambda x: x[1], reverse=True)[:10]

    print(f"Topic {i}:")
    print(' '.join(word for word, _ in sd_words))
    print()

Topic 0:
credit incorrect inform report communic tactic bill disput protect money

Topic 1:
debt collect attempt cont owe applic broker mortgag origin line

Topic 2:
credit report investig compani get unabl score fee account late

Topic 3:
manag account close open loan repay info improp contact share

Topic 4:
problem low fund caus pay unabl card credit decis underwrit

Topic 5:
servic account loan payment escrow disclosur verif debt statement process

Topic 6:
loan foreclosur modif collect withdraw deposit lea action threaten illeg



### LSA - Latente semantische Analyse 

In [15]:
# The randomized SVD solver is stochastic; fixing random_state makes the
# decomposition reproducible across runs, as the LDA model already is.
lsa_mod2 = TruncatedSVD(n_components=7, algorithm='randomized', n_iter=10, random_state=42)
lsa = lsa_mod2.fit_transform(mod)

# Coordinates of the first complaint along each LSA component.
# NOTE(review): these are signed projection values (the recorded output
# contains negative numbers), not probabilities, so the "%" label is only a
# scaled display and should not be read as a topic share.
print("Topic: ")
for i, topic in enumerate(lsa[0]):
    print("Topic: ",i,": ",topic*100,"%")

Topic: 
Topic:  0 :  97.85178975575124 %
Topic:  1 :  -17.618934235813434 %
Topic:  2 :  -0.016028419919769638 %
Topic:  3 :  -10.302836566059026 %
Topic:  4 :  1.2096440185370292 %
Topic:  5 :  -8.697545631096092e-07 %
Topic:  6 :  8.005335195532323e-07 %


### Die Schlüsselwörter für jedes Thema.

In [16]:
voca2 = vectorizer2.get_feature_names_out()
# Take the component count from the fitted model instead of repeating the
# literal 7, so this cell stays in step with the n_components given to TruncatedSVD.
num_topics = lsa_mod2.components_.shape[0]

for i, comp in enumerate(lsa_mod2.components_):
    # Pair every vocabulary term with its loading on this component and keep
    # the ten largest (most positive) loadings.
    sd_words = sorted(zip(voca2, comp), key=lambda x: x[1], reverse=True)[:10]

    print(f"Topic {i}:")
    print(' '.join(word for word, _ in sd_words))
    print()

Topic 0:
foreclosur modif collect loan escrow servic payment account attempt cont

Topic 1:
escrow servic payment account loan manag close open repay lea

Topic 2:
report credit inform incorrect compani investig score get unabl use

Topic 3:
debt attempt cont owe collect verif disclosur account escrow servic

Topic 4:
manag close open account lea foreclosur modif collect line delinqu

Topic 5:
communic tactic lea manag loan take unabl pay shop get

Topic 6:
deposit withdraw unabl pay illeg taking threaten action repay get

