# Data Exploration - Lending Club

## 0. Prerequisites - Installing Packages and Loading Files

In [None]:
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import pandas as pd
import toml
import os
# Move up one directory, presumably to the project root that holds "config.toml"
# (loaded further down via a relative path) — TODO confirm the layout.
# Guarded so that re-running this cell, or starting the kernel in the root already,
# does not climb one directory further on every execution.
if not os.path.exists("config.toml"):
    os.chdir("..")

from nltk import word_tokenize 
from nltk.util import ngrams

import re
import numpy as np

import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import pyLDAvis.gensim_models as gensimvis

from sklearn.feature_extraction.text import TfidfVectorizer
import re, nltk, spacy, gensim

from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

from collections import Counter

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from src.utils import *

import pylab as pl

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [None]:
# Parse "config.toml" (resolved against the current working directory) into a dict;
# its "data" table supplies the CSV locations used when loading the frames.
config = toml.load("config.toml")

In [None]:
# Read the two LendingClub CSV files whose paths are listed in the "data" table of the config.
data_paths = config["data"]
df_acc = pd.read_csv(data_paths["lending_club_acc"])
df_rej = pd.read_csv(data_paths["lending_club_rej"])

## 1. First Investigations

In [None]:
# Column overview of df_rej: dtype and non-null count for every column.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
df_rej.info(verbose=True, show_counts=True)

In [None]:
# Dimensions of df_acc as (number of rows, number of columns).
df_acc.shape

In [None]:
# Show the "Loan Title" column of df_rej.
df_rej["Loan Title"]

In [None]:
# Number of distinct values in the "id" column of df_acc.
df_acc["id"].nunique()

In [None]:
# Frequency of each distinct "Loan Title" in df_rej, printed with display.min_rows
# temporarily set to 20.
with pd.option_context("display.min_rows", 20):
    loan_title_counts = df_rej["Loan Title"].value_counts()
    print(loan_title_counts)

In [None]:
# Frequency of every loan_status value. The index of value_counts() already lists the
# distinct statuses, so the former bare `.unique()` call is dropped: it was not the last
# expression of the cell, hence its result was computed and silently discarded.
with pd.option_context("display.min_rows", 50):
    print(df_acc["loan_status"].value_counts())

## 2. Summarization & Feature Analysis

In [None]:
# Keep only the rows that have both a description and a title; the shape is printed
# before and after to show how many rows the filter removes.
print(df_acc.shape)
# .copy() makes the result an independent frame, so later column assignments
# (e.g. adding desc_word_count) do not write into a slice of the loaded data.
df_acc = df_acc[df_acc['desc'].notnull() & df_acc['title'].notnull()].copy()
print(df_acc.shape)

In [None]:
plt.rcParams["figure.figsize"] = (7,6)
plt.rcParams["figure.dpi"] = 300

# Approximate word count per description: number of space characters plus one.
# NOTE(review): repeated or leading spaces inflate this count; left unchanged because
# the later length filter is expressed in terms of this column.
df_acc['desc_word_count'] = df_acc['desc'].str.count(' ') + 1

df_acc['desc_word_count'].hist(bins=60, grid=False, figsize=(12,8), color='#2077B4', zorder=2, rwidth=0.9)
plt.title("Histogram of Text Lengths for Loan Goal Descriptions", fontsize=22)
plt.xlabel("Text Length", fontsize=18)
# The histogram is a plain (non-cumulative) one, so the axis shows per-bin counts.
plt.ylabel("Count", fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# NOTE(review): 45.5 is hard-coded; presumably a summary statistic of desc_word_count — confirm.
plt.axvline(x=45.5, ymin=0, ymax=1, linewidth=3, color="#000000")
# Create the output directory on demand so saving works on a fresh checkout.
os.makedirs('charts', exist_ok=True)
plt.savefig('charts/hist_text_lengths.png', dpi=150)

In [None]:
# Mean of desc_word_count over the rows of df_acc.
df_acc['desc_word_count'].mean()

In [None]:
# Keep the two text columns plus the word count, and only rows whose desc_word_count exceeds 20.
# A single .loc selection followed by .copy() yields an independent frame, so later
# column assignments on df_acc do not operate on a filtered slice.
df_acc = df_acc.loc[df_acc['desc_word_count'] > 20, ['desc', 'title', 'desc_word_count']].copy()

In [None]:
# Longest loan goal description (the row with the highest desc_word_count).
# Row and column are selected in one .loc call instead of chained indexing.
df_acc.loc[df_acc['desc_word_count'].idxmax(), 'desc']

In [None]:
# Show the description of the first row of df_acc (selected by position).
df_acc["desc"].iloc[0]

In [None]:
nlp_cols = ["title", "desc"]


def replace_empties(x):
    """Return `x` unchanged if it contains a non-whitespace character, otherwise NaN."""
    # Raw string: "\S" is an invalid escape sequence in a normal string literal.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    return x if re.search(r"\S", x) else np.nan


for col in nlp_cols:
    # na_action="ignore" passes existing NaN values through without calling replace_empties.
    df_acc[col] = df_acc[col].map(replace_empties, na_action="ignore")

# Summary statistics of the text columns; displayed as the cell output.
description = df_acc[nlp_cols].describe()
description

In [None]:
num_loans = df_acc.shape[0]

# Share of rows with a non-missing value, per text column (truncated to whole percent).
for col in nlp_cols:
    percentage = int(description.at["count", col] / num_loans * 100)
    print(f"`{col}` is used in {percentage}% of loans in the dataset.")

# The "freq" row of describe() belongs to whatever value is in the "top" row, so the
# message names that title instead of assuming it is always "Debt consolidation".
top_title = description.at["top", "title"]
percentage = int(description.at["freq", "title"] / num_loans * 100)
print(f'The title "{top_title}" is used in {percentage}% of loans.')

## 3. Delete HTML Tags and 'Borrower Added' Tags

In [None]:
# Leading "Borrower added on NN/NN/NN > " tag. Written as a raw string because "\s" and
# "\d" are invalid escape sequences in a normal string literal.
pattern = r"^\s*Borrower added on \d\d/\d\d/\d\d > "

# True for descriptions that are missing or that start with the tag (case-insensitive).
# NOTE(review): the name "other_descs" suggests the descriptions WITHOUT the tag may have
# been intended; the selection logic is left as it was — confirm.
other_desc_map = df_acc["desc"].map(
    lambda x: bool(pd.isna(x) or re.search(pattern, x, re.I))
)
other_descs = df_acc["desc"][other_desc_map]
other_descs.value_counts()

In [None]:
# Cleans all <Borrower added...> and <br> tags
def clean_desc(desc):
    """Strip borrower tags and line-break markup from a loan description.

    Every "Borrower added on NN/NN/NN > " tag (not only one at the very start of the
    text) and every <br> tag (including the <br/> and <br /> spellings) is replaced by
    a single space; leading and trailing whitespace is then removed. Matching is
    case-insensitive, in line with the tag check that uses re.I.

    Parameters
    ----------
    desc : str or missing value
        Raw description text.

    Returns
    -------
    str or missing value
        The cleaned text; missing values (anything pd.isna reports) are returned unchanged.
    """
    if pd.isna(desc):
        return desc
    # No "^" anchor: descriptions can contain several borrower tags, and all of them
    # must go. The former second pass over "<br>" was redundant and is folded in here.
    markup = r"\s*Borrower added on \d\d/\d\d/\d\d > |<br\s*/?>"
    return re.sub(markup, " ", desc, flags=re.IGNORECASE).strip()
# Apply clean_desc to every description; missing values are returned unchanged by clean_desc.
df_acc["desc"] = df_acc["desc"].map(clean_desc)

In [None]:
# Display the "desc" column to inspect the result of the clean_desc mapping.
df_acc["desc"]

## 4. Topic Analysis

## 4.1 Text Cleanup

In [None]:
# Keep only the first 1000 rows of df_acc for everything that follows.
# NOTE(review): presumably a sample to keep the topic analysis fast — confirm; the
# remaining rows are no longer reachable through df_acc after this reassignment.
df_acc = df_acc.head(1000)

In [None]:
# One whitespace-split token list per row. The empty-string replacement earlier in the
# notebook can leave NaN in these columns; such rows yield an empty list instead of
# raising AttributeError on `.split()`, so both corpora stay aligned with df_acc.
corpus_desc = [text.split() if isinstance(text, str) else [] for text in df_acc["desc"]]
corpus_title = [text.split() if isinstance(text, str) else [] for text in df_acc["title"]]

# All description tokens, lower-cased, in document order.
corpus_flat = [token.lower() for tokens in corpus_desc for token in tokens]

counts = Counter(corpus_flat)
# Column 0 holds the token and column 1 its frequency, ordered from most to least common.
df_word_counts = pd.DataFrame(counts.most_common())

In [None]:
plt.rcParams["figure.figsize"] = (7,6)
plt.rcParams["figure.dpi"] = 300

# Plot at most 25 bars; fewer when there are fewer distinct tokens, so that bar
# positions, heights and tick labels always have matching lengths.
top_tokens = df_word_counts.iloc[:25]
y_pos = np.arange(len(top_tokens))
plt.figure(figsize=(12,8))
plt.bar(y_pos, top_tokens[1], align='center', color='#2077B4')
plt.xticks(y_pos, top_tokens[0].values, rotation='vertical')
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Tokens', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# NOTE(review): "occurance" is misspelled in the title; the string is left unchanged here.
plt.title('Top 25 tokens by occurance in LendingClub dataset', fontsize=22)
# Create the output directory on demand so saving works on a fresh checkout.
os.makedirs('charts', exist_ok=True)
plt.savefig('charts/top_25_tokens.png', dpi=150)

In [None]:
plt.rcParams["figure.figsize"] = (7,6)
plt.rcParams["figure.dpi"] = 300

# Plot at most 500 bars; fewer when there are fewer distinct tokens, so that bar
# positions and heights always have matching lengths.
y_pos = np.arange(min(500, len(df_word_counts)))
plt.figure(figsize=(12,8))
# Reference curve f(rank) = f(1) / rank**s with exponent s; it is only computed here and
# drawn by the optional (currently disabled) overlay below.
s = 1
expected_zipf = [df_word_counts[1][0]/(i+1)**s for i in y_pos]
plt.bar(y_pos, df_word_counts[1].iloc[:500], align='center',color = "#2077B4")
#plt.plot(y_pos, expected_zipf, color='r', linestyle='--',linewidth=2,alpha=0.5)
plt.ylabel('Frequency',fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title('Top 500 tokens in LendingClub dataset', fontsize=22)
# Create the output directory on demand so saving works on a fresh checkout.
os.makedirs('charts', exist_ok=True)
plt.savefig('charts/top_500_tokens.png', dpi=150)