In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load in data: df
df = pd.read_csv('https://s3.amazonaws.com/clcarverloans/data/df_cleaned.csv')

In [3]:
# Select loan 'Use' column: loan_use
loan_use = df['Use'].dropna() # Drop the one missing 'Use' datapoint

# Select loan label 'Delinquent' to compare after the cluster: labels
labels = df['Delinquent']

In [8]:
loan_use.head()

0    To purchase construction materials and labor f...
1                        to buy construction materials
2                   to purchase construction materials
3        to expand and resupply three small businesses
4                      to expand four small businesses
Name: Use, dtype: object

### Create TF-IDF Vectors of Loan Use

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Create vectorizor and fit_transfrom 'loan_use': vectorizor, x
vectorizer = TfidfVectorizer(stop_words='english')
x = vectorizer.fit_transform(loan_use)

In [18]:
with open('../scripts/vectorizer.pkl', 'wb') as output:
    pickle.dump(vectorizer, output, pickle.HIGHEST_PROTOCOL)

### Cluster the text descriptions

In [6]:
from sklearn.cluster import KMeans

In [7]:
# Cluster vectorized descriptions: km
clusters = 8
km = KMeans(n_clusters=8)
km.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [17]:
# Save clustering model: cluster_model
import pickle
with open('../scripts/cluster_model.pkl', 'wb') as output:
    pickle.dump(km, output, pickle.HIGHEST_PROTOCOL)

In [16]:
# Create vectorizor and transfrom 'loan_use': vectorizor, x
use_vec = vectorizer.transform(['to buy construction materials'])
km.predict(use_vec)[0]

7

In [8]:
# Print top terms
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(clusters):
    print('Cluster {}:'.format(i+1)),
    for ind in order_centroids[i, :10]:
        print(' -{}'.format(terms[ind])),
    print

Top terms per cluster:
Cluster 1:
 -purchase
 -sell
 -business
 -stock
 -materials
 -expand
 -inventory
 -merchandise
 -buying
 -increase
Cluster 2:
 -supplies
 -farm
 -feed
 -purchase
 -livestock
 -buy
 -farming
 -fertilizers
 -equipment
 -general
Cluster 3:
 -fertilizer
 -fertilizers
 -seeds
 -manure
 -purchase
 -buy
 -farm
 -seed
 -crops
 -pesticide
Cluster 4:
 -products
 -additional
 -sell
 -purchase
 -fish
 -buy
 -store
 -capital
 -vegetables
 -bananas
Cluster 5:
 -expanding
 -invest
 -business
 -merchandise
 -food
 -retail
 -clothing
 -vegetable
 -specialized
 -selling
Cluster 6:
 -capital
 -working
 -additional
 -inventory
 -purchase
 -increase
 -use
 -business
 -provide
 -store
Cluster 7:
 -buy
 -sell
 -clothes
 -business
 -products
 -materials
 -merchandise
 -store
 -stock
 -clothing
Cluster 8:
 -rice
 -sugar
 -oil
 -flour
 -buy
 -cooking
 -beans
 -bags
 -purchase
 -sell


### Add Cluster back into the dataframe

In [19]:
df_cluster = pd.concat([df, pd.DataFrame({'Cluster': km.labels_})], axis=1)

In [21]:
df_cluster.drop(columns='Unnamed: 0', inplace=True)