In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load in data: df
df = pd.read_csv('https://s3.amazonaws.com/clcarverloans/data/df_cleaned.csv')

In [3]:
# Select loan 'Use' column: loan_use
loan_use = df['Use'].dropna() # Drop the one missing 'Use' datapoint

# Select loan label 'Delinquent' to compare after the cluster: labels
labels = df['Delinquent']

### Create TF-IDF Vectors of Loan Use

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Create vectorizor and fit_transfrom 'loan_use': vectorizor, x
vectorizer = TfidfVectorizer(stop_words='english')
x = vectorizer.fit_transform(loan_use)

### Cluster the text descriptions

In [6]:
from sklearn.cluster import KMeans

In [7]:
# Cluster vectorized descriptions: km
clusters = 8
km = KMeans(n_clusters=8)
km.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [8]:
# Print top terms
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(clusters):
    print('Cluster {}:'.format(i+1)),
    for ind in order_centroids[i, :10]:
        print(' -{}'.format(terms[ind])),
    print

Top terms per cluster:
Cluster 1:
 -purchase
 -business
 -sell
 -supplies
 -inventory
 -expand
 -merchandise
 -buying
 -livestock
 -products
Cluster 2:
 -capital
 -working
 -additional
 -inventory
 -purchase
 -increase
 -use
 -business
 -provide
 -store
Cluster 3:
 -materials
 -purchase
 -raw
 -construction
 -buy
 -building
 -needed
 -business
 -provide
 -service
Cluster 4:
 -fertilizers
 -manure
 -purchase
 -seeds
 -farm
 -inputs
 -buy
 -supplies
 -fertilizer
 -farming
Cluster 5:
 -buy
 -sell
 -fertilizer
 -rice
 -supplies
 -clothes
 -business
 -products
 -merchandise
 -bags
Cluster 6:
 -expanding
 -invest
 -business
 -merchandise
 -food
 -retail
 -clothing
 -vegetable
 -specialized
 -selling
Cluster 7:
 -stock
 -sell
 -store
 -add
 -buy
 -purchase
 -increase
 -bags
 -items
 -business
Cluster 8:
 -products
 -additional
 -sell
 -purchase
 -fish
 -buy
 -store
 -capital
 -vegetables
 -bananas


### Add Cluster back into the dataframe

In [None]:
df_merged = pd.concat([df, pd.Series(km.labels_)], axis=1)

In [None]:
df_merged