# Kiva Project 

Predict whether a Kiva loan application will default.


# Preliminaries: Inspect and Set up environment

In [361]:
!which python

/usr/local/bin/python


In [362]:
!python --version

Python 3.7.10


In [363]:
!echo $PYTHONPATH

/env/python


In [364]:
pip install unidecode textstat



In [365]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [366]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [367]:
import string

In [368]:
import datetime
print(datetime.datetime.now())

2021-04-09 22:36:19.389176


In [369]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [370]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Read Data

We'll read the data from the links that Uncle Steve provided.

In [447]:
# The labeled training data
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1dzzVbgHphbCf7kvq9IKiIhwzmxPbuH4s")

In [448]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6138 entries, 0 to 6137
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   loan_id    6138 non-null   int64 
 1   en_clean   6138 non-null   object
 2   defaulted  6138 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 144.0+ KB


# EDA

In [449]:
df.head()

Unnamed: 0,loan_id,en_clean,defaulted
0,7779,She opened a colmado out of the side of her ho...,0
1,2777,(First Loan): Joffre continues to run his loc...,1
2,6007,"Dina Santana is the mother of two children, Ju...",0
3,76,"Rosemary is 50 years old, single, and has 6 ch...",1
4,4217,"Segundo has a shop where he sells animal feed,...",0


In [450]:
df['defaulted'].value_counts()

0    3102
1    3036
Name: defaulted, dtype: int64

In [451]:
df = df.drop_duplicates('en_clean')
df.shape

(6129, 3)

In [452]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

In [453]:
# We can use spacy to show all the named entities in a given document.

doc = nlp(df.iloc[4001].en_clean)
displacy.render(doc, style="ent", jupyter=True)

Add Text Augmentation

In [454]:
#!pip install textaugment
nltk.download('averaged_perceptron_tagger')
from textaugment import Wordnet
from tqdm import tqdm
from sklearn.utils import shuffle

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [455]:
def augment_text(df, samples=1000):
    """Append WordNet-augmented copies of randomly chosen documents to ``df``.

    For each class (``defaulted == 1`` first, then ``defaulted == 0``),
    ``samples`` rows are drawn with replacement from that class, each
    ``en_clean`` text is passed through ``Wordnet().augment``, and the
    augmented texts are appended as new rows carrying the same label.
    The frame is shuffled after each class is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``en_clean`` (text) and ``defaulted`` (0/1).
    samples : int, default 1000
        Number of augmented rows to add per class.

    Returns
    -------
    pandas.DataFrame
        The original rows plus ``2 * samples`` augmented rows, shuffled.
        Augmented rows only populate ``en_clean`` and ``defaulted``; any other
        column of ``df`` is NaN for them.

    Notes
    -----
    Sampling uses NumPy's global random state and ``sklearn.utils.shuffle``
    without a seed, so the result differs from run to run.
    """
    augmenter = Wordnet()

    def _add_augmented_rows(frame, label):
        """Return ``frame`` plus ``samples`` augmented rows of class ``label``, shuffled."""
        class_texts = frame.loc[frame.defaulted == label, 'en_clean'].reset_index(drop=True)
        # Positions are drawn with replacement, so a document may be augmented more than once.
        picks = np.random.randint(0, len(class_texts), samples)
        augmented = [augmenter.augment(class_texts.iloc[i]) for i in tqdm(picks)]
        extra = pd.DataFrame({'en_clean': augmented, 'defaulted': label})
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        return shuffle(pd.concat([frame, extra]).reset_index(drop=True))

    # Same order as before: defaulted loans first, then non-defaulted ones.
    for label in (1, 0):
        df = _add_augmented_rows(df, label)
    return df

In [456]:
df = augment_text(df)

100%|██████████| 1000/1000 [00:08<00:00, 115.37it/s]
100%|██████████| 1000/1000 [00:07<00:00, 125.95it/s]


In [457]:
df['defaulted'].value_counts()

0    4096
1    4033
Name: defaulted, dtype: int64

Add Sentiment Label Score Feature using the VADER package

In [458]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [459]:
a = 'This was a good movie.'
sid.polarity_scores(a)


{'compound': 0.4404, 'neg': 0.0, 'neu': 0.508, 'pos': 0.492}

In [460]:
df['scores'] = df['en_clean'].apply(lambda review: sid.polarity_scores(review))


In [461]:
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 1 if c >=0 else 0)

df.head(20)

Unnamed: 0,loan_id,en_clean,defaulted,scores,compound,comp_score
465,6376.0,Ramona is 45 years old and has 10 children and...,0,"{'neg': 0.0, 'neu': 0.95, 'pos': 0.05, 'compou...",0.5927,1
7578,,"this group, as a part of a larger, successful ...",0,"{'neg': 0.0, 'neu': 0.892, 'pos': 0.108, 'comp...",0.9287,1
5092,1534.0,Beatrice Anne Wangu is a single mother of 2 ch...,1,"{'neg': 0.02, 'neu': 0.842, 'pos': 0.139, 'com...",0.9467,1
2854,7537.0,"In the words of Luchi Aquino, the group has do...",0,"{'neg': 0.017, 'neu': 0.846, 'pos': 0.136, 'co...",0.9647,1
4625,1088.0,"I am a married man, a father of 3. I sell cook...",1,"{'neg': 0.0, 'neu': 0.839, 'pos': 0.161, 'comp...",0.5859,1
4306,46.0,Catherine is 49 years old. She is a single lad...,1,"{'neg': 0.0, 'neu': 0.916, 'pos': 0.084, 'comp...",0.9245,1
3716,,grace waiyego is a member of the easter mother...,1,"{'neg': 0.0, 'neu': 0.928, 'pos': 0.072, 'comp...",0.8316,1
3623,,naomi wanjiku kimotho live born 40 years ago i...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1
7244,,virginia is an active member of kabuta ladies ...,0,"{'neg': 0.0, 'neu': 0.917, 'pos': 0.083, 'comp...",0.7778,1
1265,766.0,She is married with 3 children where all of th...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1


In [462]:
import spacy
from functools import partial
def spacy_tokenize(text, nlp):
    """Run ``nlp`` on ``text`` and return the ``orth_`` string of every token, in order."""
    tokens = []
    for token in nlp(text):
        tokens.append(token.orth_)
    return tokens

# Tokenizer-only pipeline: NER, parser and tagger are disabled.
# The model is loaded by its full package name (the same one loaded earlier in
# this notebook) because the 'en' shortcut link is not resolved by spaCy v3+.
# Note that this rebinds the global `nlp`, which my_preprocess later reads for
# its stop-word list.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])
tok = partial(spacy_tokenize, nlp=nlp)

In [463]:
# Function to get the 'top N' or 'bottom N' words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def get_n_words(corpus, direction, n):
    """Return the ``n`` most or least frequent words in ``corpus``.

    Word counts come from a ``CountVectorizer`` fitted on ``corpus`` with
    English stop words removed. ``direction == "top"`` ranks by descending
    count; any other value ranks by ascending count.

    Returns a list of ``(word, total_count)`` tuples of length at most ``n``.
    """
    counter = CountVectorizer(stop_words = 'english').fit(corpus)
    # Column sums of the document-term matrix, indexed below as [0, column].
    totals = counter.transform(corpus).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in counter.vocabulary_.items()]
    descending = direction == "top"
    ranked = sorted(pairs, key=lambda pair: pair[1], reverse=descending)
    return ranked[:n]

In [464]:
# The 15 most common and the 500 rarest words, each as a (word, count) tuple
common_words = get_n_words(df['en_clean'], direction="top", n=15)
rare_words = get_n_words(df['en_clean'], direction="bottom", n=500)
#common_words, rare_words

In [465]:
#Removing common and rare words
#%time df['en_clean'] = df['en_clean'].map(lambda x : ' '.join([w for w in x.split() if w not in common_words]))
%time df['en_clean'] = df['en_clean'].map(lambda x : ' '.join([w for w in x.split() if w not in rare_words]))

CPU times: user 12.7 s, sys: 40.4 ms, total: 12.8 s
Wall time: 12.7 s


In [466]:
df.head(10)

Unnamed: 0,loan_id,en_clean,defaulted,scores,compound,comp_score
465,6376.0,Ramona is 45 years old and has 10 children and...,0,"{'neg': 0.0, 'neu': 0.95, 'pos': 0.05, 'compou...",0.5927,1
7578,,"this group, as a part of a larger, successful ...",0,"{'neg': 0.0, 'neu': 0.892, 'pos': 0.108, 'comp...",0.9287,1
5092,1534.0,Beatrice Anne Wangu is a single mother of 2 ch...,1,"{'neg': 0.02, 'neu': 0.842, 'pos': 0.139, 'com...",0.9467,1
2854,7537.0,"In the words of Luchi Aquino, the group has do...",0,"{'neg': 0.017, 'neu': 0.846, 'pos': 0.136, 'co...",0.9647,1
4625,1088.0,"I am a married man, a father of 3. I sell cook...",1,"{'neg': 0.0, 'neu': 0.839, 'pos': 0.161, 'comp...",0.5859,1
4306,46.0,Catherine is 49 years old. She is a single lad...,1,"{'neg': 0.0, 'neu': 0.916, 'pos': 0.084, 'comp...",0.9245,1
3716,,grace waiyego is a member of the easter mother...,1,"{'neg': 0.0, 'neu': 0.928, 'pos': 0.072, 'comp...",0.8316,1
3623,,naomi wanjiku kimotho live born 40 years ago i...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1
7244,,virginia is an active member of kabuta ladies ...,0,"{'neg': 0.0, 'neu': 0.917, 'pos': 0.083, 'comp...",0.7778,1
1265,766.0,She is married with 3 children where all of th...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1


# Train Test Split

In [467]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

# Features are the (filtered) loan descriptions; the target is the default flag.
X = df['en_clean']
y = df['defaulted']

# So that we can evaluate how well our model is performing, we split our training data
# into training and validation.
# NOTE(review): df was passed through augment_text() before this point, so an
# augmented paraphrase and the document it was generated from can end up on
# opposite sides of this split; validation scores may therefore be optimistic.
# Consider splitting first and augmenting only the training part.

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [468]:
X

465     Ramona is 45 years old and has 10 children and...
7578    this group, as a part of a larger, successful ...
5092    Beatrice Anne Wangu is a single mother of 2 ch...
2854    In the words of Luchi Aquino, the group has do...
4625    I am a married man, a father of 3. I sell cook...
                              ...                        
2766    Henry Gachigua Muturi is 60 years and has seve...
41      ruth is a good member of kahumbu kwirera mothe...
8065    sandra is a straightforward and humble woman w...
7401    margaret be marry with five children all in pr...
233     Dominga is a hard working woman who started he...
Name: en_clean, Length: 8129, dtype: object

# Feature Engineering and Extraction Pipeline

In [469]:
import re
import unidecode

# A nice preprocessing function that we can pass to CountVectorizer/TfidfVectorizer
# Characters replaced by a space, and the whitelist of characters kept afterwards.
# Raw strings are used so that "\[", "\]" and "\|" are not treated as (invalid)
# string escape sequences; the compiled patterns are unchanged.
_SYM_REPLACE = re.compile(r'[/(){}\[\]\|@,;]')
_REM_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')


def my_preprocess(doc):
    """Normalize one document for CountVectorizer/TfidfVectorizer.

    Steps, in order:
      1. lowercase and transliterate to ASCII with ``unidecode``;
      2. replace ``/(){}[]|@,;`` with spaces and drop every character that is
         not a digit, a lowercase letter, a space, ``#``, ``+`` or ``_``;
      3. tokenize with NLTK and lemmatize each token with WordNet;
      4. re-tokenize and drop tokens found in ``nlp.Defaults.stop_words``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    Reads the module-level ``nlp`` object for its stop-word list.
    """
    # Lowercase everything, then remove any "weird" (non-ASCII) characters
    res = unidecode.unidecode(doc.lower())

    res = _SYM_REPLACE.sub(' ', res)
    res = _REM_SYMBOLS_RE.sub('', res)

    # Lemmatize token by token
    wnl = WordNetLemmatizer()
    res = ' '.join(wnl.lemmatize(token) for token in nltk.word_tokenize(res))

    # Remove stopwords. The text is tokenized a second time, exactly as before,
    # so the output stays identical to the previous implementation.
    all_stopwords = nlp.Defaults.stop_words
    kept = [word for word in word_tokenize(res) if word not in all_stopwords]
    return " ".join(kept)

In [470]:
# These functions calculate additional features on the documents.
# They are put into the Pipeline via FunctionTransformer().
# Each one takes an entire corpus (as a list of documents) and returns
# an array of feature values (one row per document in the corpus).
# These functions can do anything they want; I've made most of them quick
# one-liners. Hopefully the function names make them self-explanatory.

def doc_length(corpus):
    """Return a column vector (one row per document) of document lengths in characters."""
    lengths = list(map(len, corpus))
    return np.array(lengths).reshape(-1, 1)

def num_exclamation_marks(corpus):
    """Return a column vector (one row per document) counting '!' characters."""
    counts = []
    for text in corpus:
        counts.append(text.count('!'))
    return np.array(counts).reshape(-1, 1)

def count_loan(corpus):
    """Return a column vector (one row per document) counting occurrences of the substring 'loan'."""
    occurrences = [text.count('loan') for text in corpus]
    column = np.array(occurrences)
    return column.reshape(-1, 1)

def generate_sentiment_score(corpus):
    """Return a column vector (one row per document) of VADER compound scores.

    Each document is scored with the module-level ``sid`` analyzer and the
    'compound' entry of its polarity dictionary is kept.
    """
    polarity = [sid.polarity_scores(text) for text in corpus]
    compound = [entry['compound'] for entry in polarity]
    return np.array(compound).reshape(-1, 1)


In [471]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Bag-of-words vectorizer (author's note: 0.86 F1 macro using random forest).
# Documents go through my_preprocess, are tokenized with the spaCy-based `tok`,
# and unigrams + bigrams are counted, keeping at most 1000 features.
# NOTE(review): stop_words='english' is combined with a custom preprocessor and
# tokenizer; the "'stop_words.' % sorted(inconsistent)" warning fragment in the
# search output presumably comes from this combination — confirm.
vectorizer = CountVectorizer(preprocessor=my_preprocess,  tokenizer=tok,min_df=2, 
                             max_df=0.5, max_features=1000,
                             stop_words='english', ngram_range=(1, 2))
# Alternative configurations that were tried and are currently disabled:
# vectorizer = CountVectorizer(preprocessor=my_preprocess, min_df=2, 
#                              max_df=0.5, max_features=1000,
#                              ngram_range=(1, 2))
# This vectorizer will be used to create the BOW features.
# vectorizer = TfidfVectorizer(preprocessor=my_preprocess, 
#                              tokenizer=tok,
#                              max_features = 1000, 
#                              use_idf=True,
#                              stop_words='english',
#                              min_df=2, max_df=10, ngram_range=[1,2])

# Classifier: random forest with entropy splits and a fixed seed.
rf = RandomForestClassifier(criterion='entropy', random_state=1)

# We will "union" together the BOW features and the custom-created features we
# created in the cell above.
# The step names used here ('bow', 'vectorizer', ...) are what the
# hyperparameter grids refer to with double-underscore paths.
feature_processing =  FeatureUnion([ 
    ('bow', Pipeline([('vectorizer', vectorizer), ])),
    ('doc_length', FunctionTransformer(doc_length, validate=False)),
    ('num_exclamation_marks', FunctionTransformer(num_exclamation_marks, validate=False)),
    ('sentiment_score', FunctionTransformer(generate_sentiment_score, validate=False)),  
   # ('count_loan', FunctionTransformer(count_loan, validate=False)),  
])

# Full model: feature extraction followed by the classifier.
pipe = Pipeline([('features', feature_processing), ('clf', rf)])

# Model Training/Tuning/Cross Validation


In [472]:
from sklearn.model_selection import GridSearchCV

# The names of the hyperparameters may look a bit funny; they're based on how the
# steps are named in the Pipeline object above (nested names are separated with double underscores)
# param_grid = {
#     'features__bow__vectorizer__max_features': [1000,1200, 1300, 1500],
#     'features__bow__vectorizer__use_idf': [True, False],
#     'clf__n_estimators': [10, 100,200],
#     'features__doc_length' : [True, False],
#     'features__num_exclamation_marks' : [True, False],
#     'features__sentiment_score' : [True, False],

# }

param_grid = {
    'features__bow__vectorizer__max_features': [1000,1150,1200, 1300],
    'clf__n_estimators': [100,200,250,300],
    'features__bow__vectorizer__ngram_range': [(1, 2)],
}


#cv = StratifiedKFold(n_splits=8)
search = GridSearchCV(pipe, 
                      param_grid, 
                      cv=5, 
                      n_jobs=5, 
                      scoring='f1_macro', 
                      return_train_score=True, 
                      verbose=2)

search = search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  6.6min
[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed: 15.6min finished
  'stop_words.' % sorted(inconsistent))


In [473]:
print("Best parameter (CV score: %0.5f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score: 0.89729):
{'clf__n_estimators': 200, 'features__bow__vectorizer__max_features': 1150, 'features__bow__vectorizer__ngram_range': (1, 2)}


In [474]:
# param_grid = {
#     'features__bow__vectorizer__max_features': [500, 1000,1200,1500,2000],
#     'features__bow__vectorizer__use_idf': [True, False],
#     'clf__n_estimators': [100,200],
# }

param_grid = {
    'features__bow__vectorizer__max_features': [1000,1150,1200, 1300],
    'clf__n_estimators': [100,200,250],
    'features__bow__vectorizer__ngram_range': [(1, 2)],
}

# Randomized search over the grid above with 8-fold stratified CV.
# n_iter is spelled out (10 of the 12 grid combinations are sampled) and
# random_state is fixed so the same candidates are evaluated on every run.
cv = StratifiedKFold(n_splits=8)
search = RandomizedSearchCV(pipe, param_distributions=param_grid, n_iter=10, cv=cv, n_jobs=5,  scoring='f1_macro', return_train_score=True, verbose=8, random_state=42)
search = search.fit(X_train, y_train)

Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  2.4min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  7.1min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed: 12.9min
[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed: 15.8min finished
  'stop_words.' % sorted(inconsistent))


In [475]:
print("Best parameter (CV score: %0.5f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score: 0.89713):
{'features__bow__vectorizer__ngram_range': (1, 2), 'features__bow__vectorizer__max_features': 1200, 'clf__n_estimators': 200}


In [476]:
# Print out the results of hyperparameter tuning

def cv_results_to_df(cv_results):
    """Flatten a search's ``cv_results_`` mapping into a DataFrame.

    One row per candidate: the parameter values come first, followed by the
    fit/score timings, the train/test score means and standard deviations, and
    the test rank. Rows are ordered by ``mean_test_score``, best first.
    """
    metric_keys = (
        'mean_fit_time',
        'mean_score_time',
        'mean_train_score',
        'std_train_score',
        'mean_test_score',
        'std_test_score',
        'rank_test_score',
    )
    table = pd.DataFrame(list(cv_results['params']))
    for key in metric_keys:
        table[key] = cv_results[key]
    return table.sort_values(['mean_test_score'], ascending=False)

results = cv_results_to_df(search.cv_results_)
results

Unnamed: 0,features__bow__vectorizer__ngram_range,features__bow__vectorizer__max_features,clf__n_estimators,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
4,"(1, 2)",1200,200,30.117843,2.863831,1.0,0.0,0.897131,0.009141,1
8,"(1, 2)",1200,250,32.459792,2.891084,1.0,0.0,0.896141,0.010902,2
3,"(1, 2)",1150,250,32.41831,2.893218,1.0,0.0,0.895312,0.011882,3
0,"(1, 2)",1000,250,33.03606,2.896794,1.0,0.0,0.894667,0.009284,4
2,"(1, 2)",1300,200,30.495811,2.893356,1.0,0.0,0.894504,0.01171,5
9,"(1, 2)",1300,100,25.811119,2.855398,1.0,0.0,0.894004,0.010988,6
7,"(1, 2)",1150,200,30.379202,2.883795,1.0,0.0,0.89351,0.01191,7
1,"(1, 2)",1150,100,26.272253,2.852501,1.0,0.0,0.892524,0.010039,8
5,"(1, 2)",1000,100,26.091417,2.843392,1.0,0.0,0.889568,0.00997,9
6,"(1, 2)",1200,100,26.139821,2.851814,1.0,0.0,0.889404,0.012025,10


# Model Assessment

In [485]:
y_val_pred = search.predict(X_val)

In [486]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(confusion_matrix(y_true = y_val, y_pred = y_val_pred))

class_names = [str(x) for x in search.best_estimator_.classes_]
print(classification_report(y_true = y_val, y_pred = y_val_pred, target_names=class_names))

[[966  84]
 [122 861]]
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      1050
           1       0.91      0.88      0.89       983

    accuracy                           0.90      2033
   macro avg       0.90      0.90      0.90      2033
weighted avg       0.90      0.90      0.90      2033



# Kaggle Predictions

In [487]:
# Read in the unlabeled testing data (for the Kaggle competition)
df_test = pd.read_csv("https://drive.google.com/uc?export=download&id=1EVWfyqQOd_W2uTKrr4JTD2iFrEZHoOHT")

In [488]:
# Use our pipeline to make predictions; then output predictions to a CSV file.

pred_test = search.predict(df_test['en_clean'])
my_submission = pd.DataFrame({'id': df_test['loan_id'], 'predicted': pred_test})
my_submission.head()

# This command will save the file to the local cloud instance; it will be deleted
# as soon as this Notebooks session ends.
my_submission.to_csv('my_submission.csv', index=False)

Unnamed: 0,id,predicted
0,6607,0
1,154,1
2,7402,0
3,2617,1
4,6464,0


In [489]:
# Download predictions file to your local computer

from google.colab import files
files.download('my_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [490]:
print(datetime.datetime.now())

2021-04-10 00:04:26.910694


In [None]:
from google.colab import drive
drive.mount('/content/drive')