In [None]:
import pandas as pd
import numpy as np

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import nltk
# Fetch the NLTK corpora used further down: the stop-word lists and WordNet
# (needed by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
import string

# Fetch the 'punkt' tokenizer data before importing word_tokenize.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
pip install unidecode textstat

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 8.3MB/s 
[?25hCollecting textstat
[?25l  Downloading https://files.pythonhosted.org/packages/ca/b1/ab40a00b727a0d209402d1be6aa3f1bc75bd03678b59ace8507b08bf12f5/textstat-0.7.0-py3-none-any.whl (99kB)
[K     |████████████████████████████████| 102kB 7.3MB/s 
[?25hCollecting pyphen
[?25l  Downloading https://files.pythonhosted.org/packages/7c/5a/5bc036e01389bc6a6667a932bac3e388de6e7fa5777a6ff50e652f60ec79/Pyphen-0.10.0-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 9.4MB/s 
[?25hInstalling collected packages: unidecode, pyphen, textstat
Successfully installed pyphen-0.10.0 textstat-0.7.0 unidecode-1.2.0


In [None]:
# Labeled training data, downloaded as a CSV from Google Drive.
TRAIN_DATA_URL = "https://drive.google.com/uc?export=download&id=1dzzVbgHphbCf7kvq9IKiIhwzmxPbuH4s"
df = pd.read_csv(TRAIN_DATA_URL)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,loan_id,en_clean,defaulted
0,7779,She opened a colmado out of the side of her ho...,0
1,2777,(First Loan): Joffre continues to run his loc...,1
2,6007,"Dina Santana is the mother of two children, Ju...",0
3,76,"Rosemary is 50 years old, single, and has 6 ch...",1
4,4217,"Segundo has a shop where he sells animal feed,...",0


#### **Drop duplicates and drop loan_id**

In [None]:
# (rows, columns) before duplicate descriptions are removed.
df.shape

(6138, 3)

In [None]:
# Keep only the first row for each distinct description text, then re-check the size.
df = df.drop_duplicates(subset=['en_clean'], keep='first')
df.shape

(6129, 3)

In [None]:
# errors='ignore' keeps this cell re-runnable: once loan_id has been removed,
# executing the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['loan_id'], errors='ignore')
df.head()

Unnamed: 0,en_clean,defaulted
0,She opened a colmado out of the side of her ho...,0
1,(First Loan): Joffre continues to run his loc...,1
2,"Dina Santana is the mother of two children, Ju...",0
3,"Rosemary is 50 years old, single, and has 6 ch...",1
4,"Segundo has a shop where he sells animal feed,...",0


In [None]:
import spacy
from spacy import displacy
# Load the spaCy "en_core_web_sm" pipeline.
# NOTE(review): neither `nlp` nor `displacy` is referenced by the cells visible
# here — confirm this load is still needed.
nlp = spacy.load("en_core_web_sm")

## **TF-IDF**


TF-IDF measures how distinctive a word is for a document by comparing how often the word appears in that document with the number of documents in the corpus that contain it.

Some semantic information is preserved as uncommon words are given more importance than common words in TF-IDF.

**E.g. 'She is beautiful'**: here 'beautiful' will have more importance than 'she' or 'is'.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# **Text Preprocessing**

In [None]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-auy9o4kh
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-auy9o4kh
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-cp37-none-any.whl size=11743 sha256=fa5291050b1c19001a0c38071c4948d2c909707a443a95c5be4c1948c6da415d
  Stored in directory: /tmp/pip-ephem-wheel-cache-z8zb7lq5/wheels/a8/18/22/90afa4bd43247fb9a75b710a4a3fcd94966c022ce9e3c7d0a6
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
Successfully installed preprocess-kgptalkie-0.1.3


Define the `my_preprocess` function, which takes a single document string (here, one entry of the `en_clean` column) and applies the following cleaning steps:

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode
import preprocess_kgptalkie as ps

# Combined English + Spanish stop-word set, used by my_preprocess to filter tokens.
stop_words = set(stopwords.words('english')).union(stopwords.words('spanish'))
# One lemmatizer instance shared by every call to my_preprocess.
lemmer = WordNetLemmatizer()

# A nice preprocessing function that we can pass to CountVectorizer/TfidfVectorizer
def my_preprocess(doc):
    """Normalise one document into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, transliterate to ASCII, remove e-mail
    addresses / URLs / HTML tags, strip punctuation and digits, drop stop words
    (module-level ``stop_words``) and lemmatize what is left with ``lemmer``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Lowercase everything
    res = doc.lower()

    # Remove any "weird" characters
    res = unidecode.unidecode(res)

    # Remove e-mails, URLs and HTML tags *before* punctuation is stripped.
    # The punctuation regex below deletes '@', ':', '/', '.', '<' and '>', so
    # running these helpers afterwards (as before) left them nothing to match.
    res = ps.remove_emails(res)
    res = ps.remove_urls(res)
    res = ps.remove_html_tags(res)
    res = ps.remove_accented_chars(res)

    # Drop punctuation, then digits
    res = re.sub(r'[^\w\s]', '', res)
    res = re.sub(r'\d+', '', res)
    res = ps.remove_special_chars(res)

    # NOTE(review): tokens have lost apostrophes and accents by this point, so any
    # stop-word entry that still contains them cannot match — confirm if intended.
    res = [lemmer.lemmatize(w) for w in res.split() if w not in stop_words]

    return ' '.join(res)

In [None]:
# Preview the cleaned text on a copy of the first rows instead of overwriting
# df['en_clean']. The TfidfVectorizer in the pipeline applies my_preprocess
# itself, and the doc_length / num_exclamation_marks features need the original
# text: once punctuation has been stripped, the '!' count is always 0.
df.head(10).assign(en_clean=lambda rows: rows['en_clean'].apply(my_preprocess))

Unnamed: 0,en_clean,defaulted
0,opened colmado side house dominican republic c...,0
1,first loan joffre continues run locksmith moto...,1
2,dina santana mother two child julio manuel san...,0
3,rosemary year old single child grandchild join...,1
4,segundo shop sell animal feed medicine live po...,0
5,single parent mother sell grocery earn living ...,0
6,mariana jose serda owns general store magdalen...,0
7,mary mother four child one child secondary sch...,1
8,wilson seeking second loan kiva mifex first lo...,0
9,clara life two child age small town samana wan...,0


In [None]:
from sklearn.model_selection import train_test_split

X, y = df['en_clean'], df['defaulted']

# Hold out 20% of the labeled data as a validation set so that the tuned model
# can be evaluated on documents it was not fitted on.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# These functions will calculate additional features on the document.
# They will be put into the Pipeline, called via the FunctionTransformer() function.
# Each one takes an entire corpus (as a list of documents), and should return
# an array of feature values (one for each document in the corpus).
# These functions can do anything they want; I've made most of them quick
# one-liners. Hopefully the names of the functions will make them self-explanatory.

def doc_length(corpus):
    """Return the length of each document as a column vector of shape (n_docs, 1)."""
    lengths = list(map(len, corpus))
    return np.asarray(lengths).reshape(-1, 1)

def num_exclamation_marks(corpus):
    """Return the number of '!' characters per document, shape (n_docs, 1)."""
    counts = []
    for text in corpus:
        counts.append(text.count('!'))
    return np.asarray(counts).reshape(-1, 1)

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import FunctionTransformer

# Vectorizer that produces the bag-of-words (BOW) features.
vectorizer = TfidfVectorizer(
    preprocessor=my_preprocess,
    max_features=100,
    use_idf=True,
)

rf = RandomForestClassifier(criterion='entropy', random_state=223)

# The BOW features and the custom per-document features defined in the cell
# above are concatenated side by side with a FeatureUnion.
bow_features = Pipeline(steps=[('vectorizer', vectorizer)])
feature_processing = FeatureUnion(transformer_list=[
    ('bow', bow_features),
    ('doc_length', FunctionTransformer(func=doc_length, validate=False)),
    ('num_exclamation_marks', FunctionTransformer(func=num_exclamation_marks, validate=False)),
])

# Full model: feature extraction followed by the random-forest classifier.
pipe = Pipeline(steps=[('features', feature_processing), ('clf', rf)])

In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

# Hyperparameter space: vocabulary size and IDF weighting for the vectorizer,
# number of trees for the random forest. Keys follow the pipeline step names.
param_grid = {
    'features__bow__vectorizer__max_features': [500, 1000, 3000, 4000, 5000, 6000, 7000, 8000],
    'features__bow__vectorizer__use_idf': [True, False],
    'clf__n_estimators': [10, 100, 150, 200, 300],
}
cv = StratifiedKFold(n_splits=8)
# random_state pins which candidates are sampled from param_grid, so a re-run
# evaluates the same combinations and the results are comparable between runs.
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    cv=cv,
    scoring='f1_macro',
    return_train_score=True,
    verbose=8,
    random_state=42,
)
search = search.fit(X_train, y_train)

Fitting 8 folds for each of 10 candidates, totalling 80 fits
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.853), total=  10.4s
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.2s remaining:    0.0s


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.820), total=  10.5s
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   26.5s remaining:    0.0s


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.812), total=  10.4s
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   39.7s remaining:    0.0s


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.843), total=  10.5s
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   53.0s remaining:    0.0s


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.836), total=  10.4s
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.848), total=  10.4s
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.828), total=  10.4s
[CV] features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s


[CV]  features__bow__vectorizer__use_idf=False, features__bow__vectorizer__max_features=500, clf__n_estimators=200, score=(train=1.000, test=0.836), total=  10.4s
[CV] features__bow__vectorizer__use_idf=True, features__bow__vectorizer__max_features=6000, clf__n_estimators=150 
[CV]  features__bow__vectorizer__use_idf=True, features__bow__vectorizer__max_features=6000, clf__n_estimators=150, score=(train=1.000, test=0.849), total=   7.9s
[CV] features__bow__vectorizer__use_idf=True, features__bow__vectorizer__max_features=6000, clf__n_estimators=150 
[CV]  features__bow__vectorizer__use_idf=True, features__bow__vectorizer__max_features=6000, clf__n_estimators=150, score=(train=1.000, test=0.826), total=   7.8s
[CV] features__bow__vectorizer__use_idf=True, features__bow__vectorizer__max_features=6000, clf__n_estimators=150 
[CV]  features__bow__vectorizer__use_idf=True, features__bow__vectorizer__max_features=6000, clf__n_estimators=150, score=(train=1.000, test=0.817), total=   7.8s
[CV

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 17.4min finished


In [None]:
# Report the best mean cross-validation score and the parameters that produced it.
print(f"Best parameter (CV score: {search.best_score_:0.5f}):")
print(search.best_params_)

Best parameter (CV score: 0.84217):
{'features__bow__vectorizer__use_idf': False, 'features__bow__vectorizer__max_features': 3000, 'clf__n_estimators': 300}


In [None]:
# Print out the results of hyperparameter tuning

def cv_results_to_df(cv_results):
    """Summarise a search's ``cv_results_`` mapping as a DataFrame.

    One row per evaluated candidate: the candidate's parameters (one column per
    parameter) followed by the timing / score summary columns, sorted by
    ``mean_test_score`` with the best candidate first.

    Parameters
    ----------
    cv_results : mapping
        Must contain ``'params'`` (a sequence of dicts) and ``'mean_test_score'``.
        The other summary columns are copied when present.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``'params'`` or ``'mean_test_score'`` is missing.
    """
    summary_columns = (
        'mean_fit_time',
        'mean_score_time',
        'mean_train_score',
        'std_train_score',
        'mean_test_score',
        'std_test_score',
        'rank_test_score',
    )

    results = pd.DataFrame(list(cv_results['params']))
    for column in summary_columns:
        # Skip columns that are absent instead of failing: the train-score entries
        # are only there when the search ran with return_train_score=True.
        if column in cv_results:
            results[column] = cv_results[column]

    return results.sort_values(['mean_test_score'], ascending=False)

# Build the summary table; the bare name on the last line displays it.
results = cv_results_to_df(search.cv_results_)
results

Unnamed: 0,features__bow__vectorizer__use_idf,features__bow__vectorizer__max_features,clf__n_estimators,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
9,False,3000,300,12.097403,0.461868,0.99965,0.000165,0.842172,0.013192,1
5,False,1000,300,13.194972,0.450079,0.99965,0.000165,0.840127,0.015001,2
3,False,5000,300,11.799272,0.47223,0.99965,0.000165,0.838895,0.016378,3
6,False,5000,200,8.741837,0.435996,0.99965,0.000165,0.836837,0.015203,4
1,True,6000,150,7.360796,0.427759,0.99965,0.000165,0.835951,0.012416,5
2,False,3000,150,7.400777,0.428838,0.99965,0.000165,0.835357,0.013738,6
7,True,7000,200,8.833508,0.478794,0.99965,0.000165,0.835332,0.01437,7
8,False,4000,200,8.834128,0.438735,0.99965,0.000165,0.835248,0.014836,8
0,False,500,200,9.993393,0.426367,0.99965,0.000165,0.834454,0.012989,9
4,False,7000,200,8.740062,0.439787,0.99965,0.000165,0.83391,0.014678,10


In [None]:
# Predict labels for the held-out validation split using the fitted search object.
y_val_pred = search.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix of true vs. predicted labels on the validation split.
print(confusion_matrix(y_val, y_val_pred))

# Label the report rows with the classes known to the fitted best estimator.
class_names = list(map(str, search.best_estimator_.classes_))
print(classification_report(y_val, y_val_pred, target_names=class_names))

[[527  65]
 [120 514]]
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       592
           1       0.89      0.81      0.85       634

    accuracy                           0.85      1226
   macro avg       0.85      0.85      0.85      1226
weighted avg       0.85      0.85      0.85      1226



# **CONCLUSIONS: dropping loan_id didn't help a lot.**