# Kiva Project

Predict whether a Kiva loan application will default.

# Preliminaries: Inspect and Set up environment

In [None]:
!which python

/usr/local/bin/python


In [None]:
!python --version

Python 3.7.10


In [None]:
!echo $PYTHONPATH

/env/python


In [None]:
pip install unidecode textstat

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 6.7MB/s 
[?25hCollecting textstat
[?25l  Downloading https://files.pythonhosted.org/packages/ca/b1/ab40a00b727a0d209402d1be6aa3f1bc75bd03678b59ace8507b08bf12f5/textstat-0.7.0-py3-none-any.whl (99kB)
[K     |████████████████████████████████| 102kB 8.4MB/s 
[?25hCollecting pyphen
[?25l  Downloading https://files.pythonhosted.org/packages/7c/5a/5bc036e01389bc6a6667a932bac3e388de6e7fa5777a6ff50e652f60ec79/Pyphen-0.10.0-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 18.4MB/s 
[?25hInstalling collected packages: unidecode, pyphen, textstat
Successfully installed pyphen-0.10.0 textstat-0.7.0 unidecode-1.2.0


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import string

In [None]:
import datetime
print(datetime.datetime.now())

2021-04-11 07:14:49.128093


In [None]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Read Data

In [None]:
# The labeled training data
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1dzzVbgHphbCf7kvq9IKiIhwzmxPbuH4s")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6138 entries, 0 to 6137
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   loan_id    6138 non-null   int64 
 1   en_clean   6138 non-null   object
 2   defaulted  6138 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 144.0+ KB


# EDA

In [None]:
df.head()

Unnamed: 0,loan_id,en_clean,defaulted
0,7779,She opened a colmado out of the side of her ho...,0
1,2777,(First Loan): Joffre continues to run his loc...,1
2,6007,"Dina Santana is the mother of two children, Ju...",0
3,76,"Rosemary is 50 years old, single, and has 6 ch...",1
4,4217,"Segundo has a shop where he sells animal feed,...",0


In [None]:
df['defaulted'].value_counts()

0    3102
1    3036
Name: defaulted, dtype: int64

In [None]:
df = df.drop_duplicates('en_clean')
df.shape

(6129, 3)

In [None]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# We can use spacy to show all the named entities in a given document.

doc = nlp(df.iloc[4001].en_clean)
displacy.render(doc, style="ent", jupyter=True)

Add Text Augmentation

In [None]:
!pip install textaugment
nltk.download('averaged_perceptron_tagger')
from textaugment import Wordnet
from tqdm import tqdm
from sklearn.utils import shuffle

Collecting textaugment
  Downloading https://files.pythonhosted.org/packages/2c/63/9960414280dba3d9eba332502231d69fdc8ba664a4bd3d46842ba8cf0ef2/textaugment-1.3.4-py3-none-any.whl
Collecting googletrans
  Downloading https://files.pythonhosted.org/packages/71/3a/3b19effdd4c03958b90f40fe01c93de6d5280e03843cc5adf6956bfc9512/googletrans-3.0.0.tar.gz
Collecting httpx==0.13.3
[?25l  Downloading https://files.pythonhosted.org/packages/54/b4/698b284c6aed4d7c2b4fe3ba5df1fcf6093612423797e76fbb24890dd22f/httpx-0.13.3-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 3.3MB/s 
Collecting hstspreload
[?25l  Downloading https://files.pythonhosted.org/packages/dd/50/606213e12fb49c5eb667df0936223dcaf461f94e215ea60244b2b1e9b039/hstspreload-2020.12.22-py3-none-any.whl (994kB)
[K     |████████████████████████████████| 1.0MB 7.2MB/s 
Collecting sniffio
  Downloading https://files.pythonhosted.org/packages/52/b0/7b2e028b63d092804b6794595871f936aafa5e9322dcaaad50ebf67445b3/sniffio-1.

True

In [None]:
def augment_text(df,samples=1250):
    t = Wordnet()
    new_text_majority=[]
    
    ##selecting the majority class samples
    df_n=df[df.defaulted==1].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['en_clean']
            augmented_text = t.augment(text)
            new_text_majority.append(augmented_text)
    
    ## dataframe
    new=pd.DataFrame({'en_clean':new_text_majority,'defaulted':1})
    df=shuffle(df.append(new).reset_index(drop=True))


    new_text_minority=[]

    ##selecting the minority class samples
    df_n=df[df.defaulted==0].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['en_clean']
            augmented_text = t.augment(text)
            new_text_minority.append(augmented_text)
    

    ## dataframe
    new=pd.DataFrame({'en_clean':new_text_minority,'defaulted':0})
    df=shuffle(df.append(new).reset_index(drop=True))
    return df

In [None]:
df = augment_text(df)

100%|██████████| 1250/1250 [00:12<00:00, 96.76it/s] 
100%|██████████| 1250/1250 [00:09<00:00, 130.68it/s]


In [None]:
df['defaulted'].value_counts()

0    4346
1    4283
Name: defaulted, dtype: int64

In [None]:
# See if there are null values
df.isnull().sum()

loan_id      2500
en_clean        0
defaulted       0
dtype: int64

In [None]:
# Drop the NaN values
df.dropna(inplace=True)
df.isnull().sum()

loan_id      0
en_clean     0
defaulted    0
dtype: int64

Add Sentiment Label Score Feature using Vander package

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True



In [None]:
a = 'This was a good movie.'
sid.polarity_scores(a)


{'compound': 0.4404, 'neg': 0.0, 'neu': 0.508, 'pos': 0.492}

In [None]:
df['scores'] = df['en_clean'].apply(lambda review: sid.polarity_scores(review))


In [None]:
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 1 if c >=0 else 0)

df.head(20)

Unnamed: 0,loan_id,en_clean,defaulted,scores,compound,comp_score
773,1254.0,I am a married father of 4. I am the main brea...,1,"{'neg': 0.029, 'neu': 0.845, 'pos': 0.126, 'co...",0.6908,1
5163,2874.0,Graciela is an experienced entrepreneur and mo...,1,"{'neg': 0.0, 'neu': 0.821, 'pos': 0.179, 'comp...",0.9517,1
928,3894.0,Don Walter has successfully repaid his first l...,0,"{'neg': 0.023, 'neu': 0.85, 'pos': 0.127, 'com...",0.9786,1
3921,5988.0,Jos has only worked in agriculture; he has had...,0,"{'neg': 0.0, 'neu': 0.846, 'pos': 0.154, 'comp...",0.9817,1
1505,1555.0,Judith Wathithi Mikori is 37 years old and mar...,1,"{'neg': 0.0, 'neu': 0.94, 'pos': 0.06, 'compou...",0.7783,1
736,518.0,Lilian is a young mother who was married off a...,1,"{'neg': 0.0, 'neu': 0.914, 'pos': 0.086, 'comp...",0.5423,1
2482,4783.0,Rhoda is married with two children. Both have ...,0,"{'neg': 0.0, 'neu': 0.829, 'pos': 0.171, 'comp...",0.9538,1
2998,7095.0,Margarita was born and raised in the Ochocient...,0,"{'neg': 0.0, 'neu': 0.943, 'pos': 0.057, 'comp...",0.6486,1
7131,6275.0,Elba is 59 years old. She has seven children ...,0,"{'neg': 0.029, 'neu': 0.922, 'pos': 0.049, 'co...",0.3612,1
3549,6329.0,"Sonia lives in Batey, the migrant camps which ...",0,"{'neg': 0.0, 'neu': 0.915, 'pos': 0.085, 'comp...",0.8555,1


In [None]:
import spacy
from functools import partial
def spacy_tokenize(text, nlp):
    return [x.orth_ for x in nlp(text)]

nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
tok = partial(spacy_tokenize, nlp=nlp)

In [None]:
#funtion to get 'top N' or 'bottom N' words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def get_n_words(corpus, direction, n):
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    if direction == "top":
        words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    else:
        words_freq =sorted(words_freq, key = lambda x: x[1], reverse=False)
    return words_freq[:n]

In [None]:
#10 most common and 10 most rare words
common_words = get_n_words(df['en_clean'], "top", 15)
rare_words = get_n_words(df['en_clean'], "bottom", 800)
#common_words, rare_words

In [None]:
#Removing common and rare words
#%time df['en_clean'] = df['en_clean'].map(lambda x : ' '.join([w for w in x.split() if w not in common_words]))
%time df['en_clean'] = df['en_clean'].map(lambda x : ' '.join([w for w in x.split() if w not in rare_words]))

CPU times: user 15.1 s, sys: 56 ms, total: 15.2 s
Wall time: 15.1 s


In [None]:
df.head(10)

Unnamed: 0,loan_id,en_clean,defaulted,scores,compound,comp_score
773,1254.0,I am a married father of 4. I am the main brea...,1,"{'neg': 0.029, 'neu': 0.845, 'pos': 0.126, 'co...",0.6908,1
5163,2874.0,Graciela is an experienced entrepreneur and mo...,1,"{'neg': 0.0, 'neu': 0.821, 'pos': 0.179, 'comp...",0.9517,1
928,3894.0,Don Walter has successfully repaid his first l...,0,"{'neg': 0.023, 'neu': 0.85, 'pos': 0.127, 'com...",0.9786,1
3921,5988.0,Jos has only worked in agriculture; he has had...,0,"{'neg': 0.0, 'neu': 0.846, 'pos': 0.154, 'comp...",0.9817,1
1505,1555.0,Judith Wathithi Mikori is 37 years old and mar...,1,"{'neg': 0.0, 'neu': 0.94, 'pos': 0.06, 'compou...",0.7783,1
736,518.0,Lilian is a young mother who was married off a...,1,"{'neg': 0.0, 'neu': 0.914, 'pos': 0.086, 'comp...",0.5423,1
2482,4783.0,Rhoda is married with two children. Both have ...,0,"{'neg': 0.0, 'neu': 0.829, 'pos': 0.171, 'comp...",0.9538,1
2998,7095.0,Margarita was born and raised in the Ochocient...,0,"{'neg': 0.0, 'neu': 0.943, 'pos': 0.057, 'comp...",0.6486,1
7131,6275.0,Elba is 59 years old. She has seven children a...,0,"{'neg': 0.029, 'neu': 0.922, 'pos': 0.049, 'co...",0.3612,1
3549,6329.0,"Sonia lives in Batey, the migrant camps which ...",0,"{'neg': 0.0, 'neu': 0.915, 'pos': 0.085, 'comp...",0.8555,1


# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

X = df['en_clean']
y = df['defaulted']

# So that we can evaluate how well our model is performing, we split our training data
# into training and validation.

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [None]:
X

773     I am a married father of 4. I am the main brea...
5163    Graciela is an experienced entrepreneur and mo...
928     Don Walter has successfully repaid his first l...
3921    Jos has only worked in agriculture; he has had...
1505    Judith Wathithi Mikori is 37 years old and mar...
                              ...                        
3623    Alexandra is 24 years old. She lives with her ...
1558    Natalia is thirty-nine years old and a single ...
5016    Flora Muthoni is 43 years old, married, with 6...
4581    Luisa Guillermina was born and raised in her s...
3374    This group hopes to use their first microfinan...
Name: en_clean, Length: 6129, dtype: object

# Feature Engineering and Extraction Pipeline

In [None]:
import re
import unidecode

# A nice preprocessing function that we can pass to CountVectorizer/TfidfVectorizer
def my_preprocess(doc):

    # Lowercase everything
    res = doc.lower()
    
    # Remove any "weird" characters
    res = unidecode.unidecode(res)

    #print(res)

    # TODO: What else?
    SYM_REPLACE = re.compile('[/(){}\[\]\|@,;]')
    REM_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    res = SYM_REPLACE.sub(' ', res) 
    res = REM_SYMBOLS_RE.sub('', res) 

    # Create Lemmatizer object
    wnl = WordNetLemmatizer()
    list2 = nltk.word_tokenize(res)
    res = ' '.join([wnl.lemmatize(words) for words in list2])

    # Remove stopwords
    all_stopwords = nlp.Defaults.stop_words
    text_tokens = word_tokenize(res)
    tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
    res = (" ").join(tokens_without_sw)

    return res

In [None]:
# These functions will calculate additional features on the document.
# They will be put into the Pipeline, called via the FunctionTransformer() function.
# Each one takes an entire corpus (as a list of documents), and should return
# an array of feature values (one for each document in the corpus).
# These functions can do anything they want; I've made most of them quick
# one-liners Hopefully the names of the functions will make them self explanitory.

def doc_length(corpus):
    #print(corpus)
    return np.array([len(doc) for doc in corpus]).reshape(-1, 1)

def num_exclamation_marks(corpus):
    return np.array([doc.count('!') for doc in corpus]).reshape(-1, 1)

def count_loan(corpus):
    return np.array([doc.count('loan') for doc in corpus]).reshape(-1, 1)

def generate_sentiment_score(corpus):
  score_dict = [sid.polarity_scores(doc) for doc in corpus]
  score = []
  for key in score_dict:
    #print(key['compound'])
    score.append(key['compound'])

    # if key['compound'] >= 0:
    #   score.append(1)
    # else:
    #   score.append(1)
  return np.array(score).reshape(-1, 1)


In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#0.86 - F1 macro using random forest
vectorizer = CountVectorizer(preprocessor=my_preprocess,  tokenizer=tok,min_df=2, 
                             max_df=0.5, max_features=1000,
                             stop_words='english', ngram_range=(1, 2))
# vectorizer = CountVectorizer(preprocessor=my_preprocess, min_df=2, 
#                              max_df=0.5, max_features=1000,
#                              ngram_range=(1, 2))
# This vectorizer will be used to create the BOW features.
# vectorizer = TfidfVectorizer(preprocessor=my_preprocess, 
#                              tokenizer=tok,
#                              max_features = 1000, 
#                              use_idf=True,
#                              stop_words='english',
#                              min_df=2, max_df=10, ngram_range=[1,2])

rf = RandomForestClassifier(criterion='entropy', random_state=1)

# We will "union" together the BOW features and the custom-created features we
# created in the cell above.
feature_processing =  FeatureUnion([ 
    ('bow', Pipeline([('vectorizer', vectorizer), ])),
    ('doc_length', FunctionTransformer(doc_length, validate=False)),
    ('num_exclamation_marks', FunctionTransformer(num_exclamation_marks, validate=False)),
    ('sentiment_score', FunctionTransformer(generate_sentiment_score, validate=False)),  
   # ('count_loan', FunctionTransformer(count_loan, validate=False)),  
])

pipe = Pipeline([('features', feature_processing), ('clf', rf)])

# Model Training/Tuning/Cross Validation


In [None]:
from sklearn.model_selection import GridSearchCV

# The names of the hypter parameters may look a bit funny; it's based on how they
# are added to the Pipeline object above (and seperated with double underscores)
# param_grid = {
#     'features__bow__vectorizer__max_features': [1000,1200, 1300, 1500],
#     'features__bow__vectorizer__use_idf': [True, False],
#     'clf__n_estimators': [10, 100,200],
#     'features__doc_length' : [True, False],
#     'features__num_exclamation_marks' : [True, False],
#     'features__sentiment_score' : [True, False],

# }

param_grid = {
    'features__bow__vectorizer__max_features': [500, 1000,1150,1200, 1300,2000,2500,3000,3500,4000,5000,6000,7000],
    'clf__n_estimators': [10, 100, 150, 200, 225, 250, 300, 350, 400, 450, 500],
    'features__bow__vectorizer__ngram_range': [(1, 2)],
}


#cv = StratifiedKFold(n_splits=8)
search = GridSearchCV(pipe, 
                      param_grid, 
                      cv=5, 
                      n_jobs=5, 
                      scoring='f1_macro', 
                      return_train_score=True, 
                      verbose=2)

search = search.fit(X_train, y_train)

Fitting 5 folds for each of 143 candidates, totalling 715 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  4.6min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 20.8min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed: 49.7min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed: 95.0min
[Parallel(n_jobs=5)]: Done 715 out of 715 | elapsed: 108.1min finished
  'stop_words.' % sorted(inconsistent))


In [None]:
print("Best parameter (CV score: %0.5f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score: 0.85470):
{'clf__n_estimators': 500, 'features__bow__vectorizer__max_features': 1000, 'features__bow__vectorizer__ngram_range': (1, 2)}


In [None]:
# param_grid = {
#     'features__bow__vectorizer__max_features': [500, 1000,1200,1500,2000],
#     'features__bow__vectorizer__use_idf': [True, False],
#     'clf__n_estimators': [100,200],
# }

param_grid = {
    'features__bow__vectorizer__max_features': [500, 1000,1150,1200, 1300,2000,2500,3000,3500,4000,5000,6000,7000],
    'clf__n_estimators': [10, 100, 150, 200, 225, 250, 300, 350, 400, 450, 500],
    'features__bow__vectorizer__ngram_range': [(1, 2)],
}

cv = StratifiedKFold(n_splits=8)
search = RandomizedSearchCV(pipe, param_distributions=param_grid, cv=cv, n_jobs=5,  scoring='f1_macro', return_train_score=True, verbose=8)
search = search.fit(X_train, y_train)

Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  2.1min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  5.6min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed: 10.6min
[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed: 13.1min finished
  'stop_words.' % sorted(inconsistent))


In [None]:
print("Best parameter (CV score: %0.5f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score: 0.85291):
{'features__bow__vectorizer__ngram_range': (1, 2), 'features__bow__vectorizer__max_features': 3000, 'clf__n_estimators': 400}


In [None]:
# Print out the results of hyperparmater tuning

def cv_results_to_df(cv_results):
    results = pd.DataFrame(list(cv_results['params']))
    results['mean_fit_time'] = cv_results['mean_fit_time']
    results['mean_score_time'] = cv_results['mean_score_time']
    results['mean_train_score'] = cv_results['mean_train_score']
    results['std_train_score'] = cv_results['std_train_score']
    results['mean_test_score'] = cv_results['mean_test_score']
    results['std_test_score'] = cv_results['std_test_score']
    results['rank_test_score'] = cv_results['rank_test_score']

    results = results.sort_values(['mean_test_score'], ascending=False)
    return results

results = cv_results_to_df(search.cv_results_)
results

Unnamed: 0,features__bow__vectorizer__ngram_range,features__bow__vectorizer__max_features,clf__n_estimators,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
4,"(1, 2)",3000,400,28.705133,2.256831,1.0,0.0,0.852909,0.012274,1
8,"(1, 2)",4000,300,25.884035,2.228614,1.0,0.0,0.852904,0.01277,2
6,"(1, 2)",1150,450,30.20605,2.240036,1.0,0.0,0.850738,0.011573,3
9,"(1, 2)",4000,100,19.716557,2.166978,1.0,0.0,0.85027,0.011007,4
7,"(1, 2)",2000,150,21.355717,2.194135,1.0,0.0,0.849423,0.010132,5
2,"(1, 2)",1200,150,21.327046,2.185,1.0,0.0,0.84916,0.009431,6
5,"(1, 2)",1300,225,23.432089,2.191255,1.0,0.0,0.848181,0.013668,7
0,"(1, 2)",1150,300,25.640726,2.216026,1.0,0.0,0.848084,0.01024,8
1,"(1, 2)",7000,100,19.800804,2.203108,1.0,0.0,0.84714,0.009028,9
3,"(1, 2)",7000,200,23.174191,2.227544,1.0,0.0,0.845568,0.011103,10


# Model Assessment

In [None]:
y_val_pred = search.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(confusion_matrix(y_true = y_val, y_pred = y_val_pred))

class_names = [str(x) for x in search.best_estimator_.classes_]
print(classification_report(y_true = y_val, y_pred = y_val_pred, target_names=class_names))

[[702  81]
 [134 616]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       783
           1       0.88      0.82      0.85       750

    accuracy                           0.86      1533
   macro avg       0.86      0.86      0.86      1533
weighted avg       0.86      0.86      0.86      1533



# Kaggle Predictions

In [None]:
# Read in the unlabeled testing data (for the Kaggle competition)
df_test = pd.read_csv("https://drive.google.com/uc?export=download&id=1EVWfyqQOd_W2uTKrr4JTD2iFrEZHoOHT")

In [None]:
# Use our pipeline to make predictions; then output predictions to a CSV file.

pred_test = search.predict(df_test['en_clean'])
my_submission = pd.DataFrame({'id': df_test['loan_id'], 'predicted': pred_test})
my_submission.head()

# This command will save the file to the local cloud instance; it will be deleted
# as soon as this Notebooks session ends.
my_submission.to_csv('my_submission.csv', index=False)

Unnamed: 0,id,predicted
0,6607,0
1,154,1
2,7402,0
3,2617,1
4,6464,0


In [None]:
# Download predictions file to your local computer

from google.colab import files
files.download('my_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(datetime.datetime.now())

2021-04-11 09:18:56.541397


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.activity.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fexperimentsandconfigs%20https%3a%2f%2fwww.googleapis.com%2fauth%2fphotos.native&response_type=code

Enter your authorization code:
4/1AY0e-g7111wVtOvWqBBefOIjGo3qsoSZ98c233AuGVQ4AApkq42Zx8nSAcw
Mounted at /content/drive
