In [1]:
import pandas as pd

# Load the two CSV inputs of the notebook into `train` and `test`
# (same files, same order as before).
train, test = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test_nolabel.csv')
)

## Exploración y limpieza del dataset

Empezamos haciendo una exploración del dataset. Para ello hemos probado la librería `dtale`.

In [3]:
# import dtale
# dtale.show(train, open_browser=True)

In [4]:
# Columns removed from the feature matrix before fitting and predicting
# (used later as `train.drop(drop + ['Accept'], ...)` and `test.drop(drop, ...)`).
# NOTE(review): 'LoanNr_ChkDgt' and 'id' look like identifiers; 'State' is
# dropped without being encoded -- confirm this is intentional.
drop = ['LoanNr_ChkDgt', 'id', 'State']

Tenemos que conseguir que todas las variables sean numéricas: `int` o `float`. Además, agrupamos datos y tratamos de corregir los datos incorrectos.

In [5]:
# --- Clean the training frame: make every raw column numeric or datetime ---

# NOTE(review): '%y' is a two-digit year, so any year before 1969 would be
# parsed into the 2000s -- confirm the data contains no such dates.
train['ApprovalDate'] = pd.to_datetime(train['ApprovalDate'], format='%d-%b-%y')

# Missing NewExist values are encoded as 0.
train['NewExist'] = train['NewExist'].fillna(0).astype(int)

# Binary franchise flag: codes 0 and 1 collapse to 0, anything else (including
# missing/unparseable values) becomes 1.
# The previous version cast the column to str and then tested membership in
# the *integer* set {0, 1}; that never matched, so every row was set to 1.
# NOTE(review): presumably 0/1 mean "no franchise" -- confirm with the data dictionary.
franchise_code = pd.to_numeric(train['FranchiseCode'], errors='coerce')
train['FranchiseCode'] = (~franchise_code.isin([0, 1])).astype(int)

# Y -> 1, N -> 0, any other value (including NaN) -> 2.
yes_no_codes = {'Y': 1, 'N': 0}
train['RevLineCr'] = train['RevLineCr'].map(yes_no_codes).fillna(2).astype(int)

train['LowDoc'] = train['LowDoc'].map(yes_no_codes).fillna(2).astype(int)

train['DisbursementDate'] = pd.to_datetime(train['DisbursementDate'], format='%d-%b-%y')

# Strip the currency symbol and thousands separators before casting to float.
train[['DisbursementGross', 'BalanceGross']] = train[['DisbursementGross', 'BalanceGross']].replace({r'\$': '', ',': ''}, regex=True).astype(float)

Lo mismo con el test

In [6]:
# --- Clean the test frame with the same rules used for the training frame ---

# NOTE(review): '%y' is a two-digit year, so any year before 1969 would be
# parsed into the 2000s -- confirm the data contains no such dates.
test['ApprovalDate'] = pd.to_datetime(test['ApprovalDate'], format='%d-%b-%y')

# Missing NewExist values are encoded as 0.
test['NewExist'] = test['NewExist'].fillna(0).astype(int)

# Binary franchise flag: codes 0 and 1 collapse to 0, anything else (including
# missing/unparseable values) becomes 1.
# The previous version cast the column to str and then tested membership in
# the *integer* set {0, 1}; that never matched, so every row was set to 1.
# NOTE(review): presumably 0/1 mean "no franchise" -- confirm with the data dictionary.
franchise_code_test = pd.to_numeric(test['FranchiseCode'], errors='coerce')
test['FranchiseCode'] = (~franchise_code_test.isin([0, 1])).astype(int)

# Y -> 1, N -> 0, any other value (including NaN) -> 2.
yes_no_codes = {'Y': 1, 'N': 0}
test['RevLineCr'] = test['RevLineCr'].map(yes_no_codes).fillna(2).astype(int)

test['LowDoc'] = test['LowDoc'].map(yes_no_codes).fillna(2).astype(int)

test['DisbursementDate'] = pd.to_datetime(test['DisbursementDate'], format='%d-%b-%y')

# Strip the currency symbol and thousands separators before casting to float.
test[['DisbursementGross', 'BalanceGross']] = test[['DisbursementGross', 'BalanceGross']].replace({r'\$': '', ',': ''}, regex=True).astype(float)

Nuestro modelo difícilmente aprenderá directamente de las fechas. Transformaremos esta información en: año, trimestre y días transcurridos entre la aprobación y el desembolso.

In [7]:
# Replace the two datetime columns by derived numeric features:
# year and quarter of each date, plus the number of days between
# approval and disbursement. The raw date columns are dropped afterwards.
train = train.assign(
    ApprovalYear=lambda df: df['ApprovalDate'].dt.year,
    ApprovalQuarter=lambda df: df['ApprovalDate'].dt.quarter,
    DisbursementYear=lambda df: df['DisbursementDate'].dt.year,
    DisbursementQuarter=lambda df: df['DisbursementDate'].dt.quarter,
    DaysToDisbursement=lambda df: (df['DisbursementDate'] - df['ApprovalDate']).dt.days,
).drop(columns=['ApprovalDate', 'DisbursementDate'])

# Same derivation for the test frame.
test = test.assign(
    ApprovalYear=lambda df: df['ApprovalDate'].dt.year,
    ApprovalQuarter=lambda df: df['ApprovalDate'].dt.quarter,
    DisbursementYear=lambda df: df['DisbursementDate'].dt.year,
    DisbursementQuarter=lambda df: df['DisbursementDate'].dt.quarter,
    DaysToDisbursement=lambda df: (df['DisbursementDate'] - df['ApprovalDate']).dt.days,
).drop(columns=['ApprovalDate', 'DisbursementDate'])

Vamos, también, a intentar sacar la información importante de los nombres.

In [8]:
import re
from sklearn.preprocessing import LabelEncoder
# Frequency of each distinct company name in the training set.
# NOTE(review): not referenced again in the visible cells -- confirm it is still needed.
name_counts = train['Name'].value_counts()

def categorize_company(name):
    """Classify a business name into a coarse organisation category.

    The name is upper-cased and stripped, then tested against a fixed list of
    keyword patterns (whole-word matches). The first pattern that matches wins.

    Parameters
    ----------
    name : object
        Raw company name; may be missing (NaN / None).

    Returns
    -------
    str
        One of "Corporation", "Limited", "NonProfit", "Government" or "Other".
        Missing names and names without any known keyword return "Other".
    """
    # Test for missing values *before* converting to str: once converted, NaN
    # is the literal text "NAN" and the guard could never fire.
    if pd.api.types.is_scalar(name) and pd.isna(name):
        return "Other"
    name = str(name).upper().strip()

    # Order matters: earlier categories take precedence over later ones.
    keyword_patterns = (
        (r'\b(CORP(ORATION)?|INC(ORPORATED)?|CO|COMPANY)\b', "Corporation"),
        (r'\b(L\.?L\.?C\.?|LIMITED|LTD|L\.?T\.?D\.?)\b', "Limited"),
        (r'\b(CHURCH|FOUNDATION|ASSOCIATION|NONPROFIT|CLUB)\b', "NonProfit"),
        (r'\b(CITY|COUNTY|STATE|SCHOOL|UNIVERSITY|GOV(ERNMENT)?|BOARD)\b', "Government"),
    )
    for pattern, category in keyword_patterns:
        if re.search(pattern, name):
            return category
    return "Other"
    
def refine_corporation(name):
    """Return which corporate keyword appears in a company name.

    Matching is done on whole words, consistent with `categorize_company`.
    The previous substring checks misclassified names: "COMPANY" was
    unreachable because "CO" matched first, "INCORPORATED" was reported as
    "CORP", and words such as "PRINCE" triggered "INC".

    Parameters
    ----------
    name : object
        Raw company name; it is converted to str and upper-cased.

    Returns
    -------
    str
        "CORP" (CORP / CORPORATION), "INC" (INC / INCORPORATED), "COMPANY",
        "CO", or "Other" when none of the keywords is present as a word.
    """
    name = str(name).upper()
    # Checked in priority order; COMPANY precedes CO so the longer word wins.
    keyword_patterns = (
        (r'\bCORP(ORATION)?\b', "CORP"),
        (r'\bINC(ORPORATED)?\b', "INC"),
        (r'\bCOMPANY\b', "COMPANY"),
        (r'\bCO\b', "CO"),
    )
    for pattern, label in keyword_patterns:
        if re.search(pattern, name):
            return label
    return "Other"
    
def refine_limited(name):
    """Return which limited-liability keyword appears in a company name.

    Matching is done on whole words and accepts the dotted spellings
    ("L.L.C.", "L.T.D.") that `categorize_company` also recognises. The
    previous substring checks sent dotted spellings to "Other" and matched
    "LLC" inside unrelated words (e.g. "WELLCARE").

    Parameters
    ----------
    name : object
        Raw company name; it is converted to str and upper-cased.

    Returns
    -------
    str
        "LLC", "LTD", "LIMITED", or "Other" when no keyword is present.
    """
    name = str(name).upper()
    # Checked in priority order: LLC, then LTD, then LIMITED.
    keyword_patterns = (
        (r'\bL\.?L\.?C\b', "LLC"),
        (r'\b(LTD|L\.?T\.?D)\b', "LTD"),
        (r'\bLIMITED\b', "LIMITED"),
    )
    for pattern, label in keyword_patterns:
        if re.search(pattern, name):
            return label
    return "Other"

# Derive three categorical features from the company name, for both frames:
#   CompanyType - coarse category returned by categorize_company
#   CorpType    - corporate keyword, or 'Not_Corp' for non-Corporation rows
#   LtdType     - limited-liability keyword, or 'Not_Ltd' for non-Limited rows
for frame in (train, test):
    frame['CompanyType'] = frame['Name'].map(categorize_company)

    is_corporation = frame['CompanyType'].eq('Corporation')
    frame['CorpType'] = frame['Name'].map(refine_corporation).where(is_corporation, 'Not_Corp')

    is_limited = frame['CompanyType'].eq('Limited')
    frame['LtdType'] = frame['Name'].map(refine_limited).where(is_limited, 'Not_Ltd')

# Integer-encode the company-type features with ONE encoder per column shared
# by train and test. Fitting a separate LabelEncoder on each frame (as before)
# assigns codes from each frame's own sorted set of labels, so a category
# missing from one frame shifts the codes and the same label can end up with
# different integers in train and test.
for encoded_column in ('CompanyType', 'CorpType', 'LtdType'):
    # Fit on the labels of both frames so every value can be transformed.
    shared_encoder = LabelEncoder().fit(
        pd.concat([train[encoded_column], test[encoded_column]], ignore_index=True)
    )
    train[encoded_column] = shared_encoder.transform(train[encoded_column])
    test[encoded_column] = shared_encoder.transform(test[encoded_column])

# train = train.drop(columns='Name')
# test = test.drop(columns='Name')

Agrupamos también las ciudades y estados:

In [9]:
from sklearn.preprocessing import LabelEncoder

# Group infrequent cities / bank states and integer-encode both columns.
#
# Two defects of the previous version are fixed here:
#   * only values that were rare *in train* were replaced, so test values never
#     seen in train kept their own label instead of falling into OTHER_*;
#   * train and test were encoded with independently fitted LabelEncoders, so
#     the same city/state could get different integer codes in each frame.
contador_ciudades = train['City'].value_counts()
contador_bank_states = train['BankState'].value_counts()
umbral = 60  # minimum number of training rows for a value to keep its own label

# Rare values according to the training counts (kept for reference/inspection).
otras = contador_ciudades[contador_ciudades < umbral].index
otras_bank_states = contador_bank_states[contador_bank_states < umbral].index

for columna, contador, etiqueta_otras in (
    ('City', contador_ciudades, 'OTHER_CITY'),
    ('BankState', contador_bank_states, 'OTHER_BANKSTATE'),
):
    frecuentes = contador[contador >= umbral].index
    for frame in (train, test):
        # Keep frequent values; missing values are left untouched, as before.
        conservar = frame[columna].isin(frecuentes) | frame[columna].isna()
        frame[columna] = frame[columna].where(conservar, etiqueta_otras)

    # One encoder per column, fitted on the labels present in either frame.
    codificador = LabelEncoder().fit(
        pd.concat([train[columna], test[columna]], ignore_index=True)
    )
    train[columna] = codificador.transform(train[columna])
    test[columna] = codificador.transform(test[columna])

In [10]:
# train.dtypes
# train.isna().sum()
# test.dtypes
# test.isna().sum()

## Idea

In [11]:
# Cluster free-text columns ('Name' and 'Bank') via sentence embeddings:
# each text is embedded with a SentenceTransformer model, KMeans is fitted on
# the training embeddings, and the cluster id is stored as a new feature.
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import os
# NOTE(review): presumably set to silence/avoid tokenizers parallelism issues -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

n_clusters = 10
# A single KMeans object is reused for both columns (see the re-fit below).
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
model_sbert = SentenceTransformer('all-MiniLM-L6-v2')

# --- Company name: normalise text, embed, fit clusters on train ---
train['Name'] = train['Name'].fillna('').str.upper().str.strip()
X_embed = model_sbert.encode(train['Name'].tolist(), show_progress_bar=True)
train['Cluster'] = kmeans.fit_predict(X_embed)

# Test names are assigned to the clusters learned from the training names.
# This must run before `kmeans` is re-fitted on the bank embeddings.
test['Name'] = test['Name'].fillna('').str.upper().str.strip()
X_embed_test = model_sbert.encode(test['Name'].tolist(), show_progress_bar=True)
test['Cluster'] = kmeans.predict(X_embed_test)

# --- Bank name: same procedure; this call re-fits the same `kmeans` object,
# so from here on it holds the Bank clustering, not the Name clustering ---
train['Bank'] = train['Bank'].fillna('').str.upper().str.strip()
X_embed_bank = model_sbert.encode(train['Bank'].tolist(), show_progress_bar=True)
train['Cluster-Bank'] = kmeans.fit_predict(X_embed_bank)

test['Bank'] = test['Bank'].fillna('').str.upper().str.strip()
X_embed_bank_test = model_sbert.encode(test['Bank'].tolist(), show_progress_bar=True)
test['Cluster-Bank'] = kmeans.predict(X_embed_bank_test)

# The raw text columns are no longer needed once the cluster ids exist.
train = train.drop(columns=['Name', 'Bank'])
test = test.drop(columns=['Name', 'Bank'])

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 714/714 [00:07<00:00, 97.17it/s] 
Batches: 100%|██████████| 103/103 [00:01<00:00, 100.64it/s]
Batches: 100%|██████████| 714/714 [00:06<00:00, 106.61it/s]
Batches: 100%|██████████| 103/103 [00:00<00:00, 114.11it/s]


## Árbol

In [12]:
# Target vector and feature matrix: the features are every training column
# except the ones listed in `drop` and the target itself.
y = train['Accept']
X = train.drop(columns=drop + ['Accept'])

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

# Hyper-parameters of the decision tree, gathered in one place.
tree_params = {
    'max_depth': 10,
    'min_samples_split': 20,
    'max_features': 'sqrt',
    'min_samples_leaf': 5,
    'class_weight': 'balanced',
    'random_state': 42,
}
model = DecisionTreeClassifier(**tree_params)

# 7-fold cross-validation scored with F1; the fitted estimator of every fold
# is kept (return_estimator=True) so it can be reused afterwards.
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=7,
    scoring='f1',
    n_jobs=-1,
    return_train_score=False,
    return_estimator=True,
)

In [None]:
# predictions_best_model = models[ np.argmax(f1_scores) ].predict(test.drop(drop, axis=1))
# submission = pd.DataFrame({
#     'id': test['id'],
#     'Accept': predictions_best_model
# })
# submission.to_csv('tree_best_model.csv', index=False)

In [15]:
import numpy as np
# Majority-vote ensemble of the estimators fitted during cross-validation.
models = cv_results['estimator']

# Build the test feature matrix once instead of once per estimator.
test_features = test.drop(columns=drop)

# Shape: (n_estimators, n_test_rows); each row holds one estimator's predictions.
predictions_ensemble = np.array([fold_model.predict(test_features) for fold_model in models])

# A row is labelled 1 when strictly more than half of the estimators vote 1
# (a tie therefore resolves to 0). Assumes the predicted labels are 0/1.
votes_needed = predictions_ensemble.shape[0] / 2
final_preds = (predictions_ensemble.sum(axis=0) > votes_needed).astype(int).tolist()

submission = pd.DataFrame({
    'id': test['id'],
    'Accept': final_preds
})
submission.to_csv('tree_ensemble.csv', index=False)