# NTT-DATA Merit Prize

In [1]:
from tfidf_extractor import load_and_vectorize, write_to_file
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import os
import pandas as pd


In [2]:
# Seed shared by every stochastic component so that results are reproducible.
RANDOM_STATE = 42

# Data directories, relative to the notebook's working directory.
RAW = "../data/raw/"
PROCESSED = "../data/processed/"

## Preprocessing and TF-IDF encoding

In [3]:
# Dataset splits, in the order every downstream cell indexes them:
# 0 = train, 1 = test, 2 = validation.
split_names = ("train", "test", "validation")

# Raw input CSVs, one per split.
raw_files = [f"{RAW}{split}.csv" for split in split_names]

# Feature CSVs produced by preprocessing, one per split.
processed_files = [f"{PROCESSED}{split}_features.csv" for split in split_names]

In [4]:
# Rebuild the processed feature files only when at least one of them is absent
# from disk; otherwise the existing files are reused untouched.
if any(not os.path.exists(path) for path in processed_files):
    raw_train, raw_test, raw_val = raw_files
    out_train, out_test, out_val = processed_files

    # NOTE(review): the output paths are passed here AND the splits are written
    # again below - confirm whether load_and_vectorize already persists them.
    X_train, X_test, X_val, y_train, y_test, y_val = load_and_vectorize(
        train_path=raw_train,
        test_path=raw_test,
        validation_path=raw_val,
        output_train_path=out_train,
        output_test_path=out_test,
        output_validation_path=out_val,
    )

    # Write each split's features and labels to its processed path
    # (train, then test, then validation).
    for features, labels, destination in (
        (X_train, y_train, out_train),
        (X_test, y_test, out_test),
        (X_val, y_val, out_val),
    ):
        write_to_file(features, labels, destination)

In [5]:
# Read the three processed feature files (train, test, validation - in that
# order) into DataFrames.
df_train, df_test, df_val = (
    pd.read_csv(processed_files[position]) for position in range(3)
)

### K-fold cross validation

In [6]:
# Split the training frame into the target column and the remaining
# feature columns used for cross-validation.
y = df_train['Label']
X = df_train.drop('Label', axis=1)

# Shuffled K-fold splitter; the fixed seed keeps the folds identical across runs.
n_splits = 5
kf = KFold(n_splits, shuffle=True, random_state=RANDOM_STATE)

In [7]:
# Show the dimensions of the feature matrix as (rows, columns).
X.shape

(50587, 5000)

In [8]:
# Metrics reported for every cross-validation run. Each entry maps a display
# name to the scikit-learn scorer string of the same name.
scoring = {
    metric_name: metric_name
    for metric_name in ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')
}

### Decision Tree
To extract the best performance out of the decision tree model we are going to tune the following hyperparameters:
* `min_samples_split` - minimum number of samples required to split an internal node.
* `max_depth` - maximum depth of the tree.

In [None]:
# Candidate hyperparameters: 6 depths x 4 split thresholds = 24 combinations,
# each evaluated on every fold produced by `kf`.
param_grid = dict(
    max_depth=[5, 10, 20, 50, 100, None],
    min_samples_split=[2, 5, 10, 20],
)

decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Every metric in `scoring` is recorded; refit='accuracy' makes accuracy the
# metric that selects the best combination, which is then refit on all of X, y.
grid = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring=scoring,
    refit='accuracy',
    cv=kf,
    n_jobs=-1,  # use all available CPU cores
    return_train_score=False,
)

grid.fit(X, y)

In [None]:
# Print the cross-validated mean of every metric for each hyperparameter
# combination that was searched.
#
# GridSearchCV stores multi-metric results in cv_results_ under
# "mean_test_<metric>" (alongside std_/rank_/split<k>_ variants). The bare
# "test_<metric>" key is the cross_validate() format and does not exist here,
# so looking it up raises KeyError.
results = grid.cv_results_
for i, params in enumerate(results['params']):
    line = f"DecisionTree(max_depth={params['max_depth']}, min_samples_split={params['min_samples_split']})"
    metrics = " | ".join(
        f"{metric}={results[f'mean_test_{metric}'][i]:.3f}" for metric in scoring
    )
    print(line, "|", metrics)

# best_params_ / best_score_ refer to the metric named in refit ('accuracy').
print("Best params (by accuracy):", grid.best_params_)
print("Best accuracy:", grid.best_score_)