In [1]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from pandas import Series, DataFrame, read_csv, concat
from catboost import Pool, CatBoostClassifier
from numpy import array, ndarray
from typing import Tuple, List, Dict, Union, Optional

In [2]:
def get_vin(vin: str) -> str:
    return vin[0:3]


def get_vehicle_attrs(vin: str) -> str:
    return vin[3:10]


def vectorize_text(vectorizer: Union[CountVectorizer,HashingVectorizer,TfidfVectorizer],
                   X_train: Series,
                   X_valid: Series) -> Tuple[ndarray]:
    X_train_featurized = vectorizer.fit_transform(X_train)
    X_valid_featurized = vectorizer.transform(X_valid)
    return (X_train_featurized, X_valid_featurized)


def create_pools(X_train: ndarray,
                 y_train: array,
                 X_valid: ndarray,
                 y_valid: array) -> Tuple[Pool]:
    train = Pool(data=X_train, label=y_train)
    valid = Pool(data=X_valid, label=y_valid)
    return (train, valid)


def train_clf(train: Pool,
              learning_rate: float,
              depth: int,
              iterations: int,
              class_names: Optional[array] = None,
              valid: Optional[Pool] = None) -> CatBoostClassifier:
    inner_model = CatBoostClassifier(objective="MultiClass",
                                    learning_rate=learning_rate,
                                    depth=depth,
                                    iterations=iterations,
                                    random_seed=1,
                                    eval_metric="Accuracy",
                                    use_best_model=False,
                                    logging_level="Silent",
                                    class_names=class_names,
                                    task_type="GPU",
                                    devices="0:1")

    inner_model.fit(train, eval_set=valid)
    return inner_model


def nested_cv(X: Series,
              y: Series,
              class_names: array,
              vectorizers: List[Union[CountVectorizer,HashingVectorizer,TfidfVectorizer]],
              grid: Dict,
              n_splits: int = 5,
              shuffle: bool = True,
              random_state: int = 1) -> DataFrame:
    # TODO: Use hyperopt for param tuning
    cv_outer = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    cv_inner = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    outer_result = []
    outer_fold_idx = 1

    # Outer loop
    for outer_train_idx, outer_valid_idx in cv_outer.split(X, y):
        X_outer_train, X_outer_valid = X.iloc[outer_train_idx], X.iloc[outer_valid_idx]
        y_outer_train, y_outer_valid = y.iloc[outer_train_idx], y.iloc[outer_valid_idx]

        inner_fold_idx = 1
        all_inner_fold_results = []

        print(f"Running {outer_fold_idx} outer fold... \n ")

        # Inner loop
        for inner_train_idx, inner_valid_idx in cv_inner.split(X_outer_train, y_outer_train):
            X_inner_train, X_inner_valid = X.iloc[inner_train_idx], X.iloc[inner_valid_idx]
            y_inner_train, y_inner_valid = y.iloc[inner_train_idx], y.iloc[inner_valid_idx]

            inner_result = []

            print(f"Running {inner_fold_idx} inner fold... \n ")

            # Text-to-features loop
            for inner_vectorizer in vectorizers.keys():
                X_inner_train_featurized, X_inner_valid_featurized = vectorize_text(vectorizer=vectorizers[inner_vectorizer],
                                                                                    X_train=X_inner_train,
                                                                                    X_valid=X_inner_valid)

                # Hyperparam tuning loop
                for learning_rate in grid["learning_rate"]:
                    for depth in grid["depth"]:
                        for iterations in grid["iterations"]:
                            train_inner_pool, valid_inner_pool = create_pools(X_train=X_inner_train_featurized,
                                                                              y_train=y_inner_train,
                                                                              X_valid=X_inner_valid_featurized,
                                                                              y_valid=y_inner_valid)

                            inner_model = train_clf(train=train_inner_pool,
                                                    valid=valid_inner_pool,
                                                    learning_rate=learning_rate,
                                                    depth=depth,
                                                    iterations=iterations,
                                                    class_names=class_names)

                            inner_metrics = inner_model.eval_metrics(data=valid_inner_pool,
                                                                    metrics=["Accuracy"],
                                                                    ntree_start=iterations-1)

                            print(f"""Text to features method - {inner_vectorizer} | learning rate - {learning_rate} | depth - {depth} | iterations - {iterations} | Accuracy - {inner_metrics["Accuracy"][0]} \n\n""")

                            inner_result.append({"outer-fold": outer_fold_idx,
                                                "inner-fold": inner_fold_idx,
                                                "vectorizer": inner_vectorizer,
                                                "learning_rate": learning_rate,
                                                "depth": depth,
                                                "iterations": iterations,
                                                "accuracy": inner_metrics["Accuracy"][0]})
            inner_fold_idx += 1
            inner_result_df = DataFrame(inner_result)
            all_inner_fold_results.append(inner_result_df)

        all_inner_fold_results_df = concat(all_inner_fold_results)
        all_inner_fold_results_agg_df = (all_inner_fold_results_df
                                        .groupby(["vectorizer", "learning_rate", "depth", "iterations"])["accuracy"]
                                        .agg("mean")
                                        .reset_index()
                                        .rename(columns={"accuracy": "mean_accuracy"}))

        best_inner_params = all_inner_fold_results_agg_df[all_inner_fold_results_agg_df["mean_accuracy"] == all_inner_fold_results_agg_df["mean_accuracy"].max()].to_dict("records")[0]

        X_outer_train_featurized, X_outer_valid_featurized = vectorize_text(vectorizer=vectorizers[best_inner_params["vectorizer"]],
                                                                            X_train=X_outer_train,
                                                                            X_valid=X_outer_valid)

        train_outer_pool, valid_outer_pool = create_pools(X_train=X_outer_train_featurized,
                                                          y_train=y_outer_train,
                                                          X_valid=X_outer_valid_featurized,
                                                          y_valid=y_outer_valid)

        outer_model = train_clf(train=train_outer_pool,
                                valid=valid_outer_pool,
                                learning_rate=best_inner_params["learning_rate"],
                                depth=best_inner_params["depth"],
                                iterations=best_inner_params["iterations"],
                                class_names=class_names)

        # Outer model evaluation
        outer_metrics = outer_model.eval_metrics(data=valid_outer_pool,
                                                 metrics=["Accuracy"],
                                                 ntree_start=best_inner_params["iterations"] - 1)

        outer_result.append({"outer-fold": outer_fold_idx,
                             "vectorizer": best_inner_params["vectorizer"],
                             "learning_rate": best_inner_params["learning_rate"],
                             "depth": best_inner_params["depth"],
                             "iterations": best_inner_params["iterations"],
                             "accuracy": outer_metrics["Accuracy"][0]})

        outer_fold_idx += 1

    # Generalization performance estimation
    outer_folds_df = DataFrame(outer_result)
    return outer_folds_df

In [3]:
df = read_csv("../data/label_preprocessing/vin_model_pairs_w_labels.csv")
df

Unnamed: 0,vin,model
0,3MW5R1J0XM8CXXXXX,3 Series
1,4USBT33443LRXXXXX,Z Series
2,5UXCR4C06M9FXXXXX,X Series
3,5UXCY6C04P9PXXXXX,X Series
4,5UXCY6C04P9PXXXXX,X Series
...,...,...
1339,WUAZZZF38N19XXXXX,Q3
1340,WUAZZZF5XJA9XXXXX,A5
1341,WUAZZZF5XJA9XXXXX,A5
1342,WUAZZZFX9J79XXXXX,R8


According to [some article](https://www.autocheck.com/vehiclehistory/images/vin-decode.jpg) as well as [Wikipedia](https://en.wikipedia.org/wiki/Vehicle_identification_number#Components), most VINs have 2 parts that shouldn"t change despite the fact that though most manufacturers don"t follow the standard.

* Characters 1-3 capture WIM ([World Manufacturer identifier](https://en.wikipedia.org/wiki/Vehicle_identification_number#World_manufacturer_identifier))
* Characters 4-9 should encapsulate vehicle attributes

Everything else is a serial number of the vehicle or just irrelevant to the vehicle model classification problem. Based on this information, let"s build features below.

In [4]:
df["wmi"] = df["vin"].apply(get_vin)
df["vehicle_attrs"] = df["vin"].apply(get_vehicle_attrs)
df

Unnamed: 0,vin,model,wmi,vehicle_attrs
0,3MW5R1J0XM8CXXXXX,3 Series,3MW,5R1J0XM
1,4USBT33443LRXXXXX,Z Series,4US,BT33443
2,5UXCR4C06M9FXXXXX,X Series,5UX,CR4C06M
3,5UXCY6C04P9PXXXXX,X Series,5UX,CY6C04P
4,5UXCY6C04P9PXXXXX,X Series,5UX,CY6C04P
...,...,...,...,...
1339,WUAZZZF38N19XXXXX,Q3,WUA,ZZZF38N
1340,WUAZZZF5XJA9XXXXX,A5,WUA,ZZZF5XJ
1341,WUAZZZF5XJA9XXXXX,A5,WUA,ZZZF5XJ
1342,WUAZZZFX9J79XXXXX,R8,WUA,ZZZFX9J


Very few unique categories, hence we can skip sophisticated feature engineering methods.

In [5]:
df["wmi"].unique()

array(['3MW', '4US', '5UX', '5YM', 'TRU', 'WA1', 'WAU', 'WAV', 'WB1',
       'WBA', 'WBS', 'WBX', 'WBY', 'WUA', 'X4X'], dtype=object)

Methods considered:
* One hot encoding (won't try)
* Count Vectorizer (won't try)
* Ngrams (need to try this out)
* Hash Vectorizer (need to try this out)
* TF-IDF (need to try this out)

Rationale:

* One Hot Encoding doesn't take into consideration frequence of token occurence as well as preceding and following tokens. The latter should be presumably important given the nature of the task
* Count Vectorizer - don't consider preceding and following tokens. As it was mentioned, this should be very important for the VIN decoder task
* Ngrams - takes into account unigrams, bigrams, trigrams, hence worth looking into
* Hash Vectorizer - same as with Ngrams, but it takes on step further and applies hashing trick on top to reduce dimensionality
* TF-IDF - lacks preceeding and succeeding token awareness

In [6]:
corpus = df["vehicle_attrs"]
wmis = df["wmi"]
targets = df["model"]

In [7]:
class_names = targets.unique()
# vectorizers = {"Bigrams": CountVectorizer(ngram_range=(2,2), analyzer="char"),
#                "Hash vectorizer": HashingVectorizer(ngram_range=(2,2), analyzer="char"),
#                "TF-IDF": TfidfVectorizer(ngram_range=(2,2), analyzer="char")}
# grid = {"learning_rate": [0.1, 0.15],
#         "depth": [8, 10],
#         "iterations": [700, 900]}

vectorizers = {"Bigrams": CountVectorizer(ngram_range=(2,2), analyzer="char")}
grid = {"learning_rate": [0.15],
        "depth": [8, 10],
        "iterations": [700]}

outer_folds_df = nested_cv(X=corpus, y=targets, class_names=class_names, vectorizers=vectorizers, grid=grid, n_splits=3)
display(outer_folds_df)



Running 1 outer fold... 
 
Running 1 inner fold... 
 


Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8662207357859532 




Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.862876254180602 


Running 2 inner fold... 
 


Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8561872909698997 




Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8528428093645485 


Running 3 inner fold... 
 


Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8053691275167785 




Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.
Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.
Found only 25 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8120805369127517 




Found only 25 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 15 are not present in the train set. Perhaps, something is wrong with the data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Running 2 outer fold... 
 
Running 1 inner fold... 
 


Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.862876254180602 




Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8595317725752508 


Running 2 inner fold... 
 


Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8762541806020067 




Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8795986622073578 


Running 3 inner fold... 
 


Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.
Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8825503355704698 




Found only 22 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23, 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8758389261744967 


Running 3 outer fold... 
 
Running 1 inner fold... 
 


Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8494983277591973 




Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 24 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8494983277591973 


Running 2 inner fold... 
 


Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8561872909698997 




Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23 are not present in the train set. Perhaps, something is wrong with the data.
Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 23 are not present in the train set. Perhaps, something is wrong with the data.
Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8595317725752508 


Running 3 inner fold... 
 


Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 15 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 8 | iterations - 700 | Accuracy - 0.8389261744966443 




Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 15 are not present in the train set. Perhaps, something is wrong with the data.
Found only 23 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 15 are not present in the train set. Perhaps, something is wrong with the data.
Found only 25 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 24 are not present in the train set. Perhaps, something is wrong with the data.


Text to features method - Bigrams | learning rate - 0.15 | depth - 10 | iterations - 700 | Accuracy - 0.8389261744966443 




Found only 25 unique classes in the data, but have defined 26 classes. Probably something is wrong with data.
Label(s) 24 are not present in the train set. Perhaps, something is wrong with the data.


Unnamed: 0,outer-fold,vectorizer,learning_rate,depth,iterations,accuracy
0,1,Bigrams,0.15,10,700,0.825893
1,2,Bigrams,0.15,8,700,0.808036
2,3,Bigrams,0.15,10,700,0.837054


In [8]:
print(f"Generalized accuracy - {outer_folds_df['accuracy'].values.mean()} +/- {outer_folds_df['accuracy'].values.std()} \n ")

Generalized accuracy - 0.8236607142857143 +/- 0.011951174407893587 
 


In [9]:
hyperparam_voting_df = (outer_folds_df
                        .groupby(["vectorizer", "learning_rate", "depth", "iterations"])["outer-fold"]
                        .agg("count")
                        .reset_index()
                        .rename(columns={"outer-fold": "count"}))
display(hyperparam_voting_df)

Unnamed: 0,vectorizer,learning_rate,depth,iterations,count
0,Bigrams,0.15,8,700,1
1,Bigrams,0.15,10,700,2


In [10]:
best_params = hyperparam_voting_df[hyperparam_voting_df["count"] == hyperparam_voting_df["count"].max()].drop(columns=["count"]).to_dict("records")[0]
best_params

{'vectorizer': 'Bigrams',
 'learning_rate': 0.15,
 'depth': 10,
 'iterations': 700}

In [11]:
vectorizer = vectorizers[best_params.pop("vectorizer")]
vectorizer

In [12]:
corpus_featurized = vectorizer.fit_transform(corpus)

train = Pool(data=corpus_featurized, label=targets)
train_clf(train=train, **best_params)


<catboost.core.CatBoostClassifier at 0x7f8fe56046a0>