# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import os

import pandas as pd
from joblib import dump
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm

## 1. Preprocessing pipeline

Create three custom transformers, the first two of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [2]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that swaps the `timestamp` column for `hour` and `dayofweek`."""

    def fit(self, X, y=None):
        """Learn nothing; exists so the class can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return a new frame with `hour` and `dayofweek` derived from `timestamp`.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a `timestamp` column that `pd.to_datetime` can parse.

        Returns
        -------
        pandas.DataFrame
            A new frame without `timestamp`, with `hour` and `dayofweek`
            added (pandas numbers weekdays from Monday=0). `X` itself is
            left untouched.
        """
        moments = pd.to_datetime(X['timestamp'])
        return (
            X.drop(columns='timestamp')
            .assign(hour=moments.dt.hour, dayofweek=moments.dt.dayofweek)
        )

In [3]:
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical feature columns of a frame, leaving the target alone.

    Parameters
    ----------
    df : str or object, default=None
        First positional argument, kept for backward compatibility. The
        notebook builds this step as ``MyOneHotEncoder('dayofweek')``, i.e. the
        target name arrives in this slot, so a string value is used as the
        target column name. Any non-string value is stored but ignored.
    target_name : str, default='dayofweek'
        Name of the target column; used when `df` is not a string.

    Attributes
    ----------
    cat_cols_ : list of str
        Categorical feature columns found by `fit`.
    encoder_ : OneHotEncoder or None
        Encoder fitted on `cat_cols_` (None when there are no such columns).
    """

    # dtype names treated as categorical; 'string'/'str' cover pandas' dedicated string dtypes.
    _CATEGORICAL_DTYPES = ('object', 'category', 'bool', 'string', 'str')

    def __init__(self, df=None, target_name='dayofweek'):
        # Every __init__ argument is stored under its own name so that
        # BaseEstimator.get_params()/clone()/repr work on this estimator.
        self.df = df
        self.target_name = target_name

    def _resolve_target(self):
        """Return the target column name, honouring a name passed positionally."""
        return self.df if isinstance(self.df, str) else self.target_name

    def _fit_encoder(self, X):
        """Find the categorical feature columns of `X` and fit an encoder on them."""
        target = self._resolve_target()
        cat_cols = [
            col for col in X.columns
            if X[col].dtype.name in self._CATEGORICAL_DTYPES and col != target
        ]
        if not cat_cols:
            return cat_cols, None
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoder.fit(X[cat_cols])
        return cat_cols, encoder

    def fit(self, X, y=None):
        """Remember the categorical columns of `X` and the categories they contain."""
        self.cat_cols_, self.encoder_ = self._fit_encoder(X)
        return self

    def transform(self, X):
        """Replace the categorical feature columns of `X` with one-hot columns.

        After `fit`, the columns and categories learned there are reused, so
        frames transformed later get the same output columns (categories not
        seen during `fit` are encoded as all zeros). When called on an
        instance that was never fitted, the encoder is fitted on `X` itself
        for this call only, which is what earlier versions always did.

        Returns
        -------
        pandas.DataFrame
            A new frame: the non-categorical columns of `X` (the target column
            included, unchanged) followed by the one-hot columns. `X` is not
            modified.
        """
        if hasattr(self, 'cat_cols_'):
            cat_cols, encoder = self.cat_cols_, self.encoder_
        else:
            cat_cols, encoder = self._fit_encoder(X)

        df = X.copy()
        if not cat_cols:
            return df

        encoded_df = pd.DataFrame(
            encoder.transform(df[cat_cols]),
            columns=encoder.get_feature_names_out(cat_cols),
            index=df.index,  # keep rows aligned with the original frame
        )
        # The target column is never in cat_cols, so it survives the drop as-is.
        return pd.concat([df.drop(columns=cat_cols), encoded_df], axis=1)

In [4]:
class TrainValidationTest(BaseEstimator, TransformerMixin):
    """Split features and target into stratified train / validation / test parts.

    Parameters
    ----------
    test_size : float or int, default=0.2
        Passed to `train_test_split` for both splits: first to carve the test
        set out of the full data, then to carve the validation set out of the
        remaining training data (with 0.2 that is 16% of the full data).
    random_state : int, default=21
        Seed passed to both `train_test_split` calls.
    """

    def __init__(self, test_size=0.2, random_state=21):
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y=None):
        """Learn nothing; present for interface compatibility."""
        return self

    def transform(self, X, y):
        """Return `X_train, X_valid, X_test, y_train, y_valid, y_test`.

        Both splits are stratified on the target and use the `test_size` and
        `random_state` given to the constructor.
        """
        split_options = {'test_size': self.test_size, 'random_state': self.random_state}
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, **split_options
        )
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, stratify=y_train, **split_options
        )
        return X_train, X_valid, X_test, y_train, y_valid, y_test
    

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict whose keys are the indexes into that list and whose values are the names of the models. An example is shown below in reverse order (from the high-level to the low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - While iterating through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`; at the end of the loop, print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [5]:
class ModelSelection:
    """Fit several grid searches and pick the model that does best on a validation set.

    Parameters
    ----------
    grids : list
        Grid-search objects (e.g. `GridSearchCV`); each must provide `fit`
        and, once fitted, `cv_results_`, `best_estimator_`, `best_params_`
        and `best_score_`.
    grid_dict : dict
        Maps the position of each entry of `grids` to a display name.

    Attributes
    ----------
    results : list of dict
        One record per grid from the most recent `choose()` call, with the
        keys `model`, `params`, `train_score` and `valid_score`.
    """

    # Columns exposed by best_results(), in display order.
    _RESULT_COLUMNS = ['model', 'params', 'valid_score']

    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        self.results = []

    def choose(self, X_train, y_train, X_valid, y_valid):
        """Fit every grid on the training data and compare them on the validation data.

        Prints a short report per grid and returns the name of the grid whose
        best estimator scores highest on `(X_valid, y_valid)`; ties go to the
        earlier grid. Returns an empty string when `grids` is empty.
        """
        # Start from a clean slate so that calling choose() again does not
        # append duplicate rows to the summary.
        self.results = []
        best_score = -1
        best_model_name = ""

        for idx, gs in enumerate(self.grids):
            model_name = self.grid_dict[idx]
            print(f"\nEstimator: {model_name}")
            gs.fit(X_train, y_train)

            # The search has already finished at this point, so the bar only
            # reports how many parameter combinations were evaluated.
            n_candidates = len(gs.cv_results_['params'])
            tqdm_bar = tqdm(total=n_candidates)
            tqdm_bar.update(n_candidates)
            tqdm_bar.close()

            valid_score = gs.best_estimator_.score(X_valid, y_valid)
            self.results.append({
                'model': model_name,
                'params': gs.best_params_,
                'train_score': gs.best_score_,
                'valid_score': valid_score
            })
            print(f"Best params: {gs.best_params_}")
            print(f"Best training accuracy: {gs.best_score_:.5f}")
            print(f"Validation set accuracy score for best params: {valid_score:.5f}\n")

            if valid_score > best_score:
                best_score = valid_score
                best_model_name = model_name

        print(f"\nClassifier with best validation set accuracy: {best_model_name}")
        return best_model_name

    def best_results(self):
        """Return a frame with the columns `model`, `params` and `valid_score`.

        There is one row per grid from the last `choose()` call. Before any
        call the frame is empty but still has the three columns.
        """
        return pd.DataFrame(self.results, columns=self._RESULT_COLUMNS)

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [6]:
class Finalize:
    """Fit the chosen estimator, report its test accuracy and persist it.

    Parameters
    ----------
    estimator : object
        Any object with `fit(X, y)` and `predict(X)`.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def final_score(self, X_train, y_train, X_test, y_test):
        """Fit the estimator on the training data and return its accuracy on the test data.

        The accuracy is also printed. The estimator is fitted in place, so a
        later `save_model` stores the model that produced this score.
        """
        self.estimator.fit(X_train, y_train)
        predictions = self.estimator.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(f"Accuracy of the final model is {accuracy}")
        return accuracy

    def save_model(self, path):
        """Dump the estimator to `path` and print whether that worked.

        When `path` is a filesystem path, missing parent directories are
        created first. Failures are reported with a printed message rather
        than raised, so a failed save does not abort the notebook.
        """
        try:
            # Anything that is not a path (e.g. an open file object) goes to dump() untouched.
            if isinstance(path, (str, os.PathLike)):
                parent = os.path.dirname(path)
                if parent:
                    os.makedirs(parent, exist_ok=True)
            dump(self.estimator, path)
            print('The model was successfully saved')
        except Exception as e:
            print(f"Failed to save model: {e}")

## 4. Main program

1. Load the data from the file `checker_submits.csv`.
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()`, and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [7]:
# Raw checker submissions.
df = pd.read_csv('../../datasets/checker_submits.csv')

# Step 1 derives hour/dayofweek from the timestamp; step 2 one-hot encodes
# the categorical features while leaving the `dayofweek` target as it is.
preprocessing = Pipeline([
    ('feature_extractor', FeatureExtractor()),
    ('onehot_encoder', MyOneHotEncoder('dayofweek')),
])
data = preprocessing.fit_transform(df)
data


Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Separate the target from the feature matrix.
y = data['dayofweek']
X = data.drop(columns='dayofweek')

# Stratified train / validation / test split with the splitter's defaults.
train = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = train.transform(X, y)

X_valid.info()

<class 'pandas.core.frame.DataFrame'>
Index: 270 entries, 1053 to 744
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         270 non-null    int64  
 1   hour              270 non-null    int32  
 2   uid_user_0        270 non-null    float64
 3   uid_user_1        270 non-null    float64
 4   uid_user_10       270 non-null    float64
 5   uid_user_11       270 non-null    float64
 6   uid_user_12       270 non-null    float64
 7   uid_user_13       270 non-null    float64
 8   uid_user_14       270 non-null    float64
 9   uid_user_15       270 non-null    float64
 10  uid_user_16       270 non-null    float64
 11  uid_user_17       270 non-null    float64
 12  uid_user_18       270 non-null    float64
 13  uid_user_19       270 non-null    float64
 14  uid_user_2        270 non-null    float64
 15  uid_user_20       270 non-null    float64
 16  uid_user_21       270 non-null    float64
 17 

In [9]:
# --- SVM search -------------------------------------------------------------
# NOTE(review): gs_svm is built but not part of `grids` below, so it is never
# fitted; add it to `grids` and `grid_dict` (as 'SVC') to include it.
svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
svc = SVC(probability=True, random_state=21)
gs_svm = GridSearchCV(estimator=svc, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=-1)

# --- Decision tree search (49 * 2 * 2 = 196 candidates) ----------------------
tree = DecisionTreeClassifier(random_state=21)
tree_params = {'max_depth': range(1,50),
            'criterion': ['entropy', 'gini'],
            'class_weight': ['balanced', None]}
gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=2, n_jobs=-1)

# --- Random forest search (4 * 49 * 2 * 2 = 784 candidates) ------------------
RandForest = RandomForestClassifier(random_state=21)
rf_params = {'n_estimators': [5, 10, 50, 100],
            'max_depth':range(1,50),
            'class_weight': ['balanced', None],
            'criterion': ['entropy', 'gini'],
            }
# `cv` is left at GridSearchCV's default here (unlike the searches above);
# n_jobs=-1 runs the candidates in parallel like the sibling searches do.
gs_rf = GridSearchCV(estimator=RandForest, param_grid=rf_params, scoring='accuracy', n_jobs=-1)

# Keys of grid_dict are positions in `grids`.
grids = [gs_tree, gs_rf]
grid_dict = {
    0: 'DecisionTreeClassifier',
    1: 'RandomForestClassifier',
}
ModSel = ModelSelection(grids, grid_dict)
best_model_name = ModSel.choose(X_train, y_train, X_valid, y_valid)


Estimator: DecisionTreeClassifier


  0%|          | 0/196 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}
Best training accuracy: 0.80891
Validation set accuracy score for best params: 0.86667


Estimator: RandomForestClassifier


  0%|          | 0/784 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 27, 'n_estimators': 100}
Best training accuracy: 0.89514
Validation set accuracy score for best params: 0.89259


Classifier with best validation set accuracy: RandomForestClassifier


In [10]:
# Summary of the last ModSel.choose() run: one row per model with its best
# parameters and validation score.
best_mod = ModSel.best_results()
best_mod

Unnamed: 0,model,params,valid_score
0,DecisionTreeClassifier,"{'class_weight': 'balanced', 'criterion': 'gin...",0.866667
1,RandomForestClassifier,"{'class_weight': 'balanced', 'criterion': 'ent...",0.892593


In [11]:
# Best hyper-parameters of the winning model, taken from the summary table.
best_params = best_mod.loc[best_mod['model'] == best_model_name, 'params'].iloc[0]

# Display name -> estimator class.
model_classes = {
    "SVC": SVC,
    "DecisionTreeClassifier": DecisionTreeClassifier,
    "RandomForestClassifier": RandomForestClassifier,
}
if best_model_name not in model_classes:
    raise ValueError("unknown model")

# The tree and forest searches set random_state=21 on the estimator rather
# than in the parameter grid, so best_params does not carry it. Re-apply the
# seed here so the final model is reproducible; a random_state that is present
# in best_params (as for the SVC grid) takes precedence.
model = model_classes[best_model_name](**{'random_state': 21, **best_params})

final = Finalize(model)
# final_score() fits the model and returns the test accuracy, so the model is
# fitted only once and the file name carries the same score that was printed.
accuracy = final.final_score(X_train, y_train, X_test, y_test)
final.save_model(f"Models/{best_model_name}_{accuracy}.sav")

Accuracy of the final model is 0.9112426035502958
The model was successfully saved
