In [23]:
import pandas as pd
import numpy as np
from pygam import LogisticGAM, s, f
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [2]:
# Load the two feature-importance tables (semicolon-separated CSV files).
fi_log_reg_df, fi_tree_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('feature_importance_log_reg.csv', 'feature_importance_tree.csv')
)

In [3]:
# Rename the 'index' column (where present) to 'feature' in both tables.
fi_log_reg_df = fi_log_reg_df.rename({'index': 'feature'}, axis='columns')
fi_tree_df = fi_tree_df.rename({'index': 'feature'}, axis='columns')

# Select all features with absolute importance higher than 0.5

In [8]:
# Collect the names of features whose 'Abs_Importance' exceeds 0.5 in each table.
fi_log_reg_list = fi_log_reg_df['feature'][fi_log_reg_df['Abs_Importance'] > 0.5].tolist()
fi_tree_list = fi_tree_df['feature'][fi_tree_df['Abs_Importance'] > 0.5].tolist()

In [10]:
# Number of features from the log-reg importance table that pass the threshold.
len(fi_log_reg_list)

45

In [11]:
# Number of features from the tree importance table that pass the threshold.
len(fi_tree_list)

45

In [12]:
# Features selected from the log-reg table but missing from the tree selection.
set(fi_log_reg_list) - set(fi_tree_list)

set()

In [13]:
# Features selected from the tree table but missing from the log-reg selection.
# NOTE(review): two different models yielding exactly the same shortlist is unusual;
# verify the two CSV files are not copies of the same table — TODO confirm.
set(fi_tree_list) - set(fi_log_reg_list)  # feature importance lists are identical

set()

In [14]:
# Count distinct names to check that the shortlist contains no duplicates.
len(set(fi_tree_list))

45

In [15]:
# Wrap the shortlist in a single-column frame so it can be written to disk.
feature_to_save_df = pd.DataFrame({'feature': fi_tree_list})

In [16]:
# Persist the shortlist without the row index.
feature_to_save_df.to_csv('../data/features_to_save.csv', index=False)

In [4]:
# Reload the saved shortlist so the modelling part can start from the file on disk.
feature_to_save_df = pd.read_csv('../data/features_to_save.csv')

In [5]:
# Inspect the reloaded shortlist.
feature_to_save_df

Unnamed: 0,feature
0,Model_GRAND SANTA FE
1,Model_3
2,passport_region_Томская область
3,passport_region_Алтайский край
4,passport_region_н.д.
5,passport_region_Республика Хакасия
6,Model_PAJERO
7,Model_DISCOVERY
8,Model_TIIDA
9,passport_region_Кировская область


# GAM with a short list of features

In [6]:
# Load the 'data_full' sheet of the task workbook.
df = pd.read_excel(
    '../data/Задание (пролонгация).xlsx',
    sheet_name='data_full',
)

In [7]:
# Binarise the label: a raw value of -1 becomes 1, every other value becomes 0.
# NOTE: this overwrites df['target'] in place, so running the cell a second time
# maps every row to 0 — reload df before re-running.
df['target'] = df['target'].map(lambda raw: 0 if raw != -1 else abs(raw))

In [8]:
# Separate the label vector from the feature matrix.
y = df['target']
X = df.drop(columns='target')

In [9]:
# Partition the feature columns by dtype. Columns of any other dtype
# (e.g. bool or datetime) fall into neither group.
categorical_cols = X.select_dtypes(
    include=['object', 'category'],
).columns
numerical_cols = X.select_dtypes(
    include=['int64', 'float64'],
).columns

In [10]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the scaling; consider fitting it on
# the training rows only.
scaler = StandardScaler()
# fit_transform returns a bare array; reuse X's row labels so the frame does not
# silently get a fresh 0..n-1 index.
X_numerical_scaled = pd.DataFrame(scaler.fit_transform(X[numerical_cols]),
                                  columns=numerical_cols,
                                  index=X.index)
X_categorical_encoded = pd.get_dummies(X[categorical_cols])
# Column-wise concat aligns on the row index; both parts share X.index.
X_preprocessed = pd.concat([X_numerical_scaled, X_categorical_encoded], axis=1)

In [13]:
# Keep only the shortlisted feature columns

In [16]:
# Restrict the design matrix to the shortlisted columns; .copy() yields an
# independent frame.
X_preprocessed1 = X_preprocessed[feature_to_save_df['feature'].tolist()].copy()

In [17]:
# Inspect the filtered design matrix.
X_preprocessed1

Unnamed: 0,Model_GRAND SANTA FE,Model_3,passport_region_Томская область,passport_region_Алтайский край,passport_region_н.д.,passport_region_Республика Хакасия,Model_PAJERO,Model_DISCOVERY,Model_TIIDA,passport_region_Кировская область,...,passport_region_Чувашская Республика,Model_L-200,passport_region_Ульяновская область,Model_GLS,Model_EXPLORER,passport_region_Республика Мордовия,Model_RANGE ROVER SPORT,Model_6-СЕРИЯ,passport_region_Оренбургская область,Model_2111
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10858,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10859,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10860,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10861,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed1,
    y,
    test_size=0.2,
    random_state=42,
)

### Fit

In [19]:
# Fit a logistic GAM with the library's default term specification.
# NOTE(review): the summary output lists spline terms (s(0), ...) although the
# shortlisted columns are one-hot indicators; check whether factor or linear
# terms would be a better fit — TODO confirm.
# The trailing "8:57" is presumably the fit's duration or a clock time — confirm.
gam = LogisticGAM().fit(X_train, y_train)  # 8:57

In [20]:
# Print the fitted model's summary table (per-term lambda, rank, EDoF, p-values).
gam.summary()

LogisticGAM                                                                                               
Distribution:                      BinomialDist Effective DoF:                                     45.0941
Link Function:                        LogitLink Log Likelihood:                                 -5252.4549
Number of Samples:                         8690 AIC:                                             10595.098
                                                AICc:                                           10595.6004
                                                UBRE:                                               3.2234
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.0109
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [0.

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  gam.summary()


## Evaluation

In [22]:
# Predict classes for the held-out rows, then report accuracy and per-class metrics.
y_pred = gam.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.6921306948918545
              precision    recall  f1-score   support

           0       0.70      0.98      0.82      1517
           1       0.30      0.02      0.03       656

    accuracy                           0.69      2173
   macro avg       0.50      0.50      0.42      2173
weighted avg       0.58      0.69      0.58      2173



Precision: Only 30% of the instances predicted as class 1 are actually class 1.
Recall: The model correctly identifies only 2% of all actual class 1 instances, which is very low.
F1-score: An extremely low F1-score of 3% for class 1, indicating poor performance in predicting the positive class.

So predicting class 1 is almost impossible with this model.

## Feature Importance

In [29]:
def extract_feature_importance(gam, feature_names, width=0.95):
    """Rank features by the average width of their partial-dependence confidence band.

    For every position ``i`` in ``feature_names`` the model's term ``i`` is
    evaluated on the grid returned by ``gam.generate_X_grid(term=i)`` and the
    mean distance between the upper and lower confidence bounds is recorded.

    Note that this score measures how wide (uncertain) the confidence band of a
    term is; it is not the size of the term's effect.

    Parameters
    ----------
    gam : object
        Fitted model exposing ``generate_X_grid(term=...)`` and
        ``partial_dependence(term=..., X=..., width=...)``; the latter must
        return ``(partial_dependence, confidence_intervals)`` where the
        intervals are a 2-D array with the lower bound in column 0 and the
        upper bound in column 1.
    feature_names : iterable
        Feature names; the position of a name is used as the term index.
    width : float, default 0.95
        Confidence level passed to ``partial_dependence``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'`` and ``'Avg Conf Int Range'``, sorted by the
        latter in descending order. The index keeps each feature's original
        position. An empty ``feature_names`` yields an empty frame with the
        same two columns.
    """
    columns = ['Feature', 'Avg Conf Int Range']
    rows = []
    for term_idx, feature_name in enumerate(feature_names):
        grid = gam.generate_X_grid(term=term_idx)
        # Only the confidence bounds are needed; the point estimate is discarded.
        _, conf_band = gam.partial_dependence(term=term_idx, X=grid, width=width)
        band_width = conf_band[:, 1] - conf_band[:, 0]
        rows.append({'Feature': feature_name,
                     'Avg Conf Int Range': np.mean(band_width)})

    # Explicit columns keep sort_values working when no rows were collected.
    feature_importance_df = pd.DataFrame(rows, columns=columns)
    return feature_importance_df.sort_values(by='Avg Conf Int Range', ascending=False)

In [30]:
# The training frame's column order supplies the name for each term index.
feature_names = X_train.columns
feature_importance_df = extract_feature_importance(gam=gam, feature_names=feature_names)

In [31]:
# Inspect the ranking (widest average confidence band first).
feature_importance_df

Unnamed: 0,Feature,Avg Conf Int Range
44,Model_2111,1410.32556
42,Model_6-СЕРИЯ,1410.325555
36,Model_L-200,1410.325552
5,passport_region_Республика Хакасия,1381.163643
41,Model_RANGE ROVER SPORT,1233.074908
31,Model_OUTBACK,1221.34719
22,passport_region_Республика Тыва,1221.347185
25,Model_MULTIVAN,1221.347182
17,Model_VITO,1221.347179
28,passport_region_Тюменская область,1213.216588


In [32]:
# TODO: try class-weighted models to address the poor recall on class 1?