# Modelling with class-weight penalties plus feature selection

In [46]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pygam import LogisticGAM, s, f
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the raw dataset from the 'data_full' sheet of the source workbook.
df = pd.read_excel(
    '../data/Задание (пролонгация).xlsx',
    sheet_name='data_full',
)

# LogReg

## Features and Target

In [3]:
# ID is not needed for modelling, so remove it from the frame.
# The membership check keeps this cell re-runnable: a bare `del` raises
# KeyError on the second execution because the column is already gone.
if 'ID' in df.columns:
    del df['ID']

In [4]:
def _binarize_target(value):
    """Return abs(value) when value equals -1, otherwise 0."""
    if value == -1:
        return abs(value)
    return 0


# Make the target binary: rows labelled -1 become 1, every other label becomes 0.
# NOTE: this overwrites the column in place, so the cell is not idempotent --
# running it a second time maps the freshly created 1s to 0 as well.
df['target'] = df['target'].apply(_binarize_target)

In [6]:
# Separate the label vector from the feature matrix.
y = df['target']
X = df.drop(columns=['target'])

In [7]:
# Load the list of features to keep; its 'feature' column is used further down
# to filter the preprocessed matrix.
# A plain string literal is used because the path has no placeholders to format.
feature_to_save_df = pd.read_csv('../data/features_to_save.csv')

In [8]:
# Group the feature columns by dtype so each group gets its own preprocessing.
# Columns whose dtype is not listed here (e.g. bool or datetime) end up in
# neither group and are therefore left out of the preprocessed matrix.
categorical_cols = X.select_dtypes(
    include=['object', 'category'],
).columns
numerical_cols = X.select_dtypes(
    include=['int64', 'float64'],
).columns

In [9]:
# Standardise the numeric columns and one-hot encode the categorical ones,
# then join both parts column-wise into a single design matrix.
scaler = StandardScaler()
X_numerical_scaled = pd.DataFrame(
    scaler.fit_transform(X[numerical_cols]),
    columns=numerical_cols,
    # fit_transform returns a bare array; without the original index the frame
    # would get a fresh 0..n-1 index and pd.concat(axis=1) below would align
    # rows incorrectly (introducing NaNs) whenever X.index is not 0..n-1.
    index=X.index,
)
X_categorical_encoded = pd.get_dummies(X[categorical_cols])
X_preprocessed = pd.concat([X_numerical_scaled, X_categorical_encoded], axis=1)
# NOTE(review): the scaler is fitted on every row, including rows that later
# land in the test split, so test-set statistics leak into the scaling.
# Fitting on the training rows only would change the recorded results, so it
# is flagged here rather than changed.

In [10]:
# Keep only the columns listed in features_to_save.csv

In [11]:
# Restrict the design matrix to the columns named in the saved feature list.
# .copy() gives an independent frame rather than a view of X_preprocessed.
selected_columns = feature_to_save_df['feature'].tolist()
X_preprocessed1 = X_preprocessed[selected_columns].copy()

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
# NOTE(review): the split is not stratified on y -- consider stratify=y so both
# parts keep the same class ratio (this would change the recorded results).
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed1,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Per-class penalties passed to the classifiers below.
# NOTE(review): the test split holds 1517 class-0 vs 656 class-1 rows (~2.3:1),
# so a 20x penalty far exceeds the imbalance; the recorded reports show almost
# every row being predicted as class 1.
class_weights = {
    0: 1,   # class 0: baseline weight
    1: 20,  # class 1: weighted 20 times heavier
}

## Fit

In [33]:
# Logistic regression whose loss is re-weighted per class via class_weights.
# The fit call is left as the last expression so the notebook still displays
# the fitted estimator.
model = LogisticRegression(
    class_weight=class_weights,
)
model.fit(X=X_train, y=y_train)

## Predict

In [34]:
# Hard class predictions of the logistic regression on the held-out rows.
y_pred = model.predict(X=X_test)

## Evaluation

In [35]:
# Print the per-class metrics followed by the confusion matrix
# (rows are true labels, columns are predicted labels).
for title, metric in (
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(title, metric(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.00      0.01      1517
           1       0.30      1.00      0.46       656

    accuracy                           0.30      2173
   macro avg       0.46      0.50      0.23      2173
weighted avg       0.53      0.30      0.14      2173

Confusion Matrix:
 [[   5 1512]
 [   3  653]]


In [36]:
# Inspect the class labels stored on the fitted model (displayed as the cell output).
model.classes_

array([0, 1])

In [38]:
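# It's hard to push class-1 precision above 0.30: the model predicts almost every
# row as class 1, so precision collapses to the class-1 share of the test set (656 / 2173 ≈ 0.30).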

# Decision Tree

## Fit

In [39]:
# Decision tree trained with the same per-class penalties as the logistic
# regression above.
# random_state is pinned (same seed as the train/test split) because the tree
# shuffles features at every split and breaks ties between equally good splits
# at random; without a seed, repeated runs can produce different trees.
model = DecisionTreeClassifier(class_weight=class_weights, random_state=42)
model.fit(X_train, y_train)

## Predict

In [40]:
# Hard class predictions of the decision tree on the held-out rows.
y_pred = model.predict(X=X_test)

## Evaluation

In [42]:
# Print the per-class metrics followed by the confusion matrix for the tree
# (rows are true labels, columns are predicted labels).
for title, metric in (
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(title, metric(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.01      0.01      1517
           1       0.30      0.99      0.46       656

    accuracy                           0.30      2173
   macro avg       0.50      0.50      0.24      2173
weighted avg       0.57      0.30      0.15      2173

Confusion Matrix:
 [[   9 1508]
 [   4  652]]


# GAM

## Sample Weighting

In [43]:
# Per-row weights for the GAM: 1 for rows labelled 0, 10 for every other row.
# NOTE(review): the heavier weight is 10 here, whereas class_weights uses 20
# for the other two models -- confirm the difference is intentional.
weights = np.where(y_train == 0, 1, 10)

In [44]:
# Fit a logistic GAM with default terms, using the per-row weights.
# Original author's note on this line was "11:10" -- presumably the wall-clock
# fit time (mm:ss); confirm.
gam = LogisticGAM().fit(
    X_train,
    y_train,
    weights=weights,
)

  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  self.link.gradient(mu, self.distribution) ** 2
  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)


## Evaluation

In [47]:
# Score the GAM on the held-out rows: overall accuracy, then per-class metrics.
y_pred = gam.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print(report)

Accuracy: 0.3041877588587207
              precision    recall  f1-score   support

           0       0.69      0.01      0.01      1517
           1       0.30      0.99      0.46       656

    accuracy                           0.30      2173
   macro avg       0.50      0.50      0.24      2173
weighted avg       0.57      0.30      0.15      2173



In [48]:
# Similar outcome to the class-weighted models above: almost every row is predicted as class 1.