In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Read the pre-split training and test sets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Columns excluded from the feature matrices: the ID column and the target
non_feature_columns = ['ID', 'Customer_Churn']

# Target response (y) and features (X) for the training split
y_train = train['Customer_Churn']
X_train = train.drop(columns=non_feature_columns)

# Same split of target and features for the test data
y_test = test['Customer_Churn']
X_test = test.drop(columns=non_feature_columns)

## SelectKBest, with k = 5 features

In [3]:
k = 5  # You can change this value to select a different number of features

# Score every feature against the target with the ANOVA F-test (f_classif)
# and keep the k highest-scoring ones.
selector = SelectKBest(f_classif, k=k)

# Fit once on the training data only, then apply the same column selection to
# the test data. (A single fit is enough: fitting again on the same data with
# a deterministic scorer only repeats the work.)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Boolean mask, aligned with X_train's columns, marking the selected features
selected_features_mask = selector.get_support()

# Names of the selected features
selected_features = X_train.columns[selected_features_mask]

# Print the names of the selected features
print("Selected features:", selected_features)

Selected features: Index(['Age', 'AI_Interaction_Level', 'Satisfaction_with_AI_Services',
       'AI_Personalization_Effectiveness', 'Change_in_Usage_Patterns'],
      dtype='object')


In [4]:
k = 5  # Number of features selected

# Seed for the mutual-information estimate, so the ranking is the same on
# every run of this cell.
MI_RANDOM_STATE = 42


def mutual_info_seeded(X, y):
    """Return mutual_info_classif scores computed with a fixed random_state.

    mutual_info_classif is a randomized estimator; without a seed, two fits on
    the same data can rank features differently and select a different top k.
    """
    return mutual_info_classif(X, y, random_state=MI_RANDOM_STATE)


# Rank features by estimated mutual information with the target and keep the
# k highest-scoring ones.
selector = SelectKBest(mutual_info_seeded, k=k)

# Fit once on the training data only, then apply the same column selection to
# the test data.
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Boolean mask, aligned with X_train's columns, marking the selected features
selected_features_mask = selector.get_support()

# Names of the selected features
selected_features = X_train.columns[selected_features_mask]

# Print the names of the selected features
print("Selected features:", selected_features)

Selected features: Index(['Age', 'AI_Interaction_Level', 'Satisfaction_with_AI_Services',
       'AI_Personalization_Effectiveness', 'Customer_Service_Interactions'],
      dtype='object')


#### 5 "best" features: ['Age', 'AI_Interaction_Level', 'Satisfaction_with_AI_Services', 'AI_Personalization_Effectiveness', 'Change_in_Usage_Patterns'] (ANOVA F-test, `f_classif`) or ['Age', 'AI_Interaction_Level', 'Satisfaction_with_AI_Services', 'AI_Personalization_Effectiveness', 'Customer_Service_Interactions'] (mutual information, `mutual_info_classif`). The two lists differ only in the fifth feature.


## Forward selection, with 5-fold CV to find the optimal number of features

In [5]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score

# Range of subset sizes to try. SequentialFeatureSelector has to leave at
# least one feature out, so the largest size is one less than the column count.
min_features = 1
max_features = X_train.shape[1] - 1  # Maximum is one less than the total number of features

# Fail early with a clear message instead of finishing with best_n_features = 0,
# which would only surface as an error when the selector is refitted later.
if max_features < min_features:
    raise ValueError("Forward selection needs at least 2 feature columns in X_train")

# Best mean CV accuracy seen so far and the subset size that produced it.
# Starting from -inf guarantees the first evaluated size is always recorded.
best_score = -np.inf
best_n_features = 0

# Mean CV accuracy for every subset size tried, kept so the whole curve can be
# inspected afterwards rather than only the winner.
cv_scores_by_n = {}

# NOTE(review): LogisticRegression() runs with its default iteration limit; if
# convergence warnings appear, consider raising max_iter or scaling the features.
model = LogisticRegression()

# Try every subset size in the range
for n_features in range(min_features, max_features + 1):
    # Greedy forward selection of n_features columns; the selector runs its own
    # internal cross-validation (default settings) to pick each added feature.
    selector = SequentialFeatureSelector(model, n_features_to_select=n_features, direction='forward')

    # Fit the selector and keep only the chosen columns
    X_train_selected = selector.fit_transform(X_train, y_train)

    # Score the model on the reduced feature set with 5-fold cross-validation
    scores = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='accuracy')

    # Average accuracy across the folds
    mean_score = np.mean(scores)
    cv_scores_by_n[n_features] = mean_score

    # Strict '>' keeps the smallest subset size when several sizes tie
    if mean_score > best_score:
        best_score = mean_score
        best_n_features = n_features

# Print the best number of features and corresponding score
print(f"Best number of features: {best_n_features}")
print(f"Best score: {best_score}")


Best number of features: 4
Best score: 0.5888


In [6]:
# Run forward selection once more at the winning subset size so the chosen
# columns can be read off the fitted selector.
best_selector = SequentialFeatureSelector(
    model,
    n_features_to_select=best_n_features,
    direction='forward',
).fit(X_train, y_train)

# get_support() returns a boolean mask aligned with X_train's columns;
# indexing the column labels with it yields the selected feature names.
selected_features_mask = best_selector.get_support()
selected_features = X_train.columns[selected_features_mask]

# Show which features were kept
print("Selected features:", selected_features)


Selected features: Index(['Age', 'AI_Interaction_Level', 'AI_Response_Time',
       'Overall_Usage_Frequency'],
      dtype='object')


#### 4 best features (forward selection, mean CV accuracy 0.5888): ['Age', 'AI_Interaction_Level', 'AI_Response_Time', 'Overall_Usage_Frequency']

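FYI: when I tried backward selection (`direction='backward'`) I got a "max iter reached" error. This is presumably `LogisticRegression` hitting its default iteration limit before converging; raising `max_iter` (e.g. `LogisticRegression(max_iter=1000)`) or standardizing the features first should resolve it.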