In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the churn workbook from the Kaggle input directory.
data = pd.read_excel('/kaggle/input/cust-churn-task/customer_churn_large_dataset.xlsx')

# 'Churn' is the prediction target; every remaining column is kept as a feature.
y = data['Churn']
X = data.drop(columns='Churn')

# 80/10/10 partition: hold out 20% of the rows first, then cut that hold-out
# in half to obtain the validation and test subsets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the size of each partition as a sanity check on the split.
for caption, subset in (
    ("Train set shape:", X_train),
    ("Validation set shape:", X_valid),
    ("Test set shape:", X_test),
):
    print(caption, subset.shape)


Train set shape: (80000, 8)
Validation set shape: (10000, 8)
Test set shape: (10000, 8)


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Read the churn workbook from the Kaggle input directory.
data = pd.read_excel('/kaggle/input/cust-churn-task/customer_churn_large_dataset.xlsx')

# Discard the 'CustomerID' and 'Name' columns, then one-hot encode 'Gender'
# and 'Location' (drop_first=True omits the first level of each).
data = pd.get_dummies(
    data.drop(columns=['CustomerID', 'Name']),
    columns=['Gender', 'Location'],
    drop_first=True,
)

# 'Churn' is the target; all other columns are model inputs.
y = data['Churn']
X = data.drop(columns='Churn')

# 80/10/10 partition: 20% hold-out, later halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit a 100-tree random forest on the training partition only.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Pair each training column with the forest's importance score and rank the
# pairs from most to least important.
feature_importances = rf_classifier.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Keep only the leading rows of the ranking.
top_n = 5
top_features = feature_importance_df.iloc[:top_n]

# Show the ranking.
print("Top", top_n, "most important features:")
print(top_features)

Top 5 most important features:
                      Feature  Importance
2                Monthly_Bill    0.317095
3              Total_Usage_GB    0.290777
0                         Age    0.188228
1  Subscription_Length_Months    0.147307
4                 Gender_Male    0.015698


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Load the dataset
data = pd.read_excel('/kaggle/input/cust-churn-task/customer_churn_large_dataset.xlsx')

# Drop the 'CustomerID' and 'Name' columns
data = data.drop(['CustomerID', 'Name'], axis=1)

# Encode categorical variables 'Gender' and 'Location' using one-hot encoding
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)

# Define the features and the target variable ('Churn')
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split the data into training (80%) and test (20%) sets. Model selection
# below uses cross-validation on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers to compare
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Score every candidate with 5-fold cross-validation.
# cross_val_score clones the estimator and fits a fresh copy on each fold, so
# fitting the estimator beforehand has no effect on the scores and is skipped.
results = {}
for classifier_name, classifier in classifiers.items():
    cv_score = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy')

    # Store the mean cross-validation score
    results[classifier_name] = cv_score.mean()

# Print the results
for classifier_name, cv_score in results.items():
    print(f'{classifier_name}: Mean CV Accuracy = {cv_score:.4f}')

# Select the best model based on cross-validation results
best_model_name = max(results, key=results.get)
print(f'\nBest Model: {best_model_name}')

# Fit only the selected model on the full training set so that a trained
# estimator is available for later evaluation on X_test.
best_model = classifiers[best_model_name].fit(X_train, y_train)

Random Forest: Mean CV Accuracy = 0.5001
Logistic Regression: Mean CV Accuracy = 0.5045
Gradient Boosting: Mean CV Accuracy = 0.5026

Best Model: Logistic Regression


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
data = pd.read_excel('/kaggle/input/cust-churn-task/customer_churn_large_dataset.xlsx')

# Drop the 'CustomerID' and 'Name' columns
data = data.drop(['CustomerID', 'Name'], axis=1)

# Encode categorical variables 'Gender' and 'Location' using one-hot encoding
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)

# Define the features and the target variable ('Churn')
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split the data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator whose hyperparameters are tuned
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 2 * 2 * 2 * 2 * 2 = 32 candidate configurations
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
}

# 5-fold grid search scored on accuracy. n_jobs=-1 evaluates candidates in
# parallel on all available cores; results are unchanged because the
# estimator's random_state is fixed.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit the grid search to the training data to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Best hyperparameters found by the search
best_params = grid_search.best_params_

# With the default refit=True, GridSearchCV has already retrained a clone of
# rf_classifier with best_params on all of X_train, so that fitted model is
# reused instead of building and fitting an identical forest a second time.
best_rf_classifier = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test data
accuracy = best_rf_classifier.score(X_test, y_test)

# Print the best hyperparameters and test accuracy
print("Best Hyperparameters:", best_params)
print("Test Accuracy:", accuracy)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# NOTE: this cell expects X and y to exist already (they are created by an
# earlier cell); it only re-derives the 80/20 train/test partition from them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build a 100-tree random forest and fit it on the training partition.
# The name best_rf_classifier is rebound here to this freshly trained model.
best_rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
best_rf_classifier.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = best_rf_classifier.predict(X_test)

# Compute the four scalar metrics from the same predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the metrics in a fixed order: accuracy, precision, recall, F1.
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(caption, value)

# Confusion matrix of true labels versus predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# imblearn's Pipeline applies samplers such as SMOTE during fit only, which is
# what allows resampling to happen inside each cross-validation fold.
from imblearn.pipeline import Pipeline as ImbPipeline

# Load the dataset
data = pd.read_excel('/kaggle/input/cust-churn-task/customer_churn_large_dataset.xlsx')

# Drop the 'CustomerID' and 'Name' columns
data = data.drop(['CustomerID', 'Name'], axis=1)

# Encode categorical variables 'Gender' and 'Location' using one-hot encoding
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)

# Define the features and the target variable ('Churn')
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split the data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE oversampler and the random forest it feeds
smote = SMOTE(sampling_strategy='auto', random_state=42)
rf_classifier = RandomForestClassifier(random_state=42)

# Chain SMOTE and the forest so that oversampling is fitted on the training
# folds only. Resampling the whole training set before cross-validation lets
# synthetic points interpolated from validation-fold rows leak into the
# training folds, which inflates the cross-validated scores.
smote_rf_pipeline = ImbPipeline(steps=[('smote', smote), ('rf', rf_classifier)])

# Reduced hyperparameter grid; the 'rf__' prefix routes each entry to the
# forest step of the pipeline.
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__max_features': ['sqrt', 'log2'],
}

# 5-fold grid search scored on accuracy, candidates evaluated in parallel
grid_search = GridSearchCV(
    estimator=smote_rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit on the ORIGINAL training data; the pipeline resamples inside each fold
grid_search.fit(X_train, y_train)

# Best forest hyperparameters, with the pipeline step prefix stripped so the
# dict can be passed straight to RandomForestClassifier(**best_params)
best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}

# With the default refit=True the best pipeline has already been retrained on
# the SMOTE-resampled full training set; take its fitted forest instead of
# training an identical model again.
best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

# Make predictions on the (never resampled) test data
y_pred = best_rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

# Generate and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import shap
import matplotlib.pyplot as plt

# imblearn's Pipeline applies samplers such as SMOTE during fit only, which is
# what allows resampling to happen inside each cross-validation fold.
from imblearn.pipeline import Pipeline as ImbPipeline

# Load the dataset
data = pd.read_excel('/kaggle/input/cust-churn-task/customer_churn_large_dataset.xlsx')

# Drop the 'CustomerID' and 'Name' columns
data = data.drop(['CustomerID', 'Name'], axis=1)

# Encode categorical variables 'Gender' and 'Location' using one-hot encoding
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)

# Define the features and the target variable ('Churn')
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split the data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE oversampler and the random forest it feeds
smote = SMOTE(sampling_strategy='auto', random_state=42)
rf_classifier = RandomForestClassifier(random_state=42)

# Chain SMOTE and the forest so that oversampling is fitted on the training
# folds only. Resampling the whole training set before cross-validation lets
# synthetic points interpolated from validation-fold rows leak into the
# training folds, which inflates the cross-validated scores.
smote_rf_pipeline = ImbPipeline(steps=[('smote', smote), ('rf', rf_classifier)])

# Hyperparameter grid (3 * 4 * 3 * 3 * 2 = 216 candidates); the 'rf__' prefix
# routes each entry to the forest step of the pipeline.
param_grid = {
    'rf__n_estimators': [100, 200, 300],     # Number of trees in the forest
    'rf__max_depth': [None, 10, 20, 30],     # Maximum depth of each tree
    'rf__min_samples_split': [2, 5, 10],     # Minimum samples required to split an internal node
    'rf__min_samples_leaf': [1, 2, 4],       # Minimum samples required at a leaf node
    'rf__max_features': ['sqrt', 'log2'],    # Number of features considered at each split
}

# 5-fold grid search scored on accuracy, candidates evaluated in parallel
grid_search = GridSearchCV(
    estimator=smote_rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit on the ORIGINAL training data; the pipeline resamples inside each fold
grid_search.fit(X_train, y_train)

# Best forest hyperparameters, with the pipeline step prefix stripped so the
# dict can be passed straight to RandomForestClassifier(**best_params)
best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}

# With the default refit=True the best pipeline has already been retrained on
# the SMOTE-resampled full training set; take its fitted forest instead of
# training an identical model again.
best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

# Make predictions on the (never resampled) test data
y_pred = best_rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

# Generate and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# SHAP values for feature importance
explainer = shap.TreeExplainer(best_rf_classifier)
shap_values = explainer.shap_values(X_test)

# For a classifier, shap_values is either a list with one array per class or a
# single 3-D array whose last axis indexes the classes, depending on the SHAP
# version. Select the slice for the second class so the plot receives a 2-D
# (rows x features) array in both cases.
# NOTE(review): assumes the second class is the positive 'Churn' label - confirm.
if isinstance(shap_values, list):
    shap_values_churn = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_values_churn = shap_values[:, :, 1]
else:
    shap_values_churn = shap_values

# Summary plot of feature importance for the selected class
shap.summary_plot(shap_values_churn, X_test, plot_type="bar")
plt.show()
