In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression


In [5]:
# --- Pin the working directory to the repository root ---
# Walk upward from the current working directory until a folder containing a
# '.git' entry is found, then chdir there so that the relative data paths used
# below resolve the same way no matter where the notebook was opened from.

current_dir = os.getcwd()
repo_root = current_dir
while True:
    if os.path.exists(os.path.join(repo_root, '.git')):
        break  # this directory holds the '.git' entry
    parent_dir = os.path.dirname(repo_root)
    if parent_dir == repo_root:
        # dirname() stopped changing the path: filesystem root reached without a match.
        raise FileNotFoundError(
            "Could not find the .git directory. "
            "Please ensure you are running this code from within a Git repository."
        )
    repo_root = parent_dir

# Only switch (and report) when the notebook was not already at the root.
if current_dir != repo_root:
    os.chdir(repo_root)
    print(f"Working directory set to: {os.getcwd()}")


# --- Data Loading ---
# Path to the data file, relative to the repository root.
data_file_name = 'df_eng_customer_purchasing_features.csv'
data_file_path = os.path.join('src', 'data', data_file_name)

# Read the CSV into `df`. If loading fails, a hint is printed and the original
# exception is re-raised so the cell stops here. Without the re-raise, the
# copy below would either hit a NameError (fresh kernel) or silently reuse a
# stale `df` left in the kernel by an earlier run.
try:
    df = pd.read_csv(data_file_path)
except FileNotFoundError:
    print(f"Error: The file '{data_file_name}' was not found at '{data_file_path}'.")
    print("Please ensure it exists in the 'src/data/' folder relative to the repository root.")
    raise
except Exception as e:
    print(f"An error occurred during data loading: {e}")
    raise
else:
    # Reported only when read_csv actually succeeded.
    print(f"Successfully loaded '{data_file_name}'.")

# Work on a copy so the DataFrame as loaded from disk stays untouched.
df_LR = df.copy()
print("Original DataFrame shape:", df_LR.shape)

Successfully loaded 'df_eng_customer_purchasing_features.csv'.
Original DataFrame shape: (238, 11)


In [6]:
# Show the working copy; as the cell's last expression it is rendered as the cell output.
df_LR

Unnamed: 0,user_id,customer_value_score,churn_risk_score,growth_potential_score,spend_per_purchase,spend_to_income_ratio,is_champion,age,annual_income,region_South,region_West
0,1,0.136490,0.829060,74,16.666667,0.004444,False,25,45000,False,False
1,2,0.469039,0.470085,100,19.444444,0.006364,False,34,55000,True,False
2,3,0.716117,0.282051,57,22.727273,0.007692,False,45,65000,False,True
3,4,0.000000,1.000000,44,15.000000,0.005000,False,22,30000,False,False
4,5,0.182326,0.778205,100,16.923077,0.004681,False,29,47000,False,False
...,...,...,...,...,...,...,...,...,...,...,...
233,234,0.606550,0.399145,44,22.500000,0.007500,False,40,60000,False,True
234,235,0.574603,0.422222,59,21.500000,0.007288,False,38,59000,False,False
235,236,0.972061,0.035470,44,23.333333,0.008514,True,54,74000,True,False
236,237,0.433089,0.562393,40,20.000000,0.006923,False,32,52000,False,True
