In [1]:
import pandas as pd # For DataFrame manipulation (assuming df_clean is loaded)
import numpy as np # For numerical operations, especially for handling arrays

In [2]:
# Location of the sentiment-enriched churn dataset produced by an earlier step.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from a configurable data directory so the notebook runs on other machines.
parquet_file_path = r"C:\Users\comat\GitProjects\customer-churn-ai\data\telco_churn_with_hf_sentiment.parquet"

try:
    df_feng = pd.read_parquet(parquet_file_path)
except FileNotFoundError:
    print(f"Error: File not found at {parquet_file_path}. Please check the path and try again.")
    # Re-raise: every later cell needs df_feng, so continuing would only fail
    # further down with a less helpful NameError.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("Data Loaded Successfully! Sensational!\n")
    print(f"Shape of the loaded DataFrame: {df_feng.shape}\n")
    df_feng.info()  # Confirm column names, non-null counts and dtypes

Data Loaded Successfully! Sensational!

Shape of the loaded DataFrame: (7043, 26)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contra

In [3]:
df_feng.head()  # Preview the first five rows (head() defaults to n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,CustomerReview,HF_neg,HF_nue,HF_pos,HF_Label
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,Yes,Electronic check,29.85,29.85,No,Really happy with my plan... no unexpected fee...,0.005294,0.021416,0.973291,Positive
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,Mailed check,56.95,1889.5,No,Really happy with my plan... phone support was...,0.004702,0.012738,0.98256,Positive
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,Yes,Mailed check,53.85,108.15,Yes,"Unfortunately, my service was... frustrated by...",0.906199,0.086496,0.007306,Negative
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,Bank transfer (automatic),42.3,1840.75,No,Consistently good signal... data plan is a gre...,0.004989,0.034239,0.960772,Positive
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,Yes,Electronic check,70.7,151.65,Yes,Not satisfied with the billing... billing erro...,0.930403,0.06082,0.008777,Negative


#### FE Part 1: Convert Text Fields
***Step 1:*** Identify Categorical Features and Convert Text Fields  
- Object Columns:  
    - customerID, gender, Partner, Dependents, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, Churn, CustomerReview, HF_Label.
          
***Step 2:*** Drop Non-Informative Columns
- Drop `customerID`: it is an identifier rather than a predictive feature, so remove it before modeling.

In [4]:
# Build df_feng_processed without the customerID column.
# Both branches produce a new DataFrame, so df_feng itself is never modified.
if 'customerID' in df_feng.columns:
    # `columns=` already selects the axis; an additional `axis=1` is redundant.
    df_feng_processed = df_feng.drop(columns=["customerID"])
    print("Dropped 'customerID' column.")
else:
    df_feng_processed = df_feng.copy()
    print("'customerID' column not found.")

Dropped 'customerID' column.


***Step 3:*** Encode the Target Variable Churn
- Encode `Churn` directly with `.map({'Yes': 1, 'No': 0})` instead of one-hot encoding it, because it is the target variable and should remain a single 0/1 column.

In [5]:
# Encode the target column 'Churn' as integers: 'Yes' -> 1, 'No' -> 0.
churn_encoding = {'Yes': 1, 'No': 0}

if 'Churn' in df_feng_processed.columns and df_feng_processed['Churn'].dtype == 'object':
    # Series.map turns every value that is missing from the mapping into NaN.
    # Fail loudly instead of silently corrupting the target with NaNs.
    unexpected_labels = set(df_feng_processed['Churn'].unique()) - set(churn_encoding)
    if unexpected_labels:
        raise ValueError(
            f"Unexpected 'Churn' values (expected only 'Yes'/'No'): {sorted(unexpected_labels, key=str)}"
        )

    df_feng_processed['Churn'] = df_feng_processed['Churn'].map(churn_encoding)
    print("Encoded 'Churn' column to 1 (Yes) and 0 (No).\n")
    # Verify the mapping
    print("'Churn' column value counts after mapping:")
    print(df_feng_processed['Churn'].value_counts())
elif 'Churn' in df_feng_processed.columns:
    print("\n'Churn' column is already numeric or not 'object' type. Skipping mapping.")
else:
    # Without the target column nothing is encoded; say so rather than staying silent.
    print("\n'Churn' column not found. Skipping mapping.")

Encoded 'Churn' column to 1 (Yes) and 0 (No).

'Churn' column value counts after mapping:
Churn
0    5174
1    1869
Name: count, dtype: int64


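***Step 4:*** One-Hot Encode the Remaining Categorical Columns
- Collapse the "No internet service" / "No phone service" values to "No", then one-hot encode every remaining object column except `CustomerReview` (its derived sentiment scores are used instead).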

In [6]:
# Gather every column that is still stored as text (object dtype).
# 'Churn' is expected to be numeric by this point, so it should not appear here.
# 'CustomerReview' is left out: the raw text is not one-hot encoded because its
# derived sentiment scores are used instead.
categorical_cols_to_encode = [
    name
    for name in df_feng_processed.select_dtypes(include=['object']).columns
    if name != 'CustomerReview'
]

print(f"\nCategorical columns to be one-hot encoded: {categorical_cols_to_encode}")

if not categorical_cols_to_encode:
    print("\nNo remaining categorical object columns found to one-hot encode (excluding CustomerReview).")
else:
    # Service add-on columns use "No internet service" / "No phone service" as
    # extra spellings of "No"; collapse them so each column ends up Yes/No.
    cols_with_service_specific_no = [
        'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
    ]
    no_service_aliases = {
        'No internet service': 'No',
        'No phone service': 'No',
    }

    print("\nCleaning service-specific columns with variations of No Values (e.g., 'No internet service' to 'No'.")
    for service_col in cols_with_service_specific_no:
        # Only touch columns that are present and still text-typed.
        if service_col not in categorical_cols_to_encode:
            continue
        df_feng_processed[service_col] = df_feng_processed[service_col].replace(no_service_aliases)
    print("Finished cleaning service-specific 'No...' values.")

    # drop_first=True removes one redundant indicator per feature, which reduces
    # multicollinearity among the generated columns.
    df_feng_processed = pd.get_dummies(
        df_feng_processed,
        columns=categorical_cols_to_encode,
        drop_first=True,
    )
    print("\nOne-hot encoded the identified categorical columns.")

# Show the resulting schema
print("\nDataFrame info after all text field conversions:")
df_feng_processed.info()


Categorical columns to be one-hot encoded: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'HF_Label']

Cleaning service-specific columns with variations of No Values (e.g., 'No internet service' to 'No'.
Finished cleaning service-specific 'No...' values.

One-hot encoded the identified categorical columns.

DataFrame info after all text field conversions:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 30 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 

In [7]:
# Optional preview of the first five rows; the frame is wide after one-hot encoding.
print("DataFrame head after text field conversions:\n\n", df_feng_processed.head())

DataFrame head after text field conversions:

    SeniorCitizen  tenure  MonthlyCharges  TotalCharges  Churn  \
0              0       1           29.85         29.85      0   
1              0      34           56.95       1889.50      0   
2              0       2           53.85        108.15      1   
3              0      45           42.30       1840.75      0   
4              0       2           70.70        151.65      1   

                                      CustomerReview    HF_neg    HF_nue  \
0  Really happy with my plan... no unexpected fee...  0.005294  0.021416   
1  Really happy with my plan... phone support was...  0.004702  0.012738   
2  Unfortunately, my service was... frustrated by...  0.906199  0.086496   
3  Consistently good signal... data plan is a gre...  0.004989  0.034239   
4  Not satisfied with the billing... billing erro...  0.930403  0.060820   

     HF_pos  gender_Male  ...  StreamingTV_Yes  StreamingMovies_Yes  \
0  0.973291        False  ...     

---
**Create Tenure Buckets:**


Objective:
* To create a new categorical feature that represents different stages of the customer lifecycle (e.g., new customers, medium-term customers, long-term loyal customers).
* Take the numerical tenure column (which represents how many months a customer has been with the company) and group its values into a set of predefined ranges or "buckets."
* This converts a continuous numerical feature into a discrete/categorical one.


In [8]:
# Summary statistics of 'tenure', used to choose sensible bucket edges.
tenure_summary = df_feng_processed['tenure'].describe()
print("Descriptive statistics for 'tenure':", tenure_summary, sep="\n")

Descriptive statistics for 'tenure':
count    7043.000000
mean       32.371149
std        24.559481
min         0.000000
25%         9.000000
50%        29.000000
75%        55.000000
max        72.000000
Name: tenure, dtype: float64


In [9]:
# Define the bucket edges and labels for 'tenure' (in months).
# pd.cut with right=True builds right-inclusive intervals:
#   (-1, 12], (12, 24], (24, 36], (36, 48], (48, 60], (60, 72]
# The first edge is -1 so that tenure == 0 (the observed minimum) falls into the
# first bucket; the last edge is 72 (the observed maximum), which right=True includes.
tenure_bins = [-1, 12, 24, 36, 48, 60, 72]

# One label per interval, in the same order as the intervals above.
tenure_labels = [
    '0-12 Months',
    '13-24 Months',
    '25-36 Months',
    '37-48 Months',
    '49-60 Months',
    '61-72 Months'
]

# Create the new 'Tenure_Group' column
df_feng_processed['Tenure_Group'] = pd.cut(
    df_feng_processed['tenure'],
    bins=tenure_bins,
    labels=tenure_labels,
    right=True,
)

# pd.cut assigns NaN to any value outside (-1, 72] (or missing). After one-hot
# encoding with drop_first=True such rows would be all-False and therefore
# indistinguishable from the dropped baseline bucket, so fail loudly instead.
unbucketed_count = int(df_feng_processed['Tenure_Group'].isna().sum())
if unbucketed_count:
    raise ValueError(
        f"{unbucketed_count} 'tenure' value(s) are missing or fall outside the bins {tenure_bins}."
    )

print("'Tenure_Group' column created.")

'Tenure_Group' column created.


In [10]:
# Replace 'Tenure_Group' with indicator columns prefixed 'TenureGrp'.
# drop_first=True omits the indicator for the first category.
if 'Tenure_Group' not in df_feng_processed.columns:
    print("\nError: 'Tenure_Group' column not found for one-hot encoding.")
else:
    df_feng_processed = pd.get_dummies(
        df_feng_processed,
        columns=['Tenure_Group'],
        prefix='TenureGrp',
        drop_first=True,
    )
    print("One-hot encoded the 'Tenure_Group' column.")

# Show the schema after the tenure buckets were added
print("\nDataFrame info after creating and encoding tenure buckets:")
df_feng_processed.info()

One-hot encoded the 'Tenure_Group' column.

DataFrame info after creating and encoding tenure buckets:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 35 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   Churn                                  7043 non-null   int64  
 5   CustomerReview                         7043 non-null   object 
 6   HF_neg                                 7043 non-null   float32
 7   HF_nue                                 7043 non-null   float32
 8   HF_pos                                 7043 non-null   float32
 9   gender_Male                          

In [11]:
# Optional preview of the first rows, including the new tenure-bucket indicators.
print("\nDataFrame head after tenure buckets:", df_feng_processed.head(), sep="\n")


DataFrame head after tenure buckets:
   SeniorCitizen  tenure  MonthlyCharges  TotalCharges  Churn  \
0              0       1           29.85         29.85      0   
1              0      34           56.95       1889.50      0   
2              0       2           53.85        108.15      1   
3              0      45           42.30       1840.75      0   
4              0       2           70.70        151.65      1   

                                      CustomerReview    HF_neg    HF_nue  \
0  Really happy with my plan... no unexpected fee...  0.005294  0.021416   
1  Really happy with my plan... phone support was...  0.004702  0.012738   
2  Unfortunately, my service was... frustrated by...  0.906199  0.086496   
3  Consistently good signal... data plan is a gre...  0.004989  0.034239   
4  Not satisfied with the billing... billing erro...  0.930403  0.060820   

     HF_pos  gender_Male  ...  PaymentMethod_Credit card (automatic)  \
0  0.973291        False  ...             

---
Pre-Processing / Cleaning for Training and Testing Splits

In [12]:
# Columns that are redundant for modeling: raw 'tenure' (now represented by the
# tenure-bucket indicators) and the raw 'CustomerReview' text.
# Only the ones actually present in the frame are scheduled for removal.
columns_to_drop_after_fe = [
    name
    for name in ('tenure', 'CustomerReview')
    if name in df_feng_processed.columns
]

if not columns_to_drop_after_fe:
    # Nothing to remove; still work on a copy so df_feng_processed stays independent.
    df_for_modeling = df_feng_processed.copy()
    print("No specified original columns to drop found; using a copy for df_for_modeling.")
else:
    df_for_modeling = df_feng_processed.drop(columns=columns_to_drop_after_fe)
    print(f"Dropped original columns: {columns_to_drop_after_fe}")
    print("df_for_modeling was created.")


Dropped original columns: ['tenure', 'CustomerReview']
df_for_modeling was created.


In [13]:
# Show the schema of the model-ready frame, if the previous cell created it.
print("DataFrame info after dropping redundant columns (df_for_modeling):\n")
if 'df_for_modeling' not in locals():
    print("df_for_modeling was not created. Please check previous steps.")
else:
    df_for_modeling.info()

DataFrame info after dropping redundant columns (df_for_modeling):

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   MonthlyCharges                         7043 non-null   float64
 2   TotalCharges                           7043 non-null   float64
 3   Churn                                  7043 non-null   int64  
 4   HF_neg                                 7043 non-null   float32
 5   HF_nue                                 7043 non-null   float32
 6   HF_pos                                 7043 non-null   float32
 7   gender_Male                            7043 non-null   bool   
 8   Partner_Yes                            7043 non-null   bool   
 9   Dependents_Yes                         7043 non-null   bool   
 10  Phon

---
Save DataFrame **df_for_modeling** as **telco_churn_feature_engineered.parquet**

In [14]:
# Persist the model-ready DataFrame as Parquet (without the index).
# NOTE(review): absolute, machine-specific output path — confirm it suits other environments.
parquet_fe_file_path = r"C:\Users\comat\GitProjects\customer-churn-ai\data\telco_churn_feature_engineered.parquet"
if 'df_for_modeling' not in locals():
    print("\nSkipping save: df_for_modeling is not available.")
else:
    try:
        df_for_modeling.to_parquet(parquet_fe_file_path, index=False)
    except Exception as e:
        # A failed write is reported but does not stop the notebook.
        print(f"\nError saving feature-engineered DataFrame to Parquet: {e}")
    else:
        print(f"\nFeature-engineered DataFrame successfully saved to: {parquet_fe_file_path}")


Feature-engineered DataFrame successfully saved to: C:\Users\comat\GitProjects\customer-churn-ai\data\telco_churn_feature_engineered.parquet
