In [25]:
import pandas as pd # For DataFrame manipulation (assuming df_clean is loaded)
import numpy as np # For numerical operations, especially for handling arrays

In [26]:
# Location of the Parquet file read into df_feng below.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook can run on other machines.
parquet_file_path = r"C:\Users\comat\GitProjects\customer-churn-ai\data\telco_churn_with_hf_sentiment.parquet"

try:
    df_feng = pd.read_parquet(parquet_file_path)
except FileNotFoundError:
    print(f"Error: File not found at {parquet_file_path}. Please check the path and try again.")
    # Re-raise so execution stops here. Swallowing the error would let later
    # cells run against a missing (or stale, from a previous run) df_feng.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    # Same reasoning as above: report the problem, then fail loudly.
    raise
else:
    # Only reached when the read succeeded.
    print("Data Loaded Successfully! Sensational!\n")
    print(f"Shape of the loaded DataFrame: {df_feng.shape}\n")

# Display info to confirm contents and dtypes.
# No existence check is needed: any load failure has already been re-raised.
df_feng.info()

Data Loaded Successfully! Sensational!

Shape of the loaded DataFrame: (7043, 26)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contra

#### FE Part 1: Convert Text Fields
***Step 1:*** Identify Categorical Features and Convert Text Fields  
- Object Columns:  
    - customerID, gender, Partner, Dependents, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, Churn, CustomerReview, HF_Label.
          
***Step 2:*** Drop Non-Informative Columns
- Drop `customerID`: an identifier column is usually not a predictive feature, so drop it before modeling.

In [27]:
# Build df_feng_processed without the customerID column.
# Both branches produce a new DataFrame, so df_feng itself is left unchanged.
if 'customerID' not in df_feng.columns:
    # Nothing to drop: take a copy so later edits do not touch df_feng.
    df_feng_processed = df_feng.copy()
    print("'customerID' column not found.")
else:
    # drop() returns a new frame; columns= already selects the column axis.
    df_feng_processed = df_feng.drop(columns=["customerID"])
    print("Dropped 'customerID' column.")

Dropped 'customerID' column.


***Step 3:*** Encode the Target Variable Churn
- It is best practice to encode `Churn` directly with `.map({'Yes': 1, 'No': 0})` because it is the target variable.

In [28]:
# Encode the target column in df_feng_processed: 'Yes' -> 1, 'No' -> 0.
# This cell overwrites the column in place, so it must be safe to run more than
# once: Series.map turns every value missing from the mapping into NaN, which
# would wipe out an already-encoded 0/1 column on a second run.
if 'Churn' in df_feng_processed.columns:
    churn_mapping = {'Yes': 1, 'No': 0}
    churn_values = df_feng_processed['Churn']
    churn_present = churn_values.dropna()

    already_encoded = (
        not churn_present.empty
        and churn_present.isin(list(churn_mapping.values())).all()
    )

    if already_encoded:
        # Cell was run before; leave the 0/1 values untouched.
        print("'Churn' column is already encoded as 1/0; skipping re-mapping.\n")
    else:
        # Refuse to encode labels outside the mapping rather than silently
        # turning them into NaN in the target variable.
        unexpected_labels = sorted(set(churn_present.unique()) - set(churn_mapping), key=str)
        if unexpected_labels:
            raise ValueError(f"Unexpected 'Churn' values, cannot encode: {unexpected_labels}")
        df_feng_processed['Churn'] = churn_values.map(churn_mapping) # Convert Churn to binary
        print("Encoded 'Churn' column to binary values, 1 (Yes) and 0 (No).\n")
    # Verify the class distribution after encoding
    print(df_feng_processed['Churn'].value_counts())
else:
    print("\nError: 'Churn' column not found in the DataFrame.")

Encoded 'Churn' column to binary values, 1 (Yes) and 0 (No).

Churn
0    5174
1    1869
Name: count, dtype: int64


In [29]:
# Preview the first five rows (head() defaults to n=5).
# Note: this shows df_feng as loaded, not df_feng_processed.
df_feng.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,CustomerReview,HF_neg,HF_nue,HF_pos,HF_Label
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,Yes,Electronic check,29.85,29.85,No,Really happy with my plan... no unexpected fee...,0.005294,0.021416,0.973291,Positive
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,Mailed check,56.95,1889.5,No,Really happy with my plan... phone support was...,0.004702,0.012738,0.98256,Positive
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,Yes,Mailed check,53.85,108.15,Yes,"Unfortunately, my service was... frustrated by...",0.906199,0.086496,0.007306,Negative
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,Bank transfer (automatic),42.3,1840.75,No,Consistently good signal... data plan is a gre...,0.004989,0.034239,0.960772,Positive
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,Yes,Electronic check,70.7,151.65,Yes,Not satisfied with the billing... billing erro...,0.930403,0.06082,0.008777,Negative
