In [1]:
# Importación de librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides warnings that may matter — consider narrowing it.
warnings.filterwarnings("ignore")

# Plot configuration: apply matplotlib's default style first, then set the
# seaborn palette and the figure defaults on top of it.
plt.style.use("default")
sns.set_palette("husl")
plt.rcParams.update({"figure.figsize": (12, 8), "font.size": 12})

# Report the versions of the core libraries in use.
print(
    "Librerías importadas correctamente",
    f" Pandas versión: {pd.__version__}",
    f" NumPy versión: {np.__version__}",
    sep="\n",
)

Librerías importadas correctamente
 Pandas versión: 2.2.3
 NumPy versión: 2.2.0


# 📌 Extracción

In [None]:
# Load the dataset from JSON.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters.
with open("TelecomX_Data.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# One row per top-level record in the parsed JSON.
df_raw = pd.DataFrame(data)

print(" Dataset cargado exitosamente")
print(f" Dimensiones: {df_raw.shape[0]:,} filas × {df_raw.shape[1]} columnas")
# deep=True also counts the memory held by the Python objects in object columns.
print(f" Memoria utilizada: {df_raw.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n primeras 5 filas del dataset:")
df_raw.head()

 Dataset cargado exitosamente
 Dimensiones: 7,267 filas × 6 columnas
 Memoria utilizada: 6.70 MB

 primeras 5 filas del dataset:


Unnamed: 0,customerID,Churn,customer,phone,internet,account
0,0002-ORFBO,No,"{'gender': 'Female', 'SeniorCitizen': 0, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'One year', 'PaperlessBilling': '..."
1,0003-MKNFE,No,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'Yes'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
2,0004-TLHLJ,Yes,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
3,0011-IGKFF,Yes,"{'gender': 'Male', 'SeniorCitizen': 1, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
4,0013-EXCHZ,Yes,"{'gender': 'Female', 'SeniorCitizen': 1, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."


In [3]:
# General information about the dataset.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("Información general del dataset:")
df_raw.info()
print("\n" + "=" * 60)

# Inspect the structure of each nested column using its first record.
print("\n estructura de datos anidados:")
nested_column_labels = {
    "customer": "información demográfica",
    "phone": "servicios telefónicos",
    "internet": "servicios de internet",
    "account": "información de cuenta",
}
for nested_column, label in nested_column_labels.items():
    print(f"\n columna {nested_column} ({label}):")
    print(json.dumps(df_raw[nested_column].iloc[0], indent=2))

Información general del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   customerID  7267 non-null   object
 1   Churn       7267 non-null   object
 2   customer    7267 non-null   object
 3   phone       7267 non-null   object
 4   internet    7267 non-null   object
 5   account     7267 non-null   object
dtypes: object(6)
memory usage: 340.8+ KB
None


 estructura de datos anidados:

 columna customer (información demográfica):
{
  "gender": "Female",
  "SeniorCitizen": 0,
  "Partner": "Yes",
  "Dependents": "Yes",
  "tenure": 9
}

 columna phone (servicios telefónicos):
{
  "PhoneService": "Yes",
  "MultipleLines": "No"
}

 columna internet (servicios de internet):
{
  "InternetService": "DSL",
  "OnlineSecurity": "No",
  "OnlineBackup": "Yes",
  "DeviceProtection": "No",
  "TechSupport": "Yes",
  "StreamingTV": "Yes",
  "StreamingMov

# 🔧 Transformación

In [4]:
# Helper to expand a column of nested records into flat columns
def expand_nested_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Expand a column holding nested records into one column per nested field.

    Args:
        df: DataFrame that contains the nested column.
        column_name: Name of the column whose values are nested records
            (dictionaries).

    Returns:
        A new DataFrame with one row per row of ``df`` and the same index as
        ``df``. Each column is named ``"<column_name>_<flattened key>"``.
        ``df`` itself is not modified.
    """
    expanded = pd.json_normalize(df[column_name])
    expanded.columns = [f"{column_name}_{col}" for col in expanded.columns]
    # Re-attach df's index explicitly so the result lines up with df row for
    # row in a later pd.concat(axis=1), whether or not json_normalize kept
    # the input index.
    expanded.index = df.index
    return expanded


# Expand each nested column into its own flat DataFrame
customer_expanded, phone_expanded, internet_expanded, account_expanded = (
    expand_nested_column(df_raw, nested_name)
    for nested_name in ("customer", "phone", "internet", "account")
)

# Build the final dataset: the two top-level columns first, followed by the
# expanded blocks placed side by side
expanded_blocks = [customer_expanded, phone_expanded, internet_expanded, account_expanded]
df = pd.concat([df_raw[["customerID", "Churn"]], *expanded_blocks], axis=1)

# Summarise the transformed dataset and list its columns, numbered from 1
print(" Datos transformados exitosamente")
print(f" Nuevas dimensiones: {len(df):,} filas × {len(df.columns)} columnas")
print("\n Columnas del dataset transformado:")
for position, column_label in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_label}")

 Datos transformados exitosamente
 Nuevas dimensiones: 7,267 filas × 21 columnas

 Columnas del dataset transformado:
 1. customerID
 2. Churn
 3. customer_gender
 4. customer_SeniorCitizen
 5. customer_Partner
 6. customer_Dependents
 7. customer_tenure
 8. phone_PhoneService
 9. phone_MultipleLines
10. internet_InternetService
11. internet_OnlineSecurity
12. internet_OnlineBackup
13. internet_DeviceProtection
14. internet_TechSupport
15. internet_StreamingTV
16. internet_StreamingMovies
17. account_Contract
18. account_PaperlessBilling
19. account_PaymentMethod
20. account_Charges.Monthly
21. account_Charges.Total


In [6]:
# Initial cleaning: inspect the target variable, then keep only the rows
# whose Churn value is "Yes" or "No"
print("Distribución de la variable objetivo (churn):")
churn_counts = df["Churn"].value_counts()
print(churn_counts)

print("\n Porcentaje de distribución:")
churn_pct = df["Churn"].value_counts(normalize=True) * 100
print(churn_pct.round(2))

# Count both kinds of "no label": real missing values and empty strings
missing_labels = df["Churn"].isna().sum()
empty_labels = (df["Churn"] == "").sum()
print(f"\n Valores faltantes en Churn antes de limpieza: {missing_labels}")
print(f' Valores vacíos ("") en Churn: {empty_labels}')

# .copy() makes df_clean independent of df
has_valid_label = df["Churn"].isin(["Yes", "No"])
df_clean = df.loc[has_valid_label].copy()
print(f"\n Dataset limpio: {len(df_clean):,} filas × {len(df_clean.columns)} columnas")
print(f" Filas eliminadas: {len(df) - len(df_clean):,}")

Distribución de la variable objetivo (churn):
Churn
No     5174
Yes    1869
        224
Name: count, dtype: int64

 Porcentaje de distribución:
Churn
No     71.20
Yes    25.72
        3.08
Name: proportion, dtype: float64

 Valores faltantes en Churn antes de limpieza: 0
 Valores vacíos ("") en Churn: 224

 Dataset limpio: 7,043 filas × 21 columnas
 Filas eliminadas: 224


In [7]:
# Data type conversion
print('Convirtiendo tipos de datos...')

# Encode the target as 1/0. The guard keeps the cell safe to re-run: mapping
# a column that is already numeric would turn every value into NaN.
churn_encoding = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df_clean['Churn']):
    df_clean['Churn'] = df_clean['Churn'].map(churn_encoding)

df_clean['customer_SeniorCitizen'] = df_clean['customer_SeniorCitizen'].astype('category')

# Every label below is encoded as a number; the "No ... service" labels are
# encoded as 0, the same as a plain "No".
binary_encoding = {
    'Yes': 1,
    'No': 0,
    'No internet service': 0,
    'No phone service': 0
}

# A column qualifies when it has object dtype and contains nothing but labels
# from the encoding table.
yes_no_columns = [
    col for col in df_clean.columns
    if df_clean[col].dtype == 'object'
    and set(df_clean[col].unique()).issubset(binary_encoding.keys())
]

print(f'\n Columnas Yes/No identificadas: {len(yes_no_columns)}')
for col in yes_no_columns:
    print(f' - {col}: {df_clean[col].unique()}')

for col in yes_no_columns:
    df_clean[col] = df_clean[col].map(binary_encoding)

# errors='coerce' turns values that cannot be parsed as numbers into NaN
# instead of raising.
if 'account_Charges.Total' in df_clean.columns:
    df_clean['account_Charges.Total'] = pd.to_numeric(df_clean['account_Charges.Total'], errors='coerce')

print('\n conversión de tipos completada')
print('\n información del dataset limpio:')
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
df_clean.info()


Convirtiendo tipos de datos...

 Columnas Yes/No identificadas: 11
 - customer_Partner: ['Yes' 'No']
 - customer_Dependents: ['Yes' 'No']
 - phone_PhoneService: ['Yes' 'No']
 - phone_MultipleLines: ['No' 'Yes' 'No phone service']
 - internet_OnlineSecurity: ['No' 'Yes' 'No internet service']
 - internet_OnlineBackup: ['Yes' 'No' 'No internet service']
 - internet_DeviceProtection: ['No' 'Yes' 'No internet service']
 - internet_TechSupport: ['Yes' 'No' 'No internet service']
 - internet_StreamingTV: ['Yes' 'No' 'No internet service']
 - internet_StreamingMovies: ['No' 'Yes' 'No internet service']
 - account_PaperlessBilling: ['Yes' 'No']

 conversión de tipos completada

 información del dataset limpio:
<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 0 to 7266
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   customerID                 7043 non-null   object  
 1   Churn     

# 📊 Carga y análisis

# 📄 Informe final