## EDA & Data Cleaning

In [66]:
import pandas as pd

# Load the client demographics dataset. No separator is passed, so read_csv
# parses the .txt file with its default comma delimiter.
df_demo = pd.read_csv('df_final_demo.txt')


In [67]:
# Show the (rows, columns) dimensions of the loaded frame
df_demo.shape

(70609, 9)

In [68]:
# Preview the first rows of the frame
df_demo.head()

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [69]:
# Select every row whose client_id appears more than once.
# keep=False flags all occurrences, not just the repeats after the first.
duplicados = df_demo.loc[df_demo['client_id'].duplicated(keep=False)]

# Number of rows involved in a duplicated client_id
num_duplicados = len(duplicados)

print(f"Número de registros duplicados en 'client_id': {num_duplicados}")
print("Registros duplicados:")
print(duplicados)


Número de registros duplicados en 'client_id': 0
Registros duplicados:
Empty DataFrame
Columns: [client_id, clnt_tenure_yr, clnt_tenure_mnth, clnt_age, gendr, num_accts, bal, calls_6_mnth, logons_6_mnth]
Index: []


In [70]:
# Print the frame summary: column dtypes, non-null counts, and memory usage
df_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   client_id         70609 non-null  int64  
 1   clnt_tenure_yr    70595 non-null  float64
 2   clnt_tenure_mnth  70595 non-null  float64
 3   clnt_age          70594 non-null  float64
 4   gendr             70595 non-null  object 
 5   num_accts         70595 non-null  float64
 6   bal               70595 non-null  float64
 7   calls_6_mnth      70595 non-null  float64
 8   logons_6_mnth     70595 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.8+ MB


In [71]:
# Descriptive statistics; include='all' also summarizes the non-numeric
# gendr column (unique / top / freq) alongside the numeric columns.
df_demo.describe(include='all')

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
count,70609.0,70595.0,70595.0,70594.0,70595,70595.0,70595.0,70595.0,70595.0
unique,,,,,4,,,,
top,,,,,U,,,,
freq,,,,,24122,,,,
mean,5004992.0,12.05295,150.659367,46.44224,,2.255528,147445.2,3.382478,5.56674
std,2877278.0,6.871819,82.089854,15.591273,,0.534997,301508.7,2.23658,2.353286
min,169.0,2.0,33.0,13.5,,1.0,13789.42,0.0,1.0
25%,2519329.0,6.0,82.0,32.5,,2.0,37346.83,1.0,4.0
50%,5016978.0,11.0,136.0,47.0,,2.0,63332.9,3.0,5.0
75%,7483085.0,16.0,192.0,59.0,,2.0,137544.9,6.0,7.0


In [72]:
# Count missing values per column (isna is the same check as isnull)
missing_data = df_demo.isna().sum()
print(missing_data)


client_id            0
clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr               14
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
dtype: int64


In [73]:
# Drop rows that have a missing value in any demographic or activity column.
# client_id is left out of the subset because it has no missing values.
# The result is reassigned rather than using inplace=True: the in-place form
# silently mutates the frame that earlier cells already displayed, cannot be
# chained, and brings no performance benefit.
df_demo = df_demo.dropna(
    subset=[
        'clnt_tenure_yr',
        'clnt_tenure_mnth',
        'clnt_age',
        'gendr',
        'num_accts',
        'bal',
        'calls_6_mnth',
        'logons_6_mnth',
    ]
)

# Confirm that no missing values remain (shown as the cell output)
df_demo.isnull().sum()


client_id           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
dtype: int64

In [74]:
# Collect the distinct values present in the gendr column
unique_genders = df_demo['gendr'].unique()

# Header and values are emitted by one print call, one per line
print("\nValores únicos en la columna 'gendr':", unique_genders, sep="\n")


Valores únicos en la columna 'gendr':
['U' 'M' 'F' 'X']


In [75]:
# Number of rows whose gendr value is exactly 'X'
count_x = df_demo['gendr'].eq('X').sum()
print(f"Cantidad de valores 'X' en la columna 'gendr': {count_x}")


Cantidad de valores 'X' en la columna 'gendr': 3


In [76]:
# Keep only the rows whose gendr is not 'X'
df_demo = df_demo.loc[df_demo['gendr'].ne('X')]


## Detección de valores atípicos en `clnt_age` mediante el rango intercuartílico (IQR)

In [77]:
# First and third quartiles of client age, and the interquartile range
Q1 = df_demo['clnt_age'].quantile(0.25)
Q3 = df_demo['clnt_age'].quantile(0.75)
IQR = Q3 - Q1

# Fences placed 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose age falls strictly outside the fences
outliers = df_demo[
    df_demo['clnt_age'].lt(lower_bound) | df_demo['clnt_age'].gt(upper_bound)
]

# Report how many outliers were found
print(f'Cantidad de valores atípicos en clnt_age: {len(outliers)}')


Cantidad de valores atípicos en clnt_age: 0


## Análisis de la Demografía de los Clientes

In [78]:
import pandas as pd

# df_demo is expected to be in memory already (loaded and cleaned by the cells above).

# Summary statistics of client age
age_distribution = df_demo['clnt_age'].describe()

# Frequency tables for gender and for number of accounts per client
gender_distribution, account_distribution = (
    df_demo[column].value_counts() for column in ('gendr', 'num_accts')
)

# Show the three summaries together as the cell output
age_distribution, gender_distribution, account_distribution


(count    70591.000000
 mean        46.442542
 std         15.591381
 min         13.500000
 25%         32.500000
 50%         47.000000
 75%         59.000000
 max         96.000000
 Name: clnt_age, dtype: float64,
 gendr
 U    24122
 M    23724
 F    22745
 Name: count, dtype: int64,
 num_accts
 2.0    55494
 3.0    12528
 4.0     2241
 5.0      284
 6.0       33
 7.0        8
 1.0        2
 8.0        1
 Name: count, dtype: int64)

In [79]:
# Mean client age
average_age = df_demo['clnt_age'].mean()

# Mean tenure in years
average_tenure = df_demo['clnt_tenure_yr'].mean()

# Number of clients for each (tenure in years, age) combination
age_tenure_analysis = (
    df_demo
    .groupby(['clnt_tenure_yr', 'clnt_age'])
    .size()
    .rename('counts')
    .reset_index()
)

average_age, average_tenure, age_tenure_analysis


(46.442542250428524,
 12.053108753240497,
       clnt_tenure_yr  clnt_age  counts
 0                2.0      14.5       1
 1                2.0      17.0       1
 2                2.0      18.5       2
 3                2.0      19.5       1
 4                2.0      21.5       1
 ...              ...       ...     ...
 4345            55.0      67.0       1
 4346            55.0      68.0       1
 4347            55.0      72.0       1
 4348            55.0      78.0       1
 4349            62.0      51.0       1
 
 [4350 rows x 3 columns])

In [80]:
# Correlation matrix between client age and balance
age_balance_correlation = df_demo[['clnt_age', 'bal']].corr()

# Bucket balances into right-closed intervals:
# (0, 100k], (100k, 500k], (500k, 1M], (1M, 10M].
# NOTE(review): a balance outside (0, 10M] gets no bucket (NaN) and drops out
# of the groupby below — confirm the maximum balance stays within range.
balance_bins = [0, 100000, 500000, 1000000, 10000000]
balance_group = pd.cut(df_demo['bal'], bins=balance_bins)

# Mean calls and logons per balance bucket.
# The grouping key is categorical, so observed is passed explicitly:
# observed=False keeps the current behavior (every bucket is listed) and
# silences the pandas FutureWarning about the default changing to True.
call_login_analysis = (
    df_demo
    .groupby(balance_group, observed=False)
    .agg({'calls_6_mnth': 'mean', 'logons_6_mnth': 'mean'})
)

age_balance_correlation, call_login_analysis


  call_login_analysis = df_demo.groupby(pd.cut(df_demo['bal'], bins=[0, 100000, 500000, 1000000, 10000000])).agg({'calls_6_mnth': 'mean', 'logons_6_mnth': 'mean'})


(          clnt_age       bal
 clnt_age  1.000000  0.209545
 bal       0.209545  1.000000,
                      calls_6_mnth  logons_6_mnth
 bal                                             
 (0, 100000]              3.053574       5.207998
 (100000, 500000]         3.902361       6.150634
 (500000, 1000000]        4.567957       6.771071
 (1000000, 10000000]      4.882653       7.113946)

In [81]:
# Correlation between client age and number of accounts
# (method='pearson' is the default, spelled out here)
correlation_age_accounts = df_demo['clnt_age'].corr(
    df_demo['num_accts'], method='pearson'
)
print("Correlación entre Edad y Número de Cuentas:", correlation_age_accounts)


Correlación entre Edad y Número de Cuentas: -0.017708400750834206


In [82]:
# Age bands are right-closed: (0, 30], (30, 45], (45, 60], (60, 100].
# An age outside (0, 100] would get no label (NaN).
age_bins = [0, 30, 45, 60, 100]
age_labels = ['Joven', 'Adulto', 'Mayor', 'Anciano']

# Adds a categorical age_group column to df_demo
df_demo['age_group'] = pd.cut(df_demo['clnt_age'], bins=age_bins, labels=age_labels)

# Mean number of accounts per age band.
# age_group is categorical, so observed is passed explicitly: observed=False
# keeps the current behavior (every band is listed) and silences the pandas
# FutureWarning about the default changing to True.
avg_accounts_by_age_group = (
    df_demo.groupby('age_group', observed=False)['num_accts'].mean()
)
print(avg_accounts_by_age_group)


age_group
Joven      2.264922
Adulto     2.263316
Mayor      2.254641
Anciano    2.238759
Name: num_accts, dtype: float64


  avg_accounts_by_age_group = df_demo.groupby('age_group')['num_accts'].mean()


In [83]:
# Balance summary statistics for every (age, gender) combination
demographics_balance = (
    df_demo
    .groupby(['clnt_age', 'gendr'])['bal']
    .describe()
)
demographics_balance


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
clnt_age,gendr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
13.5,U,1.0,24435.08,,24435.08,24435.0800,24435.080,24435.0800,24435.08
14.0,F,1.0,16989.14,,16989.14,16989.1400,16989.140,16989.1400,16989.14
14.0,U,1.0,19945.35,,19945.35,19945.3500,19945.350,19945.3500,19945.35
14.5,F,3.0,17103.58,1781.543845,15048.83,16546.8800,18044.930,18130.9550,18216.98
14.5,M,4.0,23983.90,7106.456638,15487.91,19442.9675,25116.555,29657.4875,30214.58
...,...,...,...,...,...,...,...,...,...
94.0,U,3.0,394455.75,485296.364159,58162.77,116286.6300,174410.490,562602.2400,950793.99
94.5,U,1.0,108825.59,,108825.59,108825.5900,108825.590,108825.5900,108825.59
95.5,F,1.0,43471.96,,43471.96,43471.9600,43471.960,43471.9600,43471.96
96.0,M,1.0,36297.66,,36297.66,36297.6600,36297.660,36297.6600,36297.66
