## Importando bibliotecas

In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import spearmanr,pearsonr,kendalltau


## Lendo e tratando dados

In [None]:
temp = pd.read_csv("PNS_2019/PNS_2019.csv",delimiter=';')
temp.to_pickle("PNS_2019/PNS_2019.pkl")

pnsdata = pd.read_pickle("PNS_2019/PNS_2019.pkl")
pnsdata.dropna(subset=['C008','I00102'],inplace=True)

In [3]:
pnsdata['sinistro'] = pnsdata['J012'].apply(lambda x: 1 if x >= 3 else 0)
pnsdata["cobertura_plano"]=pnsdata.I00102.sub(2).mul(-1).astype(int)


bins = [0, 18, 23, 28, 33, 38, 43, 48, 53, 59, float('inf')]
labels = ['0-18', '19-23', '24-28', '29-33', '34-38', '39-43', '44-48', '49-53', '54-59', '59+']


# Criando Faixas Etárias
pnsdata['age_group'] = pd.cut(pnsdata['C008'], bins=bins, labels=labels, right=False)


model1 = smf.probit(formula="cobertura_plano ~ C(age_group)", data=pnsdata)

result1 = model1.fit()
result1.summary()


model2 = smf.probit(formula="sinistro ~ C(age_group)", data=pnsdata)
result2 = model2.fit()
result2.summary()

Optimization terminated successfully.
         Current function value: 0.510416
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.640058
         Iterations 5


0,1,2,3
Dep. Variable:,sinistro,No. Observations:,279382.0
Model:,Probit,Df Residuals:,279372.0
Method:,MLE,Df Model:,9.0
Date:,"Mon, 24 Jun 2024",Pseudo R-squ.:,0.01872
Time:,19:50:59,Log-Likelihood:,-178820.0
converged:,True,LL-Null:,-182230.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.4558,0.005,-93.656,0.000,-0.465,-0.446
C(age_group)[T.19-23],-0.2007,0.010,-19.398,0.000,-0.221,-0.180
C(age_group)[T.24-28],-0.1082,0.011,-10.299,0.000,-0.129,-0.088
C(age_group)[T.29-33],-0.0356,0.011,-3.389,0.001,-0.056,-0.015
C(age_group)[T.34-38],-0.0063,0.010,-0.615,0.539,-0.026,0.014
C(age_group)[T.39-43],0.0134,0.010,1.309,0.191,-0.007,0.034
C(age_group)[T.44-48],0.0679,0.011,6.429,0.000,0.047,0.089
C(age_group)[T.49-53],0.1648,0.011,15.508,0.000,0.144,0.186
C(age_group)[T.54-59],0.2593,0.010,25.543,0.000,0.239,0.279


In [4]:
residuals1 = result1.resid_generalized
residuals2 = result2.resid_generalized

(sum(residuals1*residuals2)**2)/sum((residuals1**2)*(residuals2**2))

6092.663716235881

## Variáveis nao utilizadas - Cigarro

In [5]:
#Definindo variável

pnsdata.dropna(subset=["P050"],inplace=True)

def map_smoking_status(row):
    if row['P050'] == 1 or row['P051'] == 1:
        return 1  # Fumante ou ex-fumante
    elif row['P050'] == 2:
        return 1  # Fumante ou ex-fumante
    else:
        return 0  # Não fumante

# Aplicar a função ao DataFrame
pnsdata['smoking_status'] = pnsdata.apply(map_smoking_status, axis=1)

pnsdata.smoking_status.value_counts().sum()

90846

In [6]:
pnsdata['sinistro'] = pnsdata['J012'].apply(lambda x: 1 if x >= 3 else 0)
pnsdata["cobertura_plano"]=pnsdata.I00102.sub(2).mul(-1).astype(int)

import statsmodels.api as sm
import statsmodels.formula.api as smf



bins = [0, 18, 23, 28, 33, 38, 43, 48, 53, 59, float('inf')]
labels = ['0-18', '19-23', '24-28', '29-33', '34-38', '39-43', '44-48', '49-53', '54-59', '59+']

# Criando faixas etárias
pnsdata['age_group'] = pd.cut(pnsdata['C008'], bins=bins, labels=labels, right=False)



model1 = smf.probit(formula="cobertura_plano ~ C(age_group) + smoking_status", data=pnsdata)

result1 = model1.fit()
result1.summary()


model2 = smf.probit(formula="sinistro ~ C(age_group) + smoking_status", data=pnsdata)
result2 = model2.fit()
result2.summary()


residuals1 = result1.resid_generalized
residuals2 = result2.resid_generalized

correlation_coefficient, p_value = kendalltau(residuals1, residuals2)
correlation_coefficient
(sum(residuals1*residuals2)**2)/sum((residuals1**2)*(residuals2**2))

Optimization terminated successfully.
         Current function value: 0.528828
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.658040
         Iterations 5


1657.657686437601

## Random Binary

In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr, kendalltau


random_binary = np.random.randint(2, size=len(pnsdata))
random_binary2 = np.random.randint(2, size=len(pnsdata))

pnsdata['random_binary'] = random_binary
pnsdata['random_binary2'] = random_binary2


bins = [0, 18, 23, 28, 33, 38, 43, 48, 53, 59, float('inf')]
labels = ['0-18', '19-23', '24-28', '29-33', '34-38', '39-43', '44-48', '49-53', '54-59', '59+']



# Criando faixas etárias
pnsdata['age_group'] = pd.cut(pnsdata['C008'], bins=bins, labels=labels, right=False)

model1 = smf.probit(formula="cobertura_plano ~ C(age_group) + random_binary", data=pnsdata)
result1 = model1.fit()
result1.summary()

model2 = smf.probit(formula="sinistro ~ C(age_group) + random_binary", data=pnsdata)
result2 = model2.fit()
result2.summary()
from scipy.stats import spearmanr,pearsonr,kendalltau


residuals1 = result1.resid_generalized
residuals2 = result2.resid_generalized

correlation_coefficient, p_value = kendalltau(residuals1, residuals2)
correlation_coefficient
(sum(residuals1*residuals2)**2)/sum((residuals1**2)*(residuals2**2))

result1.summary()

Optimization terminated successfully.
         Current function value: 0.532037
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.659442
         Iterations 5


0,1,2,3
Dep. Variable:,cobertura_plano,No. Observations:,90846.0
Model:,Probit,Df Residuals:,90835.0
Method:,MLE,Df Model:,10.0
Date:,"Mon, 24 Jun 2024",Pseudo R-squ.:,0.005344
Time:,19:51:06,Log-Likelihood:,-48333.0
converged:,True,LL-Null:,-48593.0
Covariance Type:,nonrobust,LLR p-value:,3.1319999999999997e-105

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.0477,0.032,-32.368,0.000,-1.111,-0.984
C(age_group)[T.19-23],0.0304,0.038,0.800,0.424,-0.044,0.105
C(age_group)[T.24-28],0.1327,0.037,3.624,0.000,0.061,0.204
C(age_group)[T.29-33],0.2865,0.036,8.052,0.000,0.217,0.356
C(age_group)[T.34-38],0.3458,0.035,9.854,0.000,0.277,0.415
C(age_group)[T.39-43],0.3510,0.035,9.978,0.000,0.282,0.420
C(age_group)[T.44-48],0.2982,0.036,8.382,0.000,0.229,0.368
C(age_group)[T.49-53],0.2795,0.036,7.828,0.000,0.210,0.349
C(age_group)[T.54-59],0.3236,0.035,9.233,0.000,0.255,0.392


## DIABETES E OUTRAS FAIXAS DE IDADE

In [8]:
pnsdata.dropna(subset=["Q03001"],inplace=True)

def map_diabetes_status(row):
    if row['Q03001'] == 1:
        return 1  
    else:
        return 0 

pnsdata['diabetes_status'] = pnsdata.apply(map_diabetes_status, axis=1)

pnsdata.diabetes_status.value_counts()

0    76699
1     7374
Name: diabetes_status, dtype: int64

In [9]:
pnsdata['sinistro'] = pnsdata['J012'].apply(lambda x: 1 if x >= 3 else 0)
pnsdata["cobertura_plano"]=pnsdata.I00102.sub(2).mul(-1).astype(int)


bins = [0, 18, 23, 28, 33, 38, 43, 48, 53, 59, 63, 69, 73, 79, 83, 89, float('inf')]
labels = ['0-18', '19-23', '24-28', '29-33', '34-38', '39-43', '44-48', '49-53', '54-59', '60-63', '64-69', '70-73', '74-79', '80-83', '84-89', '90+']


# Criando faixas etárias
pnsdata['age_group'] = pd.cut(pnsdata['C008'], bins=bins, labels=labels, right=False)



model1 = smf.probit(formula="cobertura_plano ~ C(age_group) + diabetes_status", data=pnsdata)
result1 = model1.fit()
result1.summary()

model2 = smf.probit(formula="sinistro ~ C(age_group) + diabetes_status", data=pnsdata)
result2 = model2.fit()
result2.summary()


residuals1 = result1.resid_generalized
residuals2 = result2.resid_generalized

correlation_coefficient, p_value = kendalltau(residuals1, residuals2)
correlation_coefficient
(sum(residuals1*residuals2)**2)/sum((residuals1**2)*(residuals2**2))
#result2.summary()

Optimization terminated successfully.
         Current function value: 0.549052
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.660600
         Iterations 4


1442.922796829005

In [10]:
result2.summary()

0,1,2,3
Dep. Variable:,sinistro,No. Observations:,84073.0
Model:,Probit,Df Residuals:,84056.0
Method:,MLE,Df Model:,16.0
Date:,"Mon, 24 Jun 2024",Pseudo R-squ.:,0.02786
Time:,19:51:11,Log-Likelihood:,-55539.0
converged:,True,LL-Null:,-57130.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.6105,0.032,-18.904,0.000,-0.674,-0.547
C(age_group)[T.19-23],0.1744,0.038,4.642,0.000,0.101,0.248
C(age_group)[T.24-28],0.1955,0.036,5.385,0.000,0.124,0.267
C(age_group)[T.29-33],0.2271,0.036,6.381,0.000,0.157,0.297
C(age_group)[T.34-38],0.2344,0.035,6.662,0.000,0.165,0.303
C(age_group)[T.39-43],0.2308,0.035,6.548,0.000,0.162,0.300
C(age_group)[T.44-48],0.3005,0.035,8.473,0.000,0.231,0.370
C(age_group)[T.49-53],0.3607,0.035,10.163,0.000,0.291,0.430
C(age_group)[T.54-59],0.4268,0.035,12.207,0.000,0.358,0.495
