# Powerset model

[linear regression dataset](https://www.telusinternational.com/insights/ai-data/article/10-open-datasets-for-linear-regression)

[logistic regression dataset](https://sushtend.com/machine-learning/datasets-for-practicing-logistic-regression/)

[Machine learning with R datasets](https://github.com/stedy/Machine-Learning-with-R-datasets)

## Linear regression

In [1]:
# Set working directory
# The data folder defaults to the original author's local path; set the
# SCIENTISTMETRICS_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
import os
DATA_DIR = os.environ.get(
    "SCIENTISTMETRICS_DATA_DIR",
    "D:/Bureau/PythonProject/packages/scientistmetrics/data/",
)
# Every later read_csv call uses a bare file name, so it resolves against this folder.
os.chdir(DATA_DIR)

In [2]:
# warnings message
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the insurance data (comma-separated) from the working directory
# and preview its first rows.
import numpy as np
import pandas as pd

insurance = pd.read_csv("insurance.csv", sep=",")
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summarise the insurance frame: column dtypes and non-null counts
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Powerset
from scientistmetrics.model import powersetmodel

## [Insurance dataset](https://www.kaggle.com/datasets/mirichoi0218/insurance)

In [6]:
# Run the powerset model search on the insurance data, using `charges`
# as the target column.
ols_res = powersetmodel(
    target="charges",
    DTrain=insurance,
)
# First element of the result — presumably the fitted models (TODO confirm
# against the powersetmodel documentation).
ols_model = ols_res[0]

In [7]:
# Second element of the result: the per-subset metrics table (one row per
# predictor combination, with aic, bic, rsquared, ...).
ols_metrics = ols_res[1]
# Left as the cell's last expression so the notebook renders it with its
# rich table display instead of print()'s wrapped, truncated plain text.
ols_metrics

                             predictor  count           aic           bic  \
0                                  age      1  20201.288826  20210.972057   
1                                  sex      1  20274.613948  20284.297179   
2                                  bmi      1  20237.360519  20247.043750   
3                             children      1  20272.126696  20281.809927   
4                               smoker      1  19324.119392  19333.802623   
..                                 ...    ...           ...           ...   
58           age+region+smoker+sex+bmi      5  18954.761838  18993.494762   
59      smoker+age+region+sex+children      5  19049.454169  19088.187093   
60      smoker+age+region+children+bmi      5  18944.964095  18983.697019   
61      smoker+region+sex+children+bmi      5  19220.574191  19259.307115   
62  age+smoker+region+sex+children+bmi      6  18946.848573  18990.423112   

    rsquared  adj. rsquared  expl. var. score     max error  mean abs. erro

## [Life Expectancy](https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who)

In [8]:
# Read the WHO life-expectancy data (comma-separated) from the working
# directory and preview its first rows.
lifexp = pd.read_csv("Life Expectancy Data.csv", sep=",")
lifexp.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [9]:
# Size of the data as a (rows, columns) tuple
lifexp.shape

(2938, 22)

In [10]:
# Quick overview of the data: per-column non-null counts and dtypes
lifexp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [11]:
# Count the missing (null) values in each column
lifexp.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [12]:
# Replace the missing values of every affected column with that column's mean.
# NOTE(review): this also mean-imputes 'Life expectancy', which is later used
# as the regression target — confirm that imputing the target is intended.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Columns that still contain at least one missing value.
cols_with_nan = [name for name in lifexp.columns if lifexp[name].isnull().any()]

# One fit over all affected columns: the mean strategy works column-wise, so
# each column is still filled with its own mean. The guard keeps the cell
# re-runnable, since nothing is left to impute on a second run and
# fit_transform is not expected to accept an empty column selection.
if cols_with_nan:
    lifexp[cols_with_nan] = imputer.fit_transform(lifexp[cols_with_nan])

In [13]:
# Re-count missing values per column to confirm the imputation left none
lifexp.isnull().sum()

Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
 BMI                               0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
dtype: int64

See also: [Life Expectancy Visualization (Kaggle notebook)](https://www.kaggle.com/code/varunsaikanuri/life-expectancy-visualization)

In [14]:
# Clean the column names: drop spaces, spell hyphens as 'to', drop slashes.
# NOTE(review): the hyphen rule also turns 'under-five deaths' into
# 'undertofivedeaths' — confirm that name is acceptable.
clean_columns = lifexp.columns.str.replace(' ', '')
clean_columns = clean_columns.str.replace('-', 'to')
clean_columns = clean_columns.str.replace('/', '')
lifexp.columns = clean_columns
lifexp.columns

Index(['Country', 'Year', 'Status', 'Lifeexpectancy', 'AdultMortality',
       'infantdeaths', 'Alcohol', 'percentageexpenditure', 'HepatitisB',
       'Measles', 'BMI', 'undertofivedeaths', 'Polio', 'Totalexpenditure',
       'Diphtheria', 'HIVAIDS', 'GDP', 'Population', 'thinness1to19years',
       'thinness5to9years', 'Incomecompositionofresources', 'Schooling'],
      dtype='object')

In [15]:
# Run the powerset model search on the life-expectancy data, using
# `Lifeexpectancy` as the target column. This rebinds `ols_res` and
# `ols_model`, replacing the insurance results held under the same names.
# NOTE(review): the frame has 21 candidate columns besides the target; if every
# non-empty subset is fitted (the insurance run produced 63 = 2**6 - 1 rows for
# 6 predictors), that is 2**21 - 1 = 2,097,151 models — confirm this is
# feasible, or bound the subset size.
ols_res = powersetmodel(
    target="Lifeexpectancy",
    DTrain=lifexp,
)
# First element of the result — presumably the fitted models (TODO confirm).
ols_model = ols_res[0]

## Logistic regression

## Diabetes

In [None]:
# Read the diabetes data (comma-separated) from the working directory
# and summarise its columns.
diabetes = pd.read_csv("diabetes.csv", sep=",")
diabetes.info()

In [None]:
# Run the powerset model search on the diabetes data with a logistic model,
# `Outcome` as the target column, split_data disabled, and num_from=2 /
# num_to=3 (presumably the range of predictors per model — TODO confirm).
glm_res = powersetmodel(
    DTrain=diabetes,
    target="Outcome",
    model_type="logistic",
    split_data=False,
    num_from=2,
    num_to=3,
)
# First element of the result — presumably the fitted models (TODO confirm).
glm_model = glm_res[0]

In [None]:
# Second element of the result — presumably the per-model metrics table, as in
# the linear-regression section (TODO confirm).
glm_metrics = glm_res[1]
# Left as the cell's last expression so the notebook renders it with its
# rich display instead of print()'s plain text.
glm_metrics