### Importation setup

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Explore Data

In [43]:
data = pd.read_csv("data.csv")

In [50]:
threshold = len(data)*0.04
print(threshold)

117.52


In [51]:
cols_to_drop = data.columns[data.isna().sum()<threshold]
print(cols_to_drop)

Index(['Unnamed: 0', 'Nation', 'Survey_Year', 'Country_Category',
       'Mortality_Adults', 'Infant_Deaths_Count', 'Expenditure_Percentage_GDP',
       'Measles_Infection_Count', 'Body_Mass_Index_Avg',
       'Polio_Vaccination_Coverage', 'Diphtheria_Vaccination_Coverage',
       'HIV_AIDS_Prevalence_Rate', 'Thinness', 'Life_Expectancy_Years'],
      dtype='object')


In [52]:
data.dropna(subset=cols_to_drop,inplace = True)

In [53]:
data.duplicated().value_counts()

False    2881
Name: count, dtype: int64

In [None]:
data.isna().sum()

In [None]:
data.head()

In [4]:
data.rename(columns={"Unnamed: 0":"Survey_num"}, inplace= True)

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data[["Country_Category","HIV_AIDS_Prevalence_Rate","Gross_Domestic_Product","Life_Expectancy_Years"]].groupby("Country_Category").mean()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

sns.barplot(data=data, x="Country_Category", y="HIV_AIDS_Prevalence_Rate", ax=axes[0])
axes[0].set_title("HIV Prevalence")
axes[0].tick_params(axis='x', rotation=45)

sns.barplot(data=data, x="Country_Category", y="Gross_Domestic_Product", ax=axes[1])
axes[1].set_title("GDP")
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Cleaning Data

##### clean Nation column

In [45]:
data["Nation"].dtype

string[python]

In [48]:
"""this column is clean"""
data["Nation"].value_counts()


Nation
Syrian Arab Republic    16
Croatia                 16
Montenegro              16
Bahrain                 16
Kenya                   16
                        ..
Nauru                    1
Cook Islands             1
Palau                    1
Monaco                   1
Dominica                 1
Name: count, Length: 194, dtype: Int64

#### clean Survey_Year

In [None]:
data["Survey_Year"].isna().value_counts()

In [33]:
data["Survey_Year"]= pd.to_numeric(data["Survey_Year"],errors='coerce').astype('Int64')
data["Survey_Year"].dtype

Int64Dtype()

#### clean Country_Category

In [35]:
data["Country_Category"].isna().value_counts()

Country_Category
False    2935
True        3
Name: count, dtype: int64

#### Clean Mortality_Adults column

In [55]:
data[data["Mortality_Adults"].isna()]

Unnamed: 0.1,Unnamed: 0,Nation,Survey_Year,Country_Category,Mortality_Adults,Infant_Deaths_Count,Alcohol_Consumption_Rate,Expenditure_Percentage_GDP,Hepatitis_B_Vaccination_Coverage,Measles_Infection_Count,Body_Mass_Index_Avg,Polio_Vaccination_Coverage,Total_Health_Expenditure,Diphtheria_Vaccination_Coverage,HIV_AIDS_Prevalence_Rate,Gross_Domestic_Product,Total_Population,Thinness,Life_Expectancy_Years


In [None]:
data[data["Mortality_Adults"]>600]

######## After verification of this column, I founded that those 
######## countries of death rate > 500 is developing countries that may happen actually 

In [84]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,Nation,Survey_Year,Country_Category,Mortality_Adults,Infant_Deaths_Count,Alcohol_Consumption_Rate,Expenditure_Percentage_GDP,Hepatitis_B_Vaccination_Coverage,Measles_Infection_Count,Body_Mass_Index_Avg,Polio_Vaccination_Coverage,Total_Health_Expenditure,Diphtheria_Vaccination_Coverage,HIV_AIDS_Prevalence_Rate,Gross_Domestic_Product,Total_Population,Thinness,Life_Expectancy_Years
0,0,Syrian Arab Republic,2006,Developing,123.0,8,0.97,122.652333,83.0,517,48.5,83.0,3.78,8.0,0.1,1762.24617,18914977.0,6.4,73.7


#### Clean Infant_Deaths_Count

In [66]:
data["Infant_Deaths_Count"].dtype

Int64Dtype()

In [None]:
data[(data["Infant_Deaths_Count"]>300)]

In [None]:
# Cap values to reasonable maximum (e.g., 300 per 1000)
reasonable_max = 300  # Even 300 is extremely high
data['Infant_Deaths_Count'] = data['Infant_Deaths_Count'].clip(upper=reasonable_max)

print("After clipping:")
print(data[data['Nation'].str.contains('India', case=False, na=False)][['Survey_Year', 'Infant_Deaths_Count']])

In [71]:
data["Infant_Deaths_Count"].max()

np.int64(300)

##### Clean Alcohol_Consumption_Rate column

In [78]:
data["Alcohol_Consumption_Rate"].max()

np.float64(17.87)

In [79]:
data["Alcohol_Consumption_Rate"].min()

np.float64(0.01)

In [76]:
data[(data["Alcohol_Consumption_Rate"]>15)]

Unnamed: 0.1,Unnamed: 0,Nation,Survey_Year,Country_Category,Mortality_Adults,Infant_Deaths_Count,Alcohol_Consumption_Rate,Expenditure_Percentage_GDP,Hepatitis_B_Vaccination_Coverage,Measles_Infection_Count,Body_Mass_Index_Avg,Polio_Vaccination_Coverage,Total_Health_Expenditure,Diphtheria_Vaccination_Coverage,HIV_AIDS_Prevalence_Rate,Gross_Domestic_Product,Total_Population,Thinness,Life_Expectancy_Years
56,56,Lithuania,2012,Developed,176.0,0,15.14,1807.071336,93.0,0,6.9,93.0,6.67,93.0,0.1,14341.836,2987773.0,2.7,73.0
483,483,Estonia,2007,Developing,189.0,0,17.87,1904.12469,95.0,1,56.3,95.0,5.16,95.0,0.1,16586.452,13468.0,2.0,73.0
1145,1145,Estonia,2008,Developing,167.0,0,16.99,225.072362,94.0,0,56.7,95.0,6.6,95.0,0.1,1894.5485,13379.0,2.0,74.2
1851,1851,Lithuania,2013,Developed,178.0,0,15.04,1968.816817,93.0,35,61.4,93.0,6.59,93.0,0.1,15712.82376,2957689.0,2.7,73.0
1859,1859,Estonia,2006,Developing,188.0,0,16.58,244.35108,95.0,27,55.9,95.0,5.1,95.0,0.1,12595.4165,134681.0,2.1,73.0
1975,1975,Belarus,2012,Developing,194.0,0,16.35,91.709621,97.0,10,6.5,98.0,5.1,98.0,0.1,694.243915,9464495.0,2.0,71.9
2007,2007,Lithuania,2014,Developed,169.0,0,15.19,2211.744178,94.0,11,61.9,93.0,6.55,93.0,0.1,16554.97139,2932367.0,2.6,73.4
2118,2118,Belarus,2011,Developing,232.0,0,17.31,846.911307,98.0,50,59.9,98.0,4.92,98.0,0.1,6519.71753,9473172.0,2.0,72.0
2431,2431,Estonia,2009,Developing,156.0,0,15.04,1717.088711,95.0,0,57.1,95.0,6.93,95.0,0.1,14726.31828,1334515.0,2.0,74.9
2540,2540,Estonia,2005,Developing,189.0,0,15.52,153.504526,95.0,2,55.5,96.0,5.2,96.0,0.1,1338.31322,1354775.0,2.1,72.8


In [80]:
data["Alcohol_Consumption_Rate"].isna().sum()

np.int64(174)

##### Clean Expenditure_Percentage_GDP column

In [89]:
data["Expenditure_Percentage_GDP"].dtype

dtype('float64')

In [None]:
data.loc[0:100, ['Nation', 'Expenditure_Percentage_GDP']]


In [91]:
# Only if you have GDP data to convert
if 'Gross_Domestic_Product' in data.columns:
    print("Attempting to convert dollar amounts to percentages...")
    
    # For rows with dollar amounts AND GDP data
    mask = (data['Expenditure_Percentage_GDP'] > 20) & (data['Gross_Domestic_Product'].notna())
    
    if mask.any():
        # Convert: (Health $ ÷ GDP $) × 100 = %
        data.loc[mask, 'Expenditure_Percentage_GDP'] = (
            data.loc[mask, 'Expenditure_Percentage_GDP'] / 
            data.loc[mask, 'Gross_Domestic_Product']
        ) * 100
        
        print(f"Converted {mask.sum()} rows from dollars to percentages")
        print(f"New range: {data['Expenditure_Percentage_GDP'].min():.2f}% to {data['Expenditure_Percentage_GDP'].max():.2f}%")
    else:
        print("Cannot convert - missing GDP data for dollar amount rows")

Attempting to convert dollar amounts to percentages...
Converted 1899 rows from dollars to percentages
New range: 0.00% to 34.41%


In [93]:
data.columns

Index(['Unnamed: 0', 'Nation', 'Survey_Year', 'Country_Category',
       'Mortality_Adults', 'Infant_Deaths_Count', 'Alcohol_Consumption_Rate',
       'Expenditure_Percentage_GDP', 'Hepatitis_B_Vaccination_Coverage',
       'Measles_Infection_Count', 'Body_Mass_Index_Avg',
       'Polio_Vaccination_Coverage', 'Total_Health_Expenditure',
       'Diphtheria_Vaccination_Coverage', 'HIV_AIDS_Prevalence_Rate',
       'Gross_Domestic_Product', 'Total_Population', 'Thinness',
       'Life_Expectancy_Years'],
      dtype='object')

##### clean Hepatitis_B_Vaccination_Coverage

In [95]:
data['Hepatitis_B_Vaccination_Coverage'].isna().value_counts()

Hepatitis_B_Vaccination_Coverage
False    2358
True      523
Name: count, dtype: int64

In [100]:
(data["Hepatitis_B_Vaccination_Coverage"]<30).value_counts()

Hepatitis_B_Vaccination_Coverage
False    2688
True      193
Name: count, dtype: int64

In [None]:
data['Polio_Vaccination_Coverage'].isna().value_counts()

In [99]:
(data['Polio_Vaccination_Coverage']<30).value_counts()

Polio_Vaccination_Coverage
False    2706
True      175
Name: count, dtype: int64

In [None]:
data['Diphtheria_Vaccination_Coverage'].isna().value_counts()

In [102]:
# Find rows where HepB is null but Polio and Diphtheria have values
null_hepb = data[
    data['Hepatitis_B_Vaccination_Coverage'].notna() & 
    data['Polio_Vaccination_Coverage'].notna() & 
    data['Diphtheria_Vaccination_Coverage'].notna()
]

print(f"Found {len(null_hepb)} rows where Hepatitis B is null but other vaccinations have values")
print(null_hepb[['Nation', 'Survey_Year', 
                 'Hepatitis_B_Vaccination_Coverage', 
                 'Polio_Vaccination_Coverage', 
                 'Diphtheria_Vaccination_Coverage']])

Found 2358 rows where Hepatitis B is null but other vaccinations have values
                    Nation  Survey_Year  Hepatitis_B_Vaccination_Coverage  \
0     Syrian Arab Republic         2006                              83.0   
2               Montenegro         2007                               9.0   
3                  Bahrain         2014                              98.0   
5                     Chad         2008                              17.0   
6          Solomon Islands         2005                              83.0   
...                    ...          ...                               ...   
2932           Saint Lucia         2014                              99.0   
2933                 Malta         2011                              82.0   
2934         Guinea-Bissau         2010                              83.0   
2936                 Italy         2003                              95.0   
2937               Eritrea         2005                              96.0   

##### Clean Measles_Infection_Count Column

In [None]:
(data[data['Measles_Infection_Count']>200])

##### clean Body Mass column

In [110]:
(data[data['Body_Mass_Index_Avg']>40])

Unnamed: 0.1,Unnamed: 0,Nation,Survey_Year,Country_Category,Mortality_Adults,Infant_Deaths_Count,Alcohol_Consumption_Rate,Expenditure_Percentage_GDP,Hepatitis_B_Vaccination_Coverage,Measles_Infection_Count,Body_Mass_Index_Avg,Polio_Vaccination_Coverage,Total_Health_Expenditure,Diphtheria_Vaccination_Coverage,HIV_AIDS_Prevalence_Rate,Gross_Domestic_Product,Total_Population,Thinness,Life_Expectancy_Years
0,0,Syrian Arab Republic,2006,Developing,123.0,8,0.97,6.96,83.0,517,48.5,83.0,3.78,8.0,0.1,1762.246170,18914977.0,6.4,73.7
1,1,Croatia,2006,Developed,113.0,0,11.83,13.69,,1,58.1,96.0,6.95,96.0,0.1,11363.418450,444.0,1.8,75.9
2,2,Montenegro,2007,Developing,125.0,0,4.98,11.39,9.0,0,57.2,92.0,6.74,92.0,0.1,5957.145693,615875.0,2.1,74.2
3,3,Bahrain,2014,Developing,7.0,0,1.57,1.47,98.0,46,62.9,98.0,4.98,98.0,0.1,24983.379200,,6.1,76.8
6,6,Solomon Islands,2005,Developing,24.0,0,0.90,28.48,83.0,0,42.1,9.0,7.83,89.0,0.1,88.874858,469885.0,1.3,67.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2925,Austria,2013,Developed,68.0,0,11.82,16.33,95.0,0,56.6,95.0,11.14,95.0,0.1,554.715320,8479375.0,1.8,81.1
2930,2930,Iraq,2011,Developing,144.0,32,0.17,4.87,77.0,15,56.5,8.0,3.32,79.0,0.1,5854.614497,3172753.0,5.3,77.0
2932,2932,Saint Lucia,2014,Developing,139.0,0,9.97,0.00,99.0,0,46.7,99.0,6.72,99.0,0.1,,,4.3,75.0
2933,2933,Malta,2011,Developed,59.0,0,6.91,15.78,82.0,3,68.0,96.0,9.60,96.0,0.1,22821.847000,416268.0,0.8,87.0


In [None]:
test_sample = data[data['Body_Mass_Index_Avg'] > 40].copy()
print("TEST - What happens if we divide >60 by 2:")
print("=" * 50)

for idx, row in test_sample.iterrows():
    original = row['Body_Mass_Index_Avg']
    corrected = original / 2
    print(f"{row['Nation']:25} : {original:6.1f} → {corrected:5.1f}")

data.loc[data['Body_Mass_Index_Avg'] > 40, 'Body_Mass_Index_Avg'] /= 2

TEST - What happens if we divide >60 by 2:


In [None]:
data[data['Body_Mass_Index_Avg']<10]

### filling null values in columns