In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score
from category_encoders import TargetEncoder

# Load the dataset
df = pd.read_csv('led.csv')

print(sklearn.__version__)
# Handling missing values
# Fill missing numeric values with the mean
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing categorical values with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Encode categorical features using Target Encoder
categorical_features = ['Country']
encoder = TargetEncoder(cols=categorical_features)
df[categorical_features] = encoder.fit_transform(df[categorical_features], df['Lifeexpectancy'])

# Define the important features
important_rf_features = ['Country', 'Year', 'AdultMortality', 'Incomecompositionofresources', 'Schooling', 'HIV/AIDS', 'Totalexpenditure', 'Population', 'BMI', 'Measles']

# Define the feature columns and target column
X = df[important_rf_features]
y = df['Lifeexpectancy']


# Split the data into training and testing sets for both models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Regression using all features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)



# Function to predict life expectancy based on input data
def predict_life_expectancy(model, input_data, important_features=None):
    if important_features:
        input_data = input_data[important_features]
    return model.predict(input_data)

# Example usage
example_data = {
#     'Country': ['Afghanistan'],
#     'Year': [2015],  # Example year value
#     'Status': ['Developed'],
#     'GDP': [40000],  # Example GDP value
#     'Schooling': [15],  # Example schooling years
#     'Incomecompositionofresources': [0.8]  # Example value
    'Country': ['Afghanistan'], 
    'Year': [2015], 
    'AdultMortality': [263], 
    'Incomecompositionofresources': [0.479], 
    'Schooling': [10.0], 
    'HIV/AIDS': [0.1], 
    'Totalexpenditure': [8.61], 
    'Population': [25921], 
    'BMI': [19.1], 
    'Measles': [1154]
}


input_data = pd.DataFrame(example_data)
input_data[categorical_features] = encoder.transform(input_data[categorical_features])


prediction = predict_life_expectancy(rf_model, input_data, important_rf_features)
chosen_model = "Random Forest (Important Features)"
    
# # Predict using the chosen model
# if r2_score(y_test, y_pred_rf) > r2_score(y_important_test, y_pred_lr):
#     prediction = predict_life_expectancy(rf_model_important, input_data, important_rf_features)
#     chosen_model = "Random Forest (Important Features)"
# else:
#     prediction = predict_life_expectancy(lr_model, input_data, important_features)
#     chosen_model = "Linear Regression"

print(f"The chosen model for deployment is: {chosen_model}")
print(f"Predicted Life Expectancy: {prediction[0]}")


1.4.2


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


The chosen model for deployment is: Random Forest (Important Features)
Predicted Life Expectancy: 61.743999999999986


In [2]:
import joblib

# Save the models
# joblib.dump(lr_model, 'lr_model.pkl')
joblib.dump(rf_model, 'rf_model.pkl')
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']