Get data from file

In [80]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt

# Đọc dữ liệu từ tập tin CSV
data = pd.read_csv("Life Expectancy Data.csv")

Fill nan data and normalize with SimpleInputer( mean value and most frequent value)

In [81]:
def imputing_data(data):
    numeric_columns = data.select_dtypes(include='number').columns
    categorical_columns = data.select_dtypes(exclude='number').columns

    imputer_numeric = SimpleImputer(strategy='mean')
    imputer_categorical = SimpleImputer(strategy='most_frequent')

    data[numeric_columns] = imputer_numeric.fit_transform(data[numeric_columns])
    data[categorical_columns] = imputer_categorical.fit_transform(data[categorical_columns])

    label_encoder = LabelEncoder()
    for column in categorical_columns:
        if data[column].dtype == 'object':
            data[column] = data[column].fillna('Unknown')  # Replace missing values with 'Unknown'
        data[column] = label_encoder.fit_transform(data[column])

    data[numeric_columns] = data[numeric_columns].round(1)

    return data

imputing_data(data)
display(data)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0,2015.0,1,65.0,263.0,62.0,0.0,71.3,65.0,1154.0,...,6.0,8.2,65.0,0.1,584.3,33736494.0,17.2,17.3,0.5,10.1
1,0,2014.0,1,59.9,271.0,64.0,0.0,73.5,62.0,492.0,...,58.0,8.2,62.0,0.1,612.7,327582.0,17.5,17.5,0.5,10.0
2,0,2013.0,1,59.9,268.0,66.0,0.0,73.2,64.0,430.0,...,62.0,8.1,64.0,0.1,631.7,31731688.0,17.7,17.7,0.5,9.9
3,0,2012.0,1,59.5,272.0,69.0,0.0,78.2,67.0,2787.0,...,67.0,8.5,67.0,0.1,670.0,3696958.0,17.9,18.0,0.5,9.8
4,0,2011.0,1,59.2,275.0,71.0,0.0,7.1,68.0,3013.0,...,68.0,7.9,68.0,0.1,63.5,2978599.0,18.2,18.2,0.5,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,192,2004.0,1,44.3,723.0,27.0,4.4,0.0,68.0,31.0,...,67.0,7.1,65.0,33.6,454.4,12777511.0,9.4,9.4,0.4,9.2
2934,192,2003.0,1,44.5,715.0,26.0,4.1,0.0,7.0,998.0,...,7.0,6.5,68.0,36.7,453.4,12633897.0,9.8,9.9,0.4,9.5
2935,192,2002.0,1,44.8,73.0,25.0,4.4,0.0,73.0,304.0,...,73.0,6.5,71.0,39.8,57.3,125525.0,1.2,1.3,0.4,10.0
2936,192,2001.0,1,45.3,686.0,25.0,1.7,0.0,76.0,529.0,...,76.0,6.2,75.0,42.1,548.6,12366165.0,1.6,1.7,0.4,9.8


Linear regression:

In [82]:
# Tạo X và y cho mô hình Linear Regression
X = data.drop(["Life expectancy "], axis=1)  # Exclude "Year" column
y = data["Life expectancy "]

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Huấn luyện mô hình Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Dự đoán tuổi thọ trên tập kiểm tra
y_pred = model.predict(X_test)


MAE, MSE, R^2

In [83]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R^2 Score:", r2)


Mean Absolute Error (MAE): 2.8450306919914943
Mean Squared Error (MSE): 15.113175483960552
R^2 Score: 0.8255539566536538


cross-validation

In [84]:
cv_scores = cross_val_score(model, X, y, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Average Cross-Validation Score:", cv_scores.mean())


Cross-Validation Scores: [0.81425793 0.80479476 0.79705785 0.7076973  0.82010344]
Average Cross-Validation Score: 0.7887822580077053


predict lifespan to 2030

In [85]:
countries = data["Country"].unique()

for country in countries:
    print("Country:", country)

    country_data = data[data["Country"] == country]

    # Tạo dữ liệu dự đoán từ 2016 đến 2030
    years = list(range(2016, 2031))
    last_row_data = country_data.iloc[-1].drop("Life expectancy ")  # Exclude "Life expectancy " column
    replicated_data = pd.DataFrame([last_row_data.values.tolist()] * (len(years) - 1), columns=X.columns)
    X_future_country = pd.DataFrame({'Year': years})

    # Merge Year column and replicated data
    X_future_country = pd.concat([X_future_country, replicated_data], axis=1)

    # Ensure the columns are in the correct order and have the same data types
    X_future_country = X_future_country[X.columns].astype(X.dtypes.to_dict())

    # Dự đoán tuổi thọ từ năm 2016 đến 2030
    y_pred_country = model.predict(X_future_country)

    # Biểu đồ dự đoán tuổi thọ của quốc gia hiện tại từ 2015 đến 2030
    plt.figure(figsize=(10, 6))
    plt.plot(country_data["Year"], country_data["Life expectancy "], label="Actual")
    plt.plot(years, y_pred_country, label="Predicted")
    plt.xlabel("Year")
    plt.ylabel("Life Expectancy")
    plt.title(f"Life Expectancy in {country} (2015 - 2030)")
    plt.legend()
    plt.grid(True)
    plt.show()

    print("\n")

Country: 0


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer: Error while type casting for column 'Country'