In [1]:
# !pip install catboost
# !pip install xgboost

In [2]:
import numpy as np
import pandas as pd
import lightgbm

df = pd.read_csv('LifeExpectancyData.csv')
df = df [['Country','Year','Status','Life expectancy']]

In [3]:
df.shape

(2938, 4)

In [4]:
df.describe()

Unnamed: 0,Year,Life expectancy
count,2938.0,2928.0
mean,2007.51872,69.224932
std,4.613841,9.523867
min,2000.0,36.3
25%,2004.0,63.1
50%,2008.0,72.1
75%,2012.0,75.7
max,2015.0,89.0


In [5]:
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy
0,Afghanistan,2015,Developing,65.0
1,Afghanistan,2014,Developing,59.9
2,Afghanistan,2013,Developing,59.9
3,Afghanistan,2012,Developing,59.5
4,Afghanistan,2011,Developing,59.2


In [6]:
# check the status of the first 5 rows for developing countries life expectancy
df[df["Status"] == "Developing"].head().sort_index()

Unnamed: 0,Country,Year,Status,Life expectancy
0,Afghanistan,2015,Developing,65.0
1,Afghanistan,2014,Developing,59.9
2,Afghanistan,2013,Developing,59.9
3,Afghanistan,2012,Developing,59.5
4,Afghanistan,2011,Developing,59.2


In [7]:
# Check for NaN values using isnull()
nan_df = df.isnull()

# Check for NaN values using isna() (alternative to isnull())
nan_df = df.isna()

# Count the number of NaN values in each column
nan_counts = nan_df.sum()

# Check if any NaN values exist in the DataFrame
any_nan = nan_df.any().any()

# Display the DataFrame of NaN values
print(nan_df)

# Display the count of NaN values in each column
print(nan_counts)

# Display if any NaN values exist in the DataFrame
print(any_nan)

      Country   Year  Status  Life expectancy
0       False  False   False            False
1       False  False   False            False
2       False  False   False            False
3       False  False   False            False
4       False  False   False            False
...       ...    ...     ...              ...
2933    False  False   False            False
2934    False  False   False            False
2935    False  False   False            False
2936    False  False   False            False
2937    False  False   False            False

[2938 rows x 4 columns]
Country             0
Year                0
Status              0
Life expectancy    10
dtype: int64
True


In [8]:
# Calculate the mean value of the 'Life expectancy' column
mean_value = df['Life expectancy'].mean()
print(mean_value)
# Replace NaN values in the 'Life expectancy' column with the mean value
df['Life expectancy'].fillna(mean_value, inplace=True)

69.22493169398908


In [9]:
from catboost import CatBoostRegressor, Pool

# separating dependent and independent variables
X = df.drop(columns='Life expectancy')
y = df['Life expectancy']


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

pool_train = Pool(X_train, y_train,
                  cat_features = ['Country','Year','Status'])

pool_test = Pool(X_test, cat_features = ['Country','Year','Status'])

In [11]:
#CatBoost

import time

start = time.time()

cbr = CatBoostRegressor(iterations=100,max_depth=2)

cbr.fit(pool_train, verbose=0)
y_pred = cbr.predict(X_test)
print(y_pred)

end = time.time()

print(end - start)


[76.31329318 61.72198341 58.03753831 61.72198341 56.80452727 81.00392941
 59.94671888 74.90687338 73.52453245 74.90687338 73.97812167 81.19683979
 56.80452727 60.96839823 72.76448243 61.56149929 75.26188762 71.02266052
 59.35361782 57.24737427 71.75527062 69.7160478  69.50757391 61.62764802
 77.54863237 79.07819412 73.39014376 69.50966429 61.56149929 58.83105371
 72.02477594 61.56149929 74.90687338 72.2252273  81.0982648  75.25727743
 61.45015369 81.1025044  75.56745625 72.0496069  77.19228614 81.19683979
 73.27134192 57.28395313 69.50966429 74.33022831 60.96839823 71.28770959
 77.11898904 56.28196316 57.33704643 74.33022831 74.21145996 60.87406285
 69.41323852 61.62764802 60.87406285 73.41501482 60.96839823 70.53099407
 56.71485511 57.24737427 61.56149929 67.3066744  75.67366079 77.64296776
 74.1176862  58.83105371 77.11898904 59.94671888 81.00392941 81.19683979
 66.50326688 79.99741363 71.90802286 76.74293651 69.24667499 76.74293651
 77.11898904 80.64891518 57.24737427 75.17221546 68

In [12]:
from sklearn.metrics import r2_score as RSquared

cb_rsquared = np.sqrt(RSquared(y_test, y_pred))
print("R Squared for CatBoost: ", np.mean(cb_rsquared))

end = time.time()
diff = end - start
print('Execution time:', diff)

R Squared for CatBoost:  0.8804602153710083
Execution time: 0.13175725936889648


In [13]:
#XGBoost
import xgboost as xgb
from sklearn import preprocessing

X = df.drop(columns='Life expectancy')
y = df['Life expectancy']

lbl = preprocessing.LabelEncoder()
#Country','Year','Status
X['Country'] = lbl.fit_transform(X['Country'].astype(str))
X['Year'] = lbl.fit_transform(X['Year'].astype(str))
X['Status'] = lbl.fit_transform(X['Status'].astype(str))

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=1)

start = time.time()
#X_train["Species"].astype("category")
xgbr = xgb.XGBRegressor()

xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)

xgb_rmse = np.sqrt(RSquared(y_test, y_pred))
print("R Squared for XGBoost: ", np.mean(xgb_rmse))

end = time.time()
diff = end - start
print('Execution time:', diff)

R Squared for XGBoost:  0.9749184122015732
Execution time: 0.10914993286132812


In [15]:
# import lightgbm

X = df.drop(columns='Life expectancy')
y = df['Life expectancy']
obj_feat = list(X.loc[:, X.dtypes == 'object'].columns.values)

for feature in obj_feat:
    X[feature] = pd.Series(X[feature], dtype="category")

start = time.time()

lgbmr = lightgbm.LGBMRegressor()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=1)
print(lightgbm)
# below code crashes jupyter kernel
# lightgbm.LGBMRegressor(verbose=1).fit(X_train, y_train)
# y_pred = lgbmr.predict(X_test)

# lgbm_rsquared = np.sqrt(RSquared(y_test, y_pred))
# print()
# print("R squared for LightGBM: ", np.mean(lgbm_rsquared))

# end = time.time()
# diff = end - start
# print('Execution time:', diff)

<module 'lightgbm' from '/opt/homebrew/Caskroom/miniconda/base/envs/py38/lib/python3.8/site-packages/lightgbm/__init__.py'>


### 1. Community support/Performance/Want to tune - XGBOOST
### 2. Hurry to train/Performance/Dont want to tune - LightGBM
### 3. More categorical, GPU, Large data - CatBoost