In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter('ignore')

In [None]:
# Load the dataset from "2022.csv". The path is relative, so the file must
# sit in the notebook's working directory.
df = pd.read_csv("2022.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the Tukey fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column to screen.

    Returns
    -------
    pandas.DataFrame
        The rows whose value is within the fences (bounds included). Rows
        whose value is missing are dropped as well, because they cannot be
        placed relative to the fences.
    """
    values = df[column]
    # Series.quantile ignores NaN. np.percentile would return NaN for both
    # quartiles as soon as one value is missing, and the comparison below
    # would then discard every row.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_range = q1 - 1.5 * iqr
    upper_range = q3 + 1.5 * iqr
    # Inclusive bounds: a value sitting exactly on a fence is not an outlier,
    # and a constant column (IQR == 0) keeps its rows instead of losing all.
    return df[values.between(lower_range, upper_range)]

# Screen each explanatory column in turn. Every pass filters the frame left by
# the previous one, so the order of this sequence is significant.
for column_name in (
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
):
    df = remove_outliers(df, column_name)

# Visual check for remaining outliers: one box per column of the filtered frame.

df.boxplot(figsize=(20, 8))
plt.show()

In [None]:
# Correlation matrix
#
# Only the numeric columns are correlated. The frame also carries a label
# column ('Country or region'), and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it.

corr_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Scatter plots: one figure per explanatory variable, each plotted against Score

for feature in [
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]:
    sns.scatterplot(x = feature, y = 'Score', data = df)
    plt.show()

In [None]:
# Score of every country in the filtered frame, as one wide bar chart.
fig, ax = plt.subplots(figsize=(25, 10))
sns.barplot(data=df, x='Country or region', y='Score', ax=ax)
plt.xticks(rotation=90)  # country names are long; turn them vertical
plt.show()

In [None]:
# Ten countries with the highest happiness score, best first

top10 = df.sort_values(by='Score', ascending=False).iloc[:10]

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=top10, x='Country or region', y='Score', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Ten countries with the highest GDP per capita, best first

top10_gdp = df.sort_values(by='GDP per capita', ascending=False).iloc[:10]

fig, ax = plt.subplots(figsize=(25, 10))
sns.barplot(data=top10_gdp, x='Country or region', y='GDP per capita', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Ten countries with the highest social support, best first

top10_social = df.sort_values(by='Social support', ascending=False).iloc[:10]

fig, ax = plt.subplots(figsize=(25, 10))
sns.barplot(data=top10_social, x='Country or region', y='Social support', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Ten countries with the highest healthy life expectancy, best first

top10_health = df.sort_values(by='Healthy life expectancy', ascending=False).iloc[:10]

fig, ax = plt.subplots(figsize=(25, 10))
sns.barplot(data=top10_health, x='Country or region', y='Healthy life expectancy', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [23]:
# Splitting the data into features and target
# X keeps every column except the country label and the target itself.
# NOTE(review): if the CSV carries a column derived from Score (e.g. a rank),
# it stays in X and leaks the target - confirm against the file's schema.
X = df.drop(['Country or region', 'Score'], axis=1)
y = df['Score']

In [24]:
# Splitting the dataset into the Training set and Test set (80 % / 20 %).
# The split is seeded: without a fixed random_state every run draws a different
# test set, so the reported metrics and the hyper-parameters chosen from them
# cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [25]:
# Feature Scaling
# The scaler learns mean and standard deviation from the training split only.
# The test split is transformed with those same statistics; re-fitting on it
# would put the two splits on different scales and let test-set information
# shape the preprocessing.
s = StandardScaler()
X_train = s.fit_transform(X_train)
X_test = s.transform(X_test)

In [None]:
# Linear Regression: fit on the training split, evaluate on the test split
lr = LinearRegression().fit(X_train, y_train)
lr_score = lr.score(X_test, y_test)  # score() is R^2 for a regressor
lr_pred = lr.predict(X_test)
print('Linear Regression Score: ', lr_score)

In [None]:
# Decision Tree: fit on the training split, evaluate on the test split
dt = DecisionTreeRegressor().fit(X_train, y_train)
dt_score = dt.score(X_test, y_test)  # score() is R^2 for a regressor
dt_pred = dt.predict(X_test)
print('Decision Tree Score: ', dt_score)

In [None]:
# Random Forest: fit on the training split, evaluate on the test split
rf = RandomForestRegressor().fit(X_train, y_train)
rf_score = rf.score(X_test, y_test)  # score() is R^2 for a regressor
rf_pred = rf.predict(X_test)
print('Random Forest Score: ', rf_score)

In [None]:
# XGBoost: fit on the training split, evaluate on the test split
xgb = XGBRegressor().fit(X_train, y_train)
xgb_score = xgb.score(X_test, y_test)  # score() is R^2 for a regressor
xgb_pred = xgb.predict(X_test)
print('XGBoost Score: ', xgb_score)

In [None]:
# GradientBoostingRegressor: fit on the training split, evaluate on the test split
gb = GradientBoostingRegressor().fit(X_train, y_train)
gb_score = gb.score(X_test, y_test)  # score() is R^2 for a regressor
gb_pred = gb.predict(X_test)
print('GradientBoostingRegressor Score: ', gb_score)

In [None]:
# AdaBoostRegressor: fit on the training split, evaluate on the test split
ada = AdaBoostRegressor().fit(X_train, y_train)
ada_score = ada.score(X_test, y_test)  # score() is R^2 for a regressor
ada_pred = ada.predict(X_test)
print('AdaBoostRegressor Score: ', ada_score)

In [None]:
# KNeighborsRegressor: fit on the training split, evaluate on the test split
knn = KNeighborsRegressor().fit(X_train, y_train)
knn_score = knn.score(X_test, y_test)  # score() is R^2 for a regressor
knn_pred = knn.predict(X_test)
print('KNeighborsRegressor Score: ', knn_score)

In [None]:
# Linear Regression: error metrics on the test split
lr_mse = mean_squared_error(y_test, lr_pred)  # computed once, reused for the RMSE
print('Linear Regression')
print('Mean Squared Error:', lr_mse)
print('Root Mean Squared Error:', np.sqrt(lr_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, lr_pred))
print('R2 Score:', r2_score(y_test, lr_pred))

In [34]:
# Decision Tree: error metrics on the test split
dt_mse = mean_squared_error(y_test, dt_pred)  # computed once, reused for the RMSE
print('Decision Tree')
print('Mean Squared Error:', dt_mse)
print('Root Mean Squared Error:', np.sqrt(dt_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, dt_pred))
print('R2 Score:', r2_score(y_test, dt_pred))


Decision Tree
Mean Squared Error: 0.041122192307692276
Root Mean Squared Error: 0.20278607523124528
Mean Absolute Error: 0.11980769230769234
R2 Score: 0.9520337302795832


In [None]:
# Random Forest: error metrics on the test split
rf_mse = mean_squared_error(y_test, rf_pred)  # computed once, reused for the RMSE
print('Random Forest')
print('Mean Squared Error:', rf_mse)
print('Root Mean Squared Error:', np.sqrt(rf_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, rf_pred))
print('R2 Score:', r2_score(y_test, rf_pred))

In [None]:
# XGBoost: error metrics on the test split
xgb_mse = mean_squared_error(y_test, xgb_pred)  # computed once, reused for the RMSE
print('XGBoost')
print('Mean Squared Error:', xgb_mse)
print('Root Mean Squared Error:', np.sqrt(xgb_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, xgb_pred))
print('R2 Score:', r2_score(y_test, xgb_pred))

In [None]:
# GradientBoostingRegressor: error metrics on the test split
gb_mse = mean_squared_error(y_test, gb_pred)  # computed once, reused for the RMSE
print('GradientBoostingRegressor')
print('Mean Squared Error:', gb_mse)
print('Root Mean Squared Error:', np.sqrt(gb_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, gb_pred))
print('R2 Score:', r2_score(y_test, gb_pred))


In [None]:
# AdaBoostRegressor: error metrics on the test split
ada_mse = mean_squared_error(y_test, ada_pred)  # computed once, reused for the RMSE
print('AdaBoostRegressor')
print('Mean Squared Error:', ada_mse)
print('Root Mean Squared Error:', np.sqrt(ada_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, ada_pred))
print('R2 Score:', r2_score(y_test, ada_pred))

In [None]:
# KNeighborsRegressor: error metrics on the test split
knn_mse = mean_squared_error(y_test, knn_pred)  # computed once, reused for the RMSE
print('KNeighborsRegressor')
print('Mean Squared Error:', knn_mse)
print('Root Mean Squared Error:', np.sqrt(knn_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, knn_pred))
print('R2 Score:', r2_score(y_test, knn_pred))

In [None]:
# Random Forest: exhaustive 5-fold grid search over the values listed below
# NOTE(review): max_features values above the number of columns in X may be
# rejected by the estimator - confirm against the feature count.
rf_params = {
    "max_depth": [3, 5, 8, None],
    "max_features": [3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 5, 10],
    "bootstrap": [True, False],
    "n_estimators": [100, 500, 1000],
    "random_state": [42],
}
rf_model = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, n_jobs=-1, verbose=2)
rf_model.fit(X_train, y_train)
# Test-split score of the search's refitted best estimator.
rf_tuned_score = rf_model.score(X_test, y_test)
print(f"Best Parameters: {rf_model.best_params_}")

In [None]:
# GradientBoostingRegressor: exhaustive 5-fold grid search over the values listed below
gb_params = {
    "learning_rate": [0.01, 0.1, 0.05],
    "max_depth": [3, 5, 8],
    "max_features": [3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 5, 10],
    "n_estimators": [100, 500, 1000],
    "subsample": [1, 0.5, 0.75],
    "random_state": [42],
}
gb_cv_model = GridSearchCV(estimator=gb, param_grid=gb_params, cv=5, n_jobs=-1, verbose=2)
gb_cv_model.fit(X_train, y_train)
print(f"Best Parameters: {gb_cv_model.best_params_}")


In [None]:
# AdaBoostRegressor: exhaustive 5-fold grid search over the values listed below
ada_params = {
    "learning_rate": [0.01, 0.1, 0.05],
    "n_estimators": [100, 500, 1000],
    "random_state": [42],
}
ada_cv_model = GridSearchCV(estimator=ada, param_grid=ada_params, cv=5, n_jobs=-1, verbose=2)
ada_cv_model.fit(X_train, y_train)
print(f"Best Parameters: {ada_cv_model.best_params_}")


In [None]:
# KNeighborsRegressor: 5-fold grid search over n_neighbors = 1 .. 49
knn_params = {"n_neighbors": np.arange(1, 50)}
knn_cv_model = GridSearchCV(estimator=knn, param_grid=knn_params, cv=5, n_jobs=-1, verbose=2)
knn_cv_model.fit(X_train, y_train)
print(f"Best Parameters: {knn_cv_model.best_params_}")

In [None]:
# Default-parameter versions of the four model families that also go through
# hyper-parameter tuning; each is fitted and scored on the same split.
models = [
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    KNeighborsRegressor(),
]

for model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # reused for the RMSE line
    print(model)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
    print('R2 Score:', r2_score(y_test, y_pred))
  

In [None]:
# Random Forest refitted with fixed hyper-parameters
# (presumably the best combination reported by a grid-search run - confirm).
rf_tuned = RandomForestRegressor(
    n_estimators=1000,
    max_depth=8,
    max_features=5,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42,
).fit(X_train, y_train)
rf_tuned_pred = rf_tuned.predict(X_test)
rf_tuned_score = rf_tuned.score(X_test, y_test)
rf_tuned_mse = mean_squared_error(y_test, rf_tuned_pred)
print('Random Forest Tuned')
print('Mean Squared Error:', rf_tuned_mse)
print('Root Mean Squared Error:', np.sqrt(rf_tuned_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, rf_tuned_pred))
print('R2 Score:', r2_score(y_test, rf_tuned_pred))
# The value labelled 'Accuracy' is the R^2 returned by score().
print('Accuracy:', rf_tuned_score)

# GradientBoostingRegressor refitted with fixed hyper-parameters
# (presumably the best combination reported by a grid-search run - confirm).
gb_tuned = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    max_features=5,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    random_state=42,
).fit(X_train, y_train)
gb_tuned_pred = gb_tuned.predict(X_test)
gb_tuned_score = gb_tuned.score(X_test, y_test)
gb_tuned_mse = mean_squared_error(y_test, gb_tuned_pred)
print('GradientBoostingRegressor Tuned')
print('Mean Squared Error:', gb_tuned_mse)
print('Root Mean Squared Error:', np.sqrt(gb_tuned_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, gb_tuned_pred))
print('R2 Score:', r2_score(y_test, gb_tuned_pred))
# The value labelled 'Accuracy' is the R^2 returned by score().
print('Accuracy:', gb_tuned_score)

# AdaBoostRegressor refitted with fixed hyper-parameters
# (presumably the best combination reported by a grid-search run - confirm).
ada_tuned = AdaBoostRegressor(learning_rate=0.01, n_estimators=500, random_state=42)
ada_tuned.fit(X_train, y_train)
ada_tuned_score = ada_tuned.score(X_test,y_test)
ada_tuned_pred = ada_tuned.predict(X_test)
print('AdaBoostRegressor Tuned')
print('Mean Squared Error:', mean_squared_error(y_test, ada_tuned_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, ada_tuned_pred)))
print('Mean Absolute Error:', mean_absolute_error(y_test, ada_tuned_pred))
print('R2 Score:', r2_score(y_test, ada_tuned_pred))
# Report this model's own score (the R^2 from score()). The gradient-boosting
# score was printed here before, which misreported AdaBoost's result.
print('Accuracy:',ada_tuned_score )



In [None]:
# KNeighborsRegressor refitted with a fixed neighbour count
# (presumably the value reported by a grid-search run - confirm).
knn_tuned = KNeighborsRegressor(n_neighbors=7).fit(X_train, y_train)
knn_tuned_pred = knn_tuned.predict(X_test)
knn_tuned_score = knn_tuned.score(X_test, y_test)
knn_tuned_mse = mean_squared_error(y_test, knn_tuned_pred)
print('KNeighborsRegressor Tuned')
print('Mean Squared Error:', knn_tuned_mse)
print('Root Mean Squared Error:', np.sqrt(knn_tuned_mse))
print('Mean Absolute Error:', mean_absolute_error(y_test, knn_tuned_pred))
print('R2 Score:', r2_score(y_test, knn_tuned_pred))
# The value labelled 'Accuracy' is the R^2 returned by score().
print('Accuracy:', knn_tuned_score)

In [45]:
# Test-split scores of each model family: default parameters ("old_score")
# next to the refitted, fixed-parameter version ("new_score").
dictionary = dict(
    Models=[
        "RandomForestRegressor()",
        "GradientBoostingRegressor()",
        "AdaBoostRegressor()",
        "KNeighborsRegressor()",
    ],
    old_score=[rf_score, gb_score, ada_score, knn_score],
    new_score=[rf_tuned_score, gb_tuned_score, ada_tuned_score, knn_tuned_score],
)

In [None]:
# Comparison table of default vs. tuned scores.
# It gets its own name: binding it to `df` would replace the dataset that the
# earlier cells read, so re-running any of them afterwards would fail.
score_comparison = pd.DataFrame(dictionary)
score_comparison

In [None]:
# Random Forest: predicted vs. actual Score on the test split
rf_tuned_pred = rf_tuned.predict(X_test)

fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, rf_tuned_pred, color='red')
# Reference line y = x: a perfect prediction would fall on it.
ax.plot(y_test, y_test, color='blue')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Random Forest Tuned')
plt.show()