In [112]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [113]:
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/scikit-learn deprecation
# and SettingWithCopy warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [114]:
# Load the workbook (presumably the IMF WEO October 2020 export, judging by
# the file name — confirm the data source).
df = pd.read_excel("WEOOct2020all.xlsx")

# Convert every year column (1980-2025) to numbers.
# Cells may arrive either as real numbers or as strings such as "1,234.5".
# Casting to str first keeps already-numeric cells intact: calling `.str` on a
# numeric column raises, and on a mixed object column it silently turns the
# numeric cells into NaN (which the fillna below would then zero out).
for year in range(1980, 2026):
    as_text = df[year].astype(str).str.replace(',', '', regex=False)
    # Unparseable markers (e.g. "n/a", "--") are coerced to NaN and then
    # replaced by 0, so missing values are indistinguishable from real zeros
    # in all downstream cells.
    df[year] = pd.to_numeric(as_text, errors='coerce').fillna(0)


FileNotFoundError: [Errno 2] No such file or directory: 'WEOOct2020all.xlsx'

In [None]:
# Preview the first rows of the loaded table after the numeric conversion.
df.head()

### Top 10 countries that grew "Gross domestic product per capita" the most over the last decade

In [None]:
# Keep only the rows whose subject code is NGDPRPPPPC (the GDP-per-capita
# series this section ranks countries by).
is_gdp_per_capita = df["WEO Subject Code"].eq("NGDPRPPPPC")
gdp_data = df.loc[is_gdp_per_capita]
gdp_data

In [None]:
# Growth over the last decade = value in the latest year minus the value ten
# years earlier. The baseline is derived from the latest year so the two can
# never drift apart (the baseline previously pointed at 2024, i.e. the future).
current_year = 2023
current_time = gdp_data[current_year]
ten_years_ago = gdp_data[current_year - 10]

# `gdp_data` is a filtered slice of `df`; `assign` returns a new frame instead
# of writing into that slice, and keeps this cell safe to re-run.
gdp_data = gdp_data.assign(Growth=current_time - ten_years_ago)

In [None]:
# Rank rows by Growth, largest first, and keep the first ten.
ranked_by_growth = gdp_data.sort_values('Growth', ascending=False)
top10 = ranked_by_growth.iloc[:10]

In [None]:
# Display the ten rows with the largest Growth values.
top10

### OECD countries' "Population" growth over the last decade

In [None]:
# Three-letter country codes of the OECD members, matched against the `ISO`
# column of the dataset.
# NOTE(review): 37 codes are listed — confirm this is the membership snapshot
# intended for the analysis.
oecd_countries_iso_codes = """
    AUS AUT BEL CAN CHL COL CZE DNK EST FIN
    FRA DEU GRC HUN ISL IRL ISR ITA JPN KOR
    LVA LTU LUX MEX NLD NZL NOR POL PRT SVK
    SVN ESP SWE CHE TUR GBR USA
""".split()

In [None]:
# Population rows for OECD members only.
in_oecd = df['ISO'].isin(oecd_countries_iso_codes)
is_population = df["Subject Descriptor"].eq("Population")
oecd_df = df.loc[in_oecd & is_population]

In [None]:
# Country name plus the ten yearly columns 2013-2022.
columns_to_keep = ['Country', *range(2013, 2023)]
oecd_population_df = oecd_df.loc[:, columns_to_keep]

In [None]:
# Reshape for plotting: one column per country, one row per year.
# `set_index` is applied to a fresh copy rather than in place, so this cell can
# be re-run (an in-place call fails the second time because 'Country' has
# already moved into the index) and never writes into a slice of `oecd_df`.
oecd_population_df_T = oecd_population_df.set_index('Country').transpose()

In [None]:
# Line chart with one series per OECD country.
# `DataFrame.plot` opens its own figure unless it is handed an Axes, so the
# figure is created explicitly and passed in via `ax=`; otherwise the requested
# size is ignored and an empty extra figure is rendered.
fig, ax = plt.subplots(figsize=(12, 8))
oecd_population_df_T.plot(kind='line', marker='o', ax=ax)
ax.set_title("OECD Countries' Population Growth Over the Last Decade")
ax.set_xlabel("Year")
ax.set_ylabel("Population")
ax.legend(title='Country', bbox_to_anchor=(1, 1))
plt.show()


### Plot GDP growth figures in separate charts and save them as PNG files

Plot the GDP figures of each top-10 country in its own chart and save each chart as a PNG file.


In [None]:
# Reuse `columns_to_keep` from the OECD population section ('Country' plus the
# 2013-2022 year columns). Note that the Task 4 section later rebinds that
# name, so this cell must run before it.
top10_filtered = top10.loc[:, columns_to_keep]

top10_filtered

In [None]:
# Render one line chart per top-10 country and save it under photos/.
output_dir = Path('photos')
# savefig does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Every row shares the same year columns, so compute the x values once.
years = [int(year) for year in top10_filtered.columns[1:]]

for _, row in top10_filtered.iterrows():
    country = row['Country']
    # Everything after the first ('Country') entry is a yearly value.
    gdp_values = row.iloc[1:].values

    fig, ax = plt.subplots()
    ax.plot(years, gdp_values, marker='o')
    ax.set_title(f'GDP Growth Over Years - {country}')
    ax.set_xlabel('Year')
    ax.set_ylabel('GDP')
    ax.grid(True)

    fig.savefig(output_dir / f'{country}_GDP_Growth.png')

    # Close each figure so they do not accumulate in memory and no blank
    # figure is left behind for the notebook to render.
    plt.close(fig)

print("Saved successfully.")


### Task 4: Cluster countries by GDP and volume of exports

In [None]:
# One slice per indicator (NGDPRPPPPC and TXG_RPCH), without the descriptor
# text column.
gdp_data = df.loc[df['WEO Subject Code'].eq('NGDPRPPPPC')].drop(columns='Subject Descriptor')
export_data = df.loc[df['WEO Subject Code'].eq('TXG_RPCH')].drop(columns='Subject Descriptor')

In [None]:
# Join the GDP and export rows per country. Columns present on both sides get
# the '_gdp' / '_export' suffix, so each year yields "<year>_gdp" and
# "<year>_export".
suffixed_year_columns = [
    f"{year}_{kind}"
    for year in range(1980, 2026)
    for kind in ("gdp", "export")
]
columns_to_keep = ['Country', *suffixed_year_columns]

merged = gdp_data.merge(export_data, on='Country', suffixes=('_gdp', '_export'))
combined_data = merged[columns_to_keep]
combined_data

In [None]:
# K-means clustering over every yearly GDP and export column.
N_CLUSTERS = 5
TOP_N_LABELS = 5   # countries annotated per cluster
PLOT_YEAR = 2023   # year shown on both axes and used to rank the labels

gdp_col = f'{PLOT_YEAR}_gdp'
export_col = f'{PLOT_YEAR}_export'

X = combined_data[columns_to_keep[1:]]
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
# `assign` returns a new frame instead of writing into the column-sliced
# `combined_data`, and keeps the cell re-runnable.
combined_data = combined_data.assign(cluster=kmeans.fit_predict(X))

# Scatter each cluster and annotate its top countries by GDP.
fig, ax = plt.subplots(figsize=(20, 16))
for cluster_id in range(N_CLUSTERS):
    cluster_data = combined_data[combined_data['cluster'] == cluster_id]
    ax.scatter(cluster_data[gdp_col], cluster_data[export_col], label=f'Cluster {cluster_id + 1}')

    # Rank by the same year that is plotted (this used to rank by 2018 while
    # plotting 2023, so the labels did not match the points' positions).
    top_countries = cluster_data.nlargest(TOP_N_LABELS, gdp_col)
    for _, country in top_countries.iterrows():
        ax.annotate(country['Country'], (country[gdp_col], country[export_col]))

ax.set_xlabel('GDP')
ax.set_ylabel('Volume of Exports')
ax.set_title('Clusters of Countries based on GDP and Volume of Exports')
ax.legend()
plt.show()


### Task 5: Count 2015 observations per subject descriptor

In [None]:
# Number of rows with a 2015 value for each subject descriptor, most frequent first.
# NOTE(review): the load cell replaces missing year values with 0, so this
# `notna` filter keeps every row if that conversion has run — confirm whether
# the counts are meant to exclude originally-missing values.
has_2015_value = df[2015].notna()
result = (
    df.loc[has_2015_value]
    .groupby('Subject Descriptor')[2015]
    .count()
    .sort_values(ascending=False)
)
result

### Task 6: Predict GDP per capita (`NGDPRPPPPC`) from the other indicators

In [None]:
!pip install pycountry-convert

In [None]:
import pycountry_convert as pc

def country_to_continent(country_name: str) -> str:
    """Return the continent name for ``country_name``.

    The name is resolved through ``pycountry_convert`` in three steps:
    country name -> alpha-2 code -> continent code -> continent name.

    Args:
        country_name: Country name as it appears in the dataset.

    Returns:
        The continent name, or an empty string when any lookup step fails.
    """
    try:
        alpha2_code = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(alpha2_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best-effort lookup: anything the library cannot resolve maps to "".
        # Catching Exception (not a bare except) still lets KeyboardInterrupt
        # and SystemExit propagate, so a long `.apply` can be interrupted.
        return ""

In [None]:
# Subject codes removed from the table: per the original note, these are the
# other GDP-related series, leaving NGDPRPPPPC as the only GDP series.
exclude = [
    "PPPPC", "NGDPDPC", "NGDPPC", "NGDPRPC", "NGDP_D", "PPPGDP",
    "NGDPD", "NGDP", "NGDP_RPCH", "NGDP_R", "PPPSH",
]

# Descriptive columns that are dropped from the table.
metadata_columns = [
    "WEO Country Code",
    "ISO",
    "Subject Notes",
    "Subject Descriptor",
    "Units",
    "Scale",
    "Country/Series-specific Notes",
    "Estimates Start After",
]

is_excluded = df["WEO Subject Code"].isin(exclude)
gdp_data = df.loc[~is_excluded].drop(columns=metadata_columns)

gdp_data.head()

In [None]:
# Long format: one row per (Country, subject code, year).
melted_df = gdp_data.melt(
    id_vars=['Country', 'WEO Subject Code'],
    var_name='year',
    value_name='value',
)

# Wide format: one row per (year, Country) and one column per subject code,
# with a plain integer index and an unnamed column axis.
gdp_data = (
    melted_df
    .pivot_table(index=['year', 'Country'], columns='WEO Subject Code', values='value')
    .reset_index()
    .rename_axis(columns=None)
)

In [None]:
# Replace the country name with its continent ("" when the lookup fails).
gdp_data = (
    gdp_data
    .assign(Continent=gdp_data["Country"].map(country_to_continent))
    .drop(columns="Country")
)

In [None]:
# NGDPRPPPPC is the prediction target; every other column is a feature.
target = gdp_data["NGDPRPPPPC"]
features = gdp_data.drop(columns="NGDPRPPPPC")

# One-hot encode the non-numeric columns (e.g. Continent).
features_encoded = pd.get_dummies(features)

# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear Regression baseline (currently disabled)

# model = LinearRegression()

# model.fit(X_train, y_train)

# predictions = model.predict(X_test)

# mse = mean_squared_error(y_test, predictions)
# print(f"Mean Squared Error: {mse}")

In [None]:
# Random Forest
# A fixed seed (the same 42 used for the train/test split) makes the fitted
# trees, the reported error and the feature importances reproducible.
model = RandomForestRegressor(max_depth=30, random_state=42)

model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

In [None]:
# First ten (prediction, actual) pairs from the test set.
first_pairs = zip(predictions[:10], y_test.iloc[:10])
print(list(first_pairs))

In [None]:
feature_importances = model.feature_importances_

# One row per encoded feature, most important first.
feature_importance_df = pd.DataFrame(
    {"Feature": features_encoded.columns, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

# Keep the five most important features.
top_5_features = feature_importance_df.head(5)

# Collapse one-hot names such as "Continent_<name>" back to the source column
# "Continent". Only the Feature column holds labels, so it is rewritten
# directly instead of mapping a function over every cell (DataFrame.map is
# only available in pandas >= 2.1).
is_continent_dummy = top_5_features["Feature"].astype(str).str.startswith("Continent")
top_5_features = top_5_features.assign(
    Feature=top_5_features["Feature"].mask(is_continent_dummy, "Continent")
)
top_5_features

### Train another predictor that uses those top 5 features

In [None]:
# Several one-hot columns can collapse to the same "Continent" label, so the
# names are de-duplicated (order preserved); otherwise the same column would be
# selected, and later encoded, more than once.
top_feature_names = list(dict.fromkeys(top_5_features["Feature"]))
selected_features = top_feature_names + ["NGDPRPPPPC"]

new_gdp_data = gdp_data[selected_features]

features = new_gdp_data.drop(columns="NGDPRPPPPC")
target = new_gdp_data["NGDPRPPPPC"]

# One-hot encode the non-numeric columns (e.g. Continent).
features_encoded = pd.get_dummies(features)

X_train, X_test, y_train, y_test = train_test_split(
    features_encoded, target, test_size=0.2, random_state=42
)

# Seeded so the reported error is reproducible.
model = RandomForestRegressor(random_state=42)

model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

In [None]:
# Display the reduced feature frame (before one-hot encoding).
features

In [None]:
# First ten (prediction, actual) pairs from the test set.
first_pairs = zip(predictions[:10], y_test.iloc[:10])
print(list(first_pairs))

In [None]:
# Serialize the most recently trained model to model.pkl.
import pickle

with open("model.pkl", "wb") as model_file:
    pickle.dump(model, file=model_file)