In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# Load the GDP dataset and preview its first rows.
# NOTE(review): ".csv" looks like a placeholder path — confirm the real filename.
file_path = ".csv"
# Use a plain "," separator. The previous "\," was an invalid string escape and,
# being two characters long, was treated by pandas as a regular expression,
# which forces the slower python engine and ignores quoted fields.
df_gdp = pd.read_csv(file_path, sep=",")

print(df_gdp.head())

In [None]:
# Load the modelling dataset and preview its first rows.
# NOTE(review): ".csv" looks like a placeholder path — confirm the real filename.
file_path = ".csv"
# Use a plain "," separator. The previous "\," was an invalid string escape and,
# being two characters long, was treated by pandas as a regular expression,
# which forces the slower python engine and ignores quoted fields.
df = pd.read_csv(file_path, sep=",")

print(df.head())

In [None]:

# Encode the categorical columns as integer codes. Each column gets its own
# encoder so both label mappings remain available (e.g. for inverse_transform);
# refitting a single shared encoder would discard the 'sex' mapping.
sex_encoder = LabelEncoder()
# LabelEncoder assigns codes in sorted class order; presumably F -> 0, M -> 1 — verify against the data.
df['sex'] = sex_encoder.fit_transform(df['sex'])
profession_encoder = LabelEncoder()
df['profession'] = profession_encoder.fit_transform(df['profession'])
# Backward-compatible alias: the original single encoder ended up fitted on 'profession'.
label_encoder = profession_encoder

# Selecting features (X) and target (y)
X = df[['year', 'sex', 'profession', 'income']]
y = df['gdp']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model: Random Forest Regressor (seeded so repeated runs give the same forest)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Report performance on the held-out rows so the split is actually used.
if len(X_test) > 1:  # R^2 is not well-defined for fewer than two samples
    print(f"Hold-out R^2: {model.score(X_test, y_test):.3f}")


# Predict GDP for the next 5 years, counted from the last year present in the data.
# (Feeding the literal values 1..5 as 'year' would not correspond to "next" years
# unless the data happened to end at year 0.)
last_observed_year = df['year'].max()
years_to_predict = (last_observed_year + np.arange(1, 6)).reshape(-1, 1)  # column vector, shape (5, 1)

# Build the scenario rows as a DataFrame with the training column names and order,
# so the model does not have to rely on positional columns of a bare array.
future_features = pd.DataFrame({
    'year': years_to_predict.ravel(),
    'sex': 1,          # example value (encoded)
    'profession': 1,   # example value (encoded)
    'income': 1000,    # example value
})[list(X.columns)]

# NOTE(review): tree ensembles do not extrapolate beyond the range of 'year' seen
# in training, so these forecasts may simply mirror the latest observed years.
predicted_gdp = model.predict(future_features)

# Output predictions
for year, gdp in zip(years_to_predict.ravel(), predicted_gdp):
    print(f"Predicted GDP for Year {year}: {gdp:.2f}")

# Show how much each input column contributes to the fitted forest's GDP predictions.
feature_names, feature_importances = X.columns, model.feature_importances_

# Horizontal bar chart drawn on the current axes: one bar per feature.
axes = plt.gca()
axes.barh(feature_names, feature_importances)
axes.set_xlabel("Feature Importance")
axes.set_title("Feature Importance in Predicting GDP")
plt.show()