In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [3]:
# Lookup from numeric country code to country name, applied to `country_id`.
country_labels = {
    1: "United Kingdom",
    2: "Austria",
    3: "Belgium",
    4: "Denmark",
    5: "Finland",
    6: "France",
    7: "Germany",
    8: "Ireland",
    9: "Italy",
    10: "Luxembourg",
    11: "Netherlands",
    12: "Norway",
    13: "Portugal",
    14: "Russia",
    15: "Spain",
    16: "Sweden",
    17: "Switzerland",
    18: "Greece",
    19: "Hungary",
    20: "Turkey",
    21: "Canada",
    22: "Australia",
    23: "New Zealand",
    24: "Japan",
    25: "China",
    26: "Hong Kong",
    27: "India",
    28: "Malaysia",
    29: "Philippines",
    30: "Singapore",
    31: "South Korea",
    32: "Israel",
    33: "Indonesia",
    34: "Pakistan",
    35: "Thailand",
    36: "Kuwait",
    37: "United Arab Emirates",
    38: "Argentina",
    39: "Brazil",
    40: "Chile",
    41: "Mexico",
    42: "Panama",
    43: "Venezuela",
    44: "South Africa",
    45: "Liberia",
}

# Lookup from numeric region code to region name, applied to `region_id`.
region_labels = {
    1: "Europe",
    2: "Canada & Pacific",
    3: "Asia",
    4: "Latin America",
    5: "Africa",
}

# Read the CSV once, then build `df` with the two id columns relabelled.
# `replace` leaves any value that is not a key of the lookup unchanged.
raw_df = pd.read_csv("../Resources/lagged_data.csv")
df = raw_df.assign(
    country_id=raw_df["country_id"].replace(country_labels),
    region_id=raw_df["region_id"].replace(region_labels),
)

In [4]:
# Display the labelled frame; the rendered output elides the middle rows and
# columns with "...".
df

Unnamed: 0,year,year_id,countries,country_id,regions,region_id,western_emerging,west_emerge_id,capital,latitude,...,fdi_in_usa_million,globalization_100,gdp_per_capita_usd,interaction_gdp,fdi_by_usa_million,interaction_us_fdi,stock_market_capitalization_gdp,interaction_stock_mkt,government_effectiveness,rule_of_law
0,1984,1,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,38387.0,78.0,8918.798085,695666.25060,27811.5,2169297.0,42.114250,3284.911500,1.88,1.630
1,1985,2,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,43555.0,78.0,8435.356627,657957.81690,28675.5,2236689.0,48.675750,3796.708500,1.88,1.630
2,1986,3,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,55935.0,78.0,8415.705492,656425.02840,31665.5,2469909.0,61.734800,4815.314400,1.88,1.630
3,1987,4,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,75519.0,78.0,9631.664376,751269.82140,35520.0,2770560.0,75.434450,5883.887100,1.88,1.630
4,1988,5,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,95698.0,78.5,11864.849370,932017.54420,41731.5,3278301.5,84.921100,6669.454050,1.88,1.630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,2015,32,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,501.0,48.5,721.556425,34995.67343,920.5,44682.0,0.421935,20.462885,-1.36,-0.845
1616,2016,33,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,479.0,47.5,715.783326,34002.40773,925.5,43921.0,0.423860,20.133350,-1.35,-0.835
1617,2017,34,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,457.0,47.0,712.503625,33487.67037,881.0,41407.0,0.423860,19.921420,-1.32,-0.905
1618,2018,35,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,466.0,47.5,706.662578,33562.49206,677.0,32118.0,0.423860,20.133350,-1.31,-0.950


In [5]:
# Name of the column the models predict; used below to extract `y` from `df`.
target = "fdi_in_usa_million"

In [6]:
# Columns removed from `df` to form the feature matrix.
# The label is referenced through `target` rather than repeated as a literal,
# so changing `target` can never leave the new label inside X.
dropped_columns = [
    target,
    "year_id",
    # In the displayed rows these two hold the same labels as
    # country_id / region_id after relabelling.
    "countries",
    "regions",
    "western_emerging",
    "west_emerge_id",
    "capital",
    "latitude",
    "longitude",
    # In the displayed rows each interaction_* value equals a base column
    # multiplied by globalization_100 (presumably true for all rows).
    "interaction_gdp",
    "interaction_us_fdi",
    "interaction_stock_mkt",
]

# Features: every remaining column. Label: the target column as a NumPy array.
X = df.drop(columns=dropped_columns)
y = df[target].values

In [7]:
# List the remaining feature names. Note that 'government_effectiveness '
# carries a trailing space, so every later reference to that column must
# include the space.
X.columns

Index(['year', 'country_id', 'region_id', 'globalization_100',
       'gdp_per_capita_usd', 'fdi_by_usa_million',
       'stock_market_capitalization_gdp', 'government_effectiveness ',
       'rule_of_law'],
      dtype='object')

In [8]:
# Columns passed to the estimator unchanged.
# "government_effectiveness " keeps its trailing space to match the name in X.
numerical_features = [
    "year",
    "globalization_100",
    "gdp_per_capita_usd",
    "fdi_by_usa_million",
    "stock_market_capitalization_gdp",
    "government_effectiveness ",
    "rule_of_law",
]

# Label columns that are one-hot encoded, dropping the first level of each.
categorical_features = ["country_id", "region_id"]

# Shared preprocessing step: numeric passthrough plus one-hot encoded labels.
cf = ColumnTransformer(
    [
        ("numerical", "passthrough", numerical_features),
        ("categorical", OneHotEncoder(drop="first"), categorical_features),
    ]
)

In [9]:
# Seed for the tree-based estimators so repeated runs produce the same scores
# (same value as the KFold seed). Without it the decision tree and random
# forest results change from run to run; re-run the cross-validation cells
# after this change to refresh their saved outputs.
model_seed = 42

# Four candidate models, each preceded by the shared preprocessing step `cf`.
lr_pipeline = make_pipeline(cf, LinearRegression())
# Linear regression on the features plus their pairwise interaction terms.
interaction__lr_pipeline = make_pipeline(
    cf, PolynomialFeatures(interaction_only=True), LinearRegression()
)
decision_tree_pipeline = make_pipeline(
    cf, DecisionTreeRegressor(random_state=model_seed)
)
random_forest_pipeline = make_pipeline(
    cf, RandomForestRegressor(random_state=model_seed)
)

In [10]:
# Six shuffled folds with a fixed seed, shared by every cross-validation run
# below so that all models are scored on the same splits.
# NOTE(review): rows are country-year observations, so shuffled folds place
# different years of the same country in both train and test — confirm this is
# intended rather than a time-aware or grouped split.
k_fold = KFold(n_splits=6, shuffle=True, random_state=42)

In [11]:
# Mean negative MSE of the plain linear regression across the folds
# (values closer to zero are better).
lr_cv = cross_validate(
    lr_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)
lr_cv["test_score"].mean()

-1132559799.8247373

In [12]:
# Mean negative MSE of the linear regression with interaction terms across the
# folds (values closer to zero are better).
interaction_lr_cv = cross_validate(
    interaction__lr_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)
interaction_lr_cv["test_score"].mean()

-386366385.0258911

In [13]:
# Mean negative MSE of the decision tree across the folds
# (values closer to zero are better).
decision_tree_cv = cross_validate(
    decision_tree_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)
decision_tree_cv["test_score"].mean()

-762876152.1077656

In [14]:
# Mean negative MSE of the random forest across the folds
# (values closer to zero are better).
random_forest_cv = cross_validate(
    random_forest_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)
random_forest_cv["test_score"].mean()

-302191507.1667724

In [15]:
# rf (random forest) score from a previous run: -258315420.98629725
# Box plots to check for outliers
# Multicollinearity among the X variables
# ROC curve / AUC
# OLS regression
# SHAP values and plot

In [16]:
# Model running and testing 1: Multiple Linear Regression (number of jobs)
# Model running and testing 2: Multiple Linear Regression with Interactions (number of jobs)
# Model running and testing 3: Decision Tree (parameters)
# Model running and testing 4: Random Forest (parameters)

In [17]:
# Run the best model on the full data, 1982-2019
# Try the best model on data lagged by 1 year
# Try the best model on data lagged by a 2-year average