In [32]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

In [33]:
df = pd.read_csv('idb5yr.txt', delimiter="|")
df2 = pd.read_csv("limits_final/joined_data/ru_joined.csv")

In [34]:
df = df.dropna()

In [37]:
print(df.shape)
print(df2.shape)

(26060, 115)
(64, 69)


In [15]:
train_df = df[df["#YR"] <= 2023]

In [28]:
things_to_add = ["POP", "BIRTH", "DEATH"]

def add_previous_years(df):
    target_map = df['POP'].to_dict()
    df['lag1'] = (df.index - 1).map(target_map)
    df['lag2'] = (df.index - 2).map(target_map)
    df['lag3'] = (df.index - 3).map(target_map)
    df["lag1"].fillna(0, inplace=True)
    df["lag2"].fillna(0, inplace=True)
    df["lag3"].fillna(0, inplace=True)
    return df


def add_next_years(df):
    target_map = df['POP'].to_dict()
    df['lag1'] = (df.index + 1).map(target_map)
    df['lag2'] = (df.index + 2).map(target_map)
    df['lag3'] = (df.index + 3).map(target_map)
    df["goal"] = (df.index - 1).map(target_map)
    df["lag1"].fillna(0, inplace=True)
    df["lag2"].fillna(0, inplace=True)
    df["lag3"].fillna(0, inplace=True)
    df["goal"].fillna(0, inplace=True)
    return df


In [17]:
grouped = df.groupby('GEO_ID', as_index=False)
fixed = grouped.apply(add_previous_years)

In [18]:
new_frame = fixed.reset_index()
new_frame = new_frame.drop(["level_0", "level_1"], axis=1)

In [19]:
print(new_frame)

        #YR       GEO_ID  AREA_KM2  ASFR15_19  ASFR20_24  ASFR25_29  \
0      1990  W140000WOAD     468.0        6.1       37.0       88.8   
1      1991  W140000WOAD     468.0        6.5       37.3       90.6   
2      1992  W140000WOAD     468.0        6.7       37.5       92.3   
3      1993  W140000WOAD     468.0        6.4       35.0       87.0   
4      1994  W140000WOAD     468.0        6.2       33.3       83.0   
...     ...          ...       ...        ...        ...        ...   
26055  2096  W140000WOZW  386847.0       17.8       64.2      102.9   
26056  2097  W140000WOZW  386847.0       17.1       63.1      102.2   
26057  2098  W140000WOZW  386847.0       16.4       62.1      101.5   
26058  2099  W140000WOZW  386847.0       15.7       61.0      100.7   
26059  2100  W140000WOZW  386847.0       15.0       60.0      100.0   

       ASFR30_34  ASFR35_39  ASFR40_44  ASFR45_49  ...  MEDAGE  MEDAGE_M  \
0           85.6       35.3        8.5        0.7  ...    32.8      33.

In [20]:
no_country_df = new_frame.drop(["GEO_ID"], axis=1)

In [21]:
valid_df = no_country_df[no_country_df["#YR"] <= 2023]

In [22]:
print(valid_df.head())

    #YR  AREA_KM2  ASFR15_19  ASFR20_24  ASFR25_29  ASFR30_34  ASFR35_39  \
0  1990     468.0        6.1       37.0       88.8       85.6       35.3   
1  1991     468.0        6.5       37.3       90.6       87.2       35.8   
2  1992     468.0        6.7       37.5       92.3       88.7       36.1   
3  1993     468.0        6.4       35.0       87.0       83.8       34.0   
4  1994     468.0        6.2       33.3       83.0       79.9       32.6   

   ASFR40_44  ASFR45_49    CBR  ...  MEDAGE  MEDAGE_M  MEDAGE_F  SEXRATIO  \
0        8.5        0.7  11.91  ...    32.8      33.0      32.6     1.120   
1        8.4        0.7  12.01  ...    32.3      32.6      32.0     1.116   
2        8.6        0.7  12.21  ...    32.1      32.4      31.7     1.111   
3        8.2        0.7  11.56  ...    32.0      32.4      31.6     1.108   
4        7.9        0.6  11.03  ...    32.5      32.8      32.1     1.106   

   DEPND  DEPND0_14  DEPND65_     lag1     lag2     lag3  
0   38.0       24.6  

In [23]:
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=3)
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [24]:
fold = 0
preds = []
scores = []
regs = []
for train_idx, val_idx in tss.split(valid_df):
    train = valid_df.iloc[train_idx]
    test = valid_df.iloc[val_idx]


    TARGET = 'POP'

    X_train = train.drop([TARGET], axis=1)
    y_train = train[TARGET]

    X_test = test.drop([TARGET], axis=1)
    y_test = test[TARGET]
    reg = DecisionTreeRegressor(random_state=0)
    reg.fit(X_train, y_train)
    regs.append(reg)
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)


#printing scores for each fold

print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')


Score across folds 6908045.8490
Fold scores:[17852901.68895095, 1035413.365618866, 1835822.492578255]


In [25]:
grouped = new_frame.groupby("GEO_ID")

In [26]:
counter = 0
for country, group in grouped:
    print(f"Country: {country}")
    print(group["POP"])
    print()  # Add a blank line for readability
    counter += 1
    good_form = group.drop(["GEO_ID", "POP"], axis=1)
    good_form = good_form[good_form["#YR"] <= 2023]
    y_pred = regs[1].predict(good_form)
    print(y_pred)
    if counter > 2:
        break

Country: W140000WOAD
0      52747
1      56475
2      59722
3      62559
4      63826
       ...  
106    52592
107    52256
108    51919
109    51582
110    51243
Name: POP, Length: 111, dtype: int64

[52747. 56475. 59722. 62559. 63826. 63245. 63358. 64053. 64728. 65073.
 65099. 65316. 65986. 68782. 73247. 76122. 78294. 80757. 82631. 83904.
 84563. 84888. 85164. 85389. 85563. 85690. 85771. 85811. 85813. 85782.
 85722. 85645. 85560. 85468.]
Country: W140000WOAE
111     2369781
112     2536143
113     2704660
114     2876234
115     3051363
         ...   
212    15024060
213    15055416
214    15085746
215    15115076
216    15143404
Name: POP, Length: 106, dtype: int64

[2369781. 2536143. 2704660. 2876234. 3051363. 3230742. 3402348. 3555027.
 3693043. 3819555. 3935820. 4458540. 5532409. 7159278. 8032709. 8020877.
 8302211. 8606553. 8930981. 9170474. 9308415. 9432801. 9543192. 9638206.
 9720156. 9792173. 9856612. 9915803. 9973449.]
Country: W140000WOAF
217    15555612
218    15044289
2

In [27]:
import pickle
with open("regtree", "bw") as f:
    pickle.dump(regs[1], f)

In [30]:
#Here we do the back-prediction
grouped = df.groupby('GEO_ID', as_index=False)
fixed = grouped.apply(add_next_years)
new_frame = fixed.reset_index()
new_frame = new_frame.drop(["level_0", "level_1"], axis=1)
no_country_df = new_frame.drop(["GEO_ID"], axis=1)
valid_df = no_country_df[no_country_df["#YR"] <= 2023]
fold = 0
preds = []
scores = []
regs = []
for train_idx, val_idx in tss.split(valid_df):
    train = valid_df.iloc[train_idx]
    test = valid_df.iloc[val_idx]


    TARGET = 'goal'

    X_train = train.drop([TARGET], axis=1)
    y_train = train[TARGET]

    X_test = test.drop([TARGET], axis=1)
    y_test = test[TARGET]
    reg = DecisionTreeRegressor(random_state=0)
    reg.fit(X_train, y_train)
    regs.append(reg)
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)


#printing scores for each fold

print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')


Score across folds 35868141.3919
Fold scores:[89520447.04183811, 8043376.25554934, 10040600.87844035]


In [31]:
clean_grouped = new_frame.groupby("GEO_ID")
counter = 0
for country, group in clean_grouped:
    print(f"Country: {country}")
    print(group["POP"])
    print()  # Add a blank line for readability
    counter += 1
    good_form = group.drop(["GEO_ID", "goal"], axis=1)
    good_form = good_form[good_form["#YR"] <= 2023]
    y_pred = regs[1].predict(good_form)
    print(y_pred)
    if counter > 2:
        break

Country: W140000WOAD
0      52747
1      56475
2      59722
3      62559
4      63826
       ...  
106    52592
107    52256
108    51919
109    51582
110    51243
Name: POP, Length: 111, dtype: int64

[    0. 52747. 56475. 59722. 62559. 63826. 63245. 63358. 64053. 64728.
 65073. 65099. 65316. 65986. 68782. 73247. 76122. 78294. 80757. 82631.
 83904. 84563. 84888. 85164. 85389. 85563. 85690. 85771. 85811. 85813.
 85782. 85722. 85645. 85560.]
Country: W140000WOAE
111     2369781
112     2536143
113     2704660
114     2876234
115     3051363
         ...   
212    15024060
213    15055416
214    15085746
215    15115076
216    15143404
Name: POP, Length: 106, dtype: int64

[      0. 2369781. 2536143. 2704660. 2876234. 3051363. 3230742. 3402348.
 3555027. 3693043. 3819555. 3935820. 4458540. 5532409. 7159278. 8032709.
 8020877. 8302211. 8606553. 8930981. 9170474. 9308415. 9432801. 9543192.
 9638206. 9720156. 9792173. 9856612. 9915803.]
Country: W140000WOAF
217    15555612
218    15044289
2