In [10]:
def years_dimension(colname='id', starting_year=1900, end_year=None):
    """
    Build the content of the years dimension: one row per calendar year.

    Parameters
    ----------
    colname : str, default 'id'
        Name of the single column holding the year values.
    starting_year : int, default 1900
        First year of the range (inclusive).
    end_year : int or None, default None
        Last year of the range (inclusive). ``None`` means the current year
        according to the local system clock.

    Returns
    -------
    pandas.DataFrame
        One-column frame with the consecutive integer years from
        ``starting_year`` to ``end_year``.
    """
    import datetime
    import numpy as np
    import pandas as pd

    if end_year is None:
        # Read the year straight off the datetime instead of formatting it to
        # a string and parsing it back to an int.
        end_year = datetime.datetime.now().year

    # np.arange excludes its stop value, hence the +1 to keep end_year.
    year_list = np.arange(starting_year, end_year + 1)
    return pd.DataFrame(year_list, columns=[colname])

In [11]:
def create_schema(db_url, schema_path):
    """
    Create the schema in the specified database.

    Reads the whole SQL script at ``schema_path`` and executes it as a single
    statement against the database at ``db_url``.

    Parameters
    ----------
    db_url : str
        SQLAlchemy connection URL of the target database.
    schema_path : str
        Path to the SQL file containing the schema definition.

    Returns
    -------
    None
    """
    import sqlalchemy
    from sqlalchemy import create_engine

    # The context manager closes the file even when reading fails.
    with open(schema_path) as schema_file:
        escaped_sql = sqlalchemy.text(schema_file.read())

    engine = create_engine(db_url)
    try:
        # engine.begin() commits on success and rolls back on error. It is
        # available on SQLAlchemy 1.x and 2.x, whereas Engine.execute() was
        # removed in 2.0.
        with engine.begin() as conn:
            conn.execute(escaped_sql)
    finally:
        # Release the pooled connections held by this short-lived engine.
        engine.dispose()

    return

In [12]:
def populate_schema(db_url_source, db_url_schema):
    """
    Populate schema with the proper data.

    Every table found in the schema database is filled from the table of the
    same name in the source database, restricted to the columns the two
    tables have in common. Tables whose name contains ``'dim'`` are loaded
    first and all remaining tables afterwards (presumably so that fact rows
    can reference already-loaded dimension rows -- confirm against the schema
    DDL). ``years_dim`` is special-cased: its rows come from
    ``years_dimension()`` instead of the source database.

    Rows are inserted with PostgreSQL ``ON CONFLICT DO NOTHING``, so rows that
    clash with existing ones are skipped, never updated.

    Parameters
    ----------
    db_url_source : str
        SQLAlchemy URL of the database holding the source tables.
    db_url_schema : str
        SQLAlchemy URL of the PostgreSQL database whose tables are filled.

    Returns
    -------
    None
    """
    import numpy as np
    import pandas as pd
    from sqlalchemy import create_engine
    from sqlalchemy import inspect
    from sqlalchemy import MetaData
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    engine_schema = create_engine(db_url_schema)
    engine_source = create_engine(db_url_source)
    meta = MetaData()

    def _source_records(table, source_conn, show_columns=False):
        """Return the source rows of `table`, limited to columns the target table also has."""
        # Column names come from the reflected metadata, so the target table's
        # rows do not have to be downloaded just to learn its columns.
        target_columns = pd.Index(
            [column.name for column in meta.tables[table].columns]
        )
        df_source = pd.read_sql_table(table, source_conn)
        cols = np.intersect1d(target_columns, df_source.columns)
        if show_columns:
            print(cols)
        return df_source[cols].to_dict('records')

    def _insert_skipping_conflicts(table, data):
        """Insert `data` into the target `table`, skipping rows that conflict."""
        if not data:
            # Nothing to load; avoid building an INSERT that carries no rows.
            return
        stmt = pg_insert(meta.tables[table]).values(data).on_conflict_do_nothing()
        # One explicit transaction per table: committed on success, rolled
        # back on error, independent of the driver's autocommit behaviour.
        with engine_schema.begin() as schema_conn:
            schema_conn.execute(stmt)

    try:
        schema_table_names = inspect(engine_schema).get_table_names()
        # Reflect once up front; the result is the same for every table.
        meta.reflect(bind=engine_schema)

        dimensions = [name for name in schema_table_names if 'dim' in name]
        facts = [name for name in schema_table_names if 'dim' not in name]

        # The context manager closes the source connection even if a read or
        # an insert raises.
        with engine_source.connect() as source_conn:
            for table in dimensions:
                if table == 'years_dim':
                    data = years_dimension().to_dict('records')
                else:
                    data = _source_records(table, source_conn)
                _insert_skipping_conflicts(table, data)

            for table in facts:
                # The shared columns are printed for these tables only.
                data = _source_records(table, source_conn, show_columns=True)
                _insert_skipping_conflicts(table, data)
    finally:
        engine_source.dispose()
        engine_schema.dispose()

    return

In [13]:
import os

# Database that receives the conflict schema. The URL can be supplied through
# the CLUSTER_COUNTRIES_DB_URL environment variable so that credentials do not
# have to be edited into the notebook; the literal below is only the fallback
# used when the variable is unset.
db_url = os.environ.get(
    'CLUSTER_COUNTRIES_DB_URL',
    'postgresql+psycopg2://postgres:root@localhost:5432/cluster_countries',
)
create_schema(db_url, schema_path='./schemas/conflict_schema.sql')

In [14]:
import os

# Source and target databases for the load. Each URL can be overridden through
# an environment variable so that credentials do not have to be edited into
# the notebook; the literals are only the fallbacks used when the variables
# are unset.
db_url_source = os.environ.get(
    'ADSDB_DB_URL',
    'postgresql+psycopg2://postgres:root@localhost:5432/adsdb',
)
db_url_schema = os.environ.get(
    'CLUSTER_COUNTRIES_DB_URL',
    'postgresql+psycopg2://postgres:root@localhost:5432/cluster_countries',
)
populate_schema(db_url_source, db_url_schema)

['cce' 'code' 'gee' 'pve' 'rle' 'rqe' 'vae' 'year']
['bx_gsr_gnfs_cd' 'economy' 'ne_imp_gnfs_kn' 'ny_gdp_mktp_cd'
 'si_pov_gini' 'sl_tlf_0714_zs' 'sp_ado_tfrt' 'sp_dyn_le00_fe_in'
 'sp_dyn_le00_ma_in' 'time']


In [54]:
# Initialize the engine and load the 'wbd' and 'countries_dim' tables from the
# source database.

from sqlalchemy import create_engine
import pandas as pd

engine = create_engine(db_url_source)

# The context manager closes the connection even if one of the reads fails.
with engine.connect() as conn:
    df = pd.read_sql_table('wbd', conn)
    dg = pd.read_sql_table('countries_dim', conn)

# Independent copy of the same table; copying avoids a second identical query.
dff = df.copy()

In [52]:
# Number of distinct values in df['economy'] (missing values, if any, count as one).
df['economy'].nunique(dropna=False)

39

In [53]:
# Print every df['economy'] value that does not appear in dg['iso'].
# The distinct ISO codes are extracted once instead of on every iteration.
known_iso = dg['iso'].unique()
for code in df['economy'].unique():
    if code not in known_iso:
        print(code)

HKG
MAC


In [49]:
# Plain-text dump of the dg frame for visual inspection.
# NOTE(review): this prints the frame itself rather than a preview such as
# dg.head(); consider shortening the output before sharing the notebook.
print(dg)

     Global Code Global Name  Region Code continent  Sub-region Code  \
0              1       World          2.0    Africa             15.0   
1              1       World          2.0    Africa             15.0   
2              1       World          2.0    Africa             15.0   
3              1       World          2.0    Africa             15.0   
4              1       World          2.0    Africa             15.0   
5              1       World          2.0    Africa             15.0   
6              1       World          2.0    Africa             15.0   
7              1       World          2.0    Africa            202.0   
8              1       World          2.0    Africa            202.0   
9              1       World          2.0    Africa            202.0   
10             1       World          2.0    Africa            202.0   
11             1       World          2.0    Africa            202.0   
12             1       World          2.0    Africa            2

In [33]:
# Initialize the engine and load the 'wbd' table from the schema database.
# NOTE: this rebinds dg; any earlier value held under that name is replaced.

from sqlalchemy import create_engine
import pandas as pd

engine = create_engine(db_url_schema)

# The context manager closes the connection even if the read fails.
with engine.connect() as conn:
    dg = pd.read_sql_table('wbd', conn)

In [36]:
# Column overview of df: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741 entries, 0 to 740
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   economy            741 non-null    object 
 1   time               741 non-null    int64  
 2   BX_GSR_GNFS_CD     606 non-null    float64
 3   EN_POP_SLUM_UR_ZS  69 non-null     float64
 4   NE_IMP_GNFS_KN     492 non-null    float64
 5   NY_GDP_MKTP_CD     698 non-null    float64
 6   SI_DST_02ND_20     128 non-null    float64
 7   SI_DST_03RD_20     128 non-null    float64
 8   SI_DST_04TH_20     128 non-null    float64
 9   SI_DST_05TH_20     128 non-null    float64
 10  SI_DST_10TH_10     128 non-null    float64
 11  SI_DST_50MD        96 non-null     float64
 12  SI_DST_FRST_10     128 non-null    float64
 13  SI_DST_FRST_20     128 non-null    float64
 14  SI_POV_DDAY        128 non-null    float64
 15  SI_POV_GAPS        128 non-null    float64
 16  SI_POV_GINI        128 non

In [37]:
# Column overview of dg: names, non-null counts and dtypes.
# NOTE(review): the recorded output lists 0 non-null values for every column
# except 'economy' and 'time'. The cause is not visible from this cell --
# verify that the load actually transferred the indicator values.
dg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741 entries, 0 to 740
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   economy            741 non-null    object 
 1   time               741 non-null    int64  
 2   si_pov_gini        0 non-null      float64
 3   ne_imp_gnfs_kn     0 non-null      float64
 4   bx_gsr_gnfs_cd     0 non-null      float64
 5   ny_gdp_mktp_cd     0 non-null      float64
 6   sp_ado_tfrt        0 non-null      float64
 7   sl_tlf_0714_zs     0 non-null      float64
 8   sp_dyn_le00_fe_in  0 non-null      float64
 9   sp_dyn_le00_ma_in  0 non-null      float64
dtypes: float64(8), int64(1), object(1)
memory usage: 58.0+ KB
