In [6]:
from sqlalchemy import create_engine, inspect
import os
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np

In [7]:
# Database location. The URL (which embeds credentials) can be supplied through
# the WGI_DB_URL environment variable so it does not have to live in the
# source; the original local development URL is kept as the fallback.
db_url = os.environ.get(
    'WGI_DB_URL',
    'postgresql+psycopg2://postgres:root@localhost:5432/postgres',
)
engine = create_engine(db_url)
# NOTE(review): the connection is left open on purpose because `conn` is a
# module-level name that other cells may rely on -- confirm, and close it
# (conn.close()) once the data has been loaded if nothing else needs it.
conn = engine.connect()
inspector = inspect(engine)
# Lists the tables reachable through the engine; the result is not stored.
inspector.get_table_names()
# Load the whole 'WGI_20220101' table into a dataframe.
df = pd.read_sql_table('WGI_20220101', conn)

In [8]:
def preprocess(df, years=(2000, 2008, 2006)):
    """
    Remove rows whose numeric columns are all NA, mean-impute the remaining
    NA's, index the rows by "<year>_<countryname>" and split the result per year.

    No dimension reduction happens here; the returned frames keep every
    numeric column of the input.

    input: dataframe with 'countryname' and 'year' columns plus numeric columns,
           iterable of years to analyze
    output: list of preprocessed dataframes (numeric columns only, index named
            'country_year'), one per requested year and in the same order;
            a year without rows yields an empty dataframe
    raises: ValueError if a numeric column has no observed value at all, since
            no mean can be computed for it
    """
    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    # Casting 'year' to str first keeps it out of the numeric columns below.
    df['year'] = df['year'].astype(str)
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

    # Keep rows with at least one observed numeric value. A boolean mask is
    # used so the selection is correct whatever the dataframe's index is.
    has_data = df[numeric_cols].notna().any(axis=1).to_numpy()
    df = df[has_data].reset_index(drop=True)

    # Mean imputation per column, computed over all remaining rows (all years).
    df_num = df[numeric_cols].astype(float)
    empty_cols = [col for col in numeric_cols if df_num[col].isna().all()]
    if empty_cols:
        raise ValueError(
            "cannot mean-impute numeric column(s) without any observed "
            "value: {}".format(empty_cols)
        )
    df_num = df_num.fillna(df_num.mean())

    # Label every row as "<year>_<countryname>".
    labels = df['year'] + '_' + df['countryname']
    df_num.index = pd.Index(labels, name='country_year')

    res = []
    for year in years:
        in_year = (df['year'] == str(year)).to_numpy()
        res.append(df_num[in_year])

    return res

In [65]:
def cluster(list_of_dataframes, nr_of_clusters = 5):
    """
    Cluster each dataframe in list.

    Every dataframe is projected onto its first two principal components and
    the projected rows are clustered with k-means. For each dataframe a scatter
    plot coloured by cluster is saved as "clusters_<year>.png" and the index
    labels of every cluster are printed.

    input: List of dataframes whose index labels are strings starting with
           "<year>_", number of desired clusters per dataframe
    output: List of dataframes (same order as the input) with the columns
            'pc1', 'pc2' and 'cluster', indexed like the input dataframes
    """
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt
    from seaborn import lmplot

    # NOTE: `%matplotlib inline` is an IPython line magic, not Python syntax,
    # so it cannot live inside a function body; run it in a notebook cell
    # before calling this function if inline figures are wanted.

    clustered_df = []

    for df in list_of_dataframes:
        index = df.index

        # Reduce to two dimensions so the clusters can be drawn in a plane.
        components = PCA(n_components=2).fit_transform(df)
        df = pd.DataFrame(components, columns=['pc1', 'pc2'], index=index)

        # Fixed random_state keeps the cluster assignment reproducible.
        kmeans = KMeans(n_clusters=nr_of_clusters, random_state=0).fit(df)
        df['cluster'] = kmeans.labels_

        clustered_df.append(df)

        lmplot(x='pc1', y='pc2', data=df, hue='cluster', fit_reg=False)

        # The year is the part of the first index label before the first "_".
        year = df.index[0].split("_")[0]

        plt.title('{}'.format(year))
        plt.savefig("clusters_{}.png".format(year))

        for clus in range(nr_of_clusters):
            groups = list(df[df['cluster'] == clus].index)
            print(groups)

    return clustered_df
    