In [None]:
import pandas as pd
import seaborn as sns
import matplotlib 
import matplotlib.pyplot as plt

In [None]:
# Load the PISA reading scores and keep only the 2018 rows with
# SUBJECT == "TOT".  The explicit .copy() makes df1_ an independent frame,
# so adding the rank column below does not raise SettingWithCopyWarning.
df1_ = (
    pd.read_csv('../data/oecd_education_pisaread.csv')
    .query('TIME == 2018 & SUBJECT == "TOT"')
    .copy()
)
# Rank 1 = highest Value; method='first' breaks ties by row order.
df1_['read_rank'] = df1_['Value'].rank(ascending=False, method='first')
df_read = df1_[['LOCATION', 'read_rank']].copy()
# Negated rank: the best rank (1) becomes the largest value (-1).
df_read['read_rank_rev'] = -1 * df_read['read_rank']

In [None]:
# Inspect the reading-score row(s) whose LOCATION code is "ISR".
df1_.loc[df1_['LOCATION'] == "ISR"]

In [None]:
# Load the PISA maths scores and keep only the 2018 rows with
# SUBJECT == "TOT".  The explicit .copy() makes df2_ an independent frame,
# so adding the rank column below does not raise SettingWithCopyWarning.
df2_ = (
    pd.read_csv('../data/oecd_education_pisamath.csv')
    .query('TIME == 2018 & SUBJECT == "TOT"')
    .copy()
)
# Rank 1 = highest Value; method='first' breaks ties by row order.
df2_['math_rank'] = df2_['Value'].rank(ascending=False, method='first')
df_math = df2_[['LOCATION', 'math_rank']].copy()
# Negated rank: the best rank (1) becomes the largest value (-1).
df_math['math_rank_rev'] = -1 * df_math['math_rank']

In [None]:
# Inspect the maths-score row(s) whose LOCATION code is "ISR".
df2_.loc[df2_['LOCATION'] == "ISR"]

In [None]:
# table 4f9115fa-en from https://www.oecd-ilibrary.org/education/education-at-a-glance-2021_be9806af-en
expenditure = pd.read_excel('../data/oecd_education_expenditure.xlsx')

# Keep positional rows 8..45 and the first and third columns of the sheet;
# they are renamed to the country name and the expenditure figure.
# NOTE(review): the row/column offsets are tied to this specific workbook
# layout -- confirm them if the source file is refreshed.
expenditure = expenditure.iloc[8:46, [0, 2]]
expenditure.columns = ['COUNTRY', 'expenditure']

# from https://gist.githubusercontent.com/radcliff/f09c0f88344a7fcef373/raw/2753c482ad091c54b1822288ad2e4811c021d8ec/wikipedia-iso-country-codes.csv
# Chained (non-inplace) rename so re-running the cell always starts from the
# file contents; `codes` still ends up with a COUNTRY column.
codes = pd.read_csv('../data/wikipedia-iso-country-codes.csv').rename(
    columns={'English short name lower case': 'COUNTRY'}
)

# Left merge keeps every expenditure row; a country name with no exact match
# in `codes` gets a missing LOCATION.
expenditure = (
    expenditure
    .merge(codes, on='COUNTRY', how='left')
    .rename(columns={'Alpha-3 code': 'LOCATION'})
)

# .copy() makes the column subset an independent frame so the assignments
# below do not raise SettingWithCopyWarning.
expenditure = expenditure[['LOCATION', 'expenditure', 'COUNTRY']].copy()
# Decile bucket (integer codes, since labels=False) and ascending rank of expenditure.
expenditure['exp_bin'] = pd.qcut(expenditure['expenditure'], q=10, labels=False)
expenditure['expenditure_rank'] = expenditure['expenditure'].rank()

In [None]:
# Inner-join expenditure with the reading and maths ranks on the LOCATION code.
df = pd.merge(
    pd.merge(expenditure, df_read, on='LOCATION'),
    df_math,
    on='LOCATION',
)

In [None]:
# Expenditure rank of the first row whose LOCATION is "ISR".
# (An earlier variant of this cell read the 'exp_bin' column instead.)
isr_rank = df.loc[df['LOCATION'] == "ISR", 'expenditure_rank'].iloc[0]

In [None]:
# Rows whose expenditure rank lies strictly within 5 of isr_rank.
# (An earlier variant selected rows with exp_bin equal to isr_rank.)
df2 = df.loc[
    lambda frame: (frame['expenditure_rank'] < isr_rank + 5)
    & (frame['expenditure_rank'] > isr_rank - 5)
]


In [None]:
# Display the rows whose expenditure rank is strictly within 5 of isr_rank.
df2

In [None]:
# Boolean marker for the ISR row; the plotting cells pass it as `hue`.
df['flag'] = (df['LOCATION'] == "ISR")
# Marker sizes 40, 80, ..., 400 -- read as a global by scatter_text and
# forwarded to seaborn's `sizes=` argument.
sizes = [x*40 for x in range(1, 11)]

# 'normal' is not a font family matplotlib can resolve: it emits
# "findfont: Font family ['normal'] not found" warnings and falls back to
# the default font.  'sans-serif' selects that default family explicitly.
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 22}

matplotlib.rc('font', **font)

plt.rcParams["figure.figsize"] = (8,8)

# Countries whose points get a text label in the scatter plots.
llist = ['Israel', 'Canada', 'Finland', 'Netherlands', 'Japan', 'Estonia']

def rev(s):
    """Return ``s`` with its elements in reverse order.

    The notebook applies this to the Hebrew label strings before handing
    them to the plotting helper.
    """
    backwards = s[::-1]
    return backwards

In [None]:
def scatter_text(x, y, text_column, data, xlabel, ylabel, size, hue, label_list, tick_low, tick_high, skip_xticks=False):
    """Draw a seaborn scatter plot and annotate selected points with text.

    Based on this answer: https://stackoverflow.com/a/54789170/2641825

    Parameters
    ----------
    x, y : str
        Columns of ``data`` used for the horizontal / vertical positions.
    text_column : str
        Column of ``data`` holding the annotation text of each row.
    data : pandas.DataFrame
        Frame to plot. Any index is accepted; rows are read by position.
    xlabel, ylabel : str
        Axis labels.
    size, hue : str
        Columns forwarded to ``sns.scatterplot`` as ``size`` and ``hue``.
    label_list : container
        Only rows whose ``text_column`` value is in this container are annotated.
    tick_low : str
        Label written on the last tick position.
    tick_high : str
        Label written on the first tick position.
    skip_xticks : bool, default False
        When False, the y-axis tick positions and labels are applied to the
        x axis as well; when True the x ticks are left untouched.

    Returns
    -------
    The Axes object returned by ``sns.scatterplot``.

    Notes
    -----
    Marker sizes come from the module-level ``sizes`` list, which must be
    defined before this function is called.
    """
    # Create the scatter plot
    p1 = sns.scatterplot(x=x, y=y, data=data, size=size, sizes=sizes, hue=hue, legend=False)
    # Add text beside each selected point.  Iterating over the column values
    # (rather than data[col][i] with i in range(n)) avoids treating a
    # positional counter as an index label, which fails for filtered frames.
    for txt, x_pos, y_pos in zip(data[text_column], data[x], data[y]):
        if txt in label_list:
            p1.text(x_pos + 1, y_pos,
                    txt, horizontalalignment='left',
                    size='small', color='black')
    # Set axis labels
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Blank every tick label except the first and last tick positions.
    tick_positions = plt.yticks()[0]
    labels = len(tick_positions) * ['']
    labels[-1] = tick_low
    labels[0] = tick_high
    if not skip_xticks:
        # Reuses the y tick positions on the x axis.
        plt.xticks(tick_positions, labels)
    plt.yticks(tick_positions, labels)
    return p1

In [None]:
# Reading rank against maths rank; the Hebrew axis labels are reversed with
# rev() before plotting.
scatter_text(
    x='read_rank',
    y='math_rank',
    text_column='COUNTRY',
    data=df,
    xlabel=rev('דירוג קריאה'),
    ylabel=rev('דירוג חשבון'),
    size='exp_bin',
    hue='flag',
    label_list=llist,
    tick_low=rev('מספר גבוה'),
    tick_high=rev('מספר נמוך'),
)

In [None]:
# Same comparison using the negated rank columns.
scatter_text(
    x='read_rank_rev',
    y='math_rank_rev',
    text_column='COUNTRY',
    data=df,
    xlabel=rev('דירוג קריאה'),
    ylabel=rev('דירוג חשבון'),
    size='exp_bin',
    hue='flag',
    label_list=llist,
    tick_low=rev('גבוה'),
    tick_high=rev('נמוך'),
)

In [None]:
# Wider, shorter default figure size from this cell onwards.
plt.rcParams["figure.figsize"] = (8, 5)

# Expenditure against the negated maths rank; x ticks are left untouched.
scatter_text(
    x='expenditure',
    y='math_rank_rev',
    text_column='COUNTRY',
    data=df,
    xlabel=rev('השקעה'),
    ylabel=rev('דירוג חשבון'),
    size='exp_bin',
    hue='flag',
    label_list=llist,
    tick_low=rev('גבוה'),
    tick_high=rev('נמוך'),
    skip_xticks=True,
)

In [None]:
# Expenditure against the negated reading rank; x ticks are left untouched.
# (A plain sns.scatterplot of expenditure vs. math_rank_rev was tried here earlier.)
scatter_text(
    x='expenditure',
    y='read_rank_rev',
    text_column='COUNTRY',
    data=df,
    xlabel=rev('השקעה'),
    ylabel=rev('דירוג קריאה'),
    size='exp_bin',
    hue='flag',
    label_list=llist,
    tick_low=rev('גבוה'),
    tick_high=rev('נמוך'),
    skip_xticks=True,
)

In [None]:
# Show the expenditure table ordered from lowest to highest expenditure.
expenditure.sort_values(by='expenditure')