In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook
# Ignore every Python warning raised from this point on.
warnings.filterwarnings('ignore') 

# Widen pandas' display limits so wide frames are not truncated when shown:
# 170-character lines, up to 200 rows and up to 900 columns.
# (No matplotlib style or figure size is configured in this cell.)

pd.set_option('display.width',170, 'display.max_rows',200, 'display.max_columns',900)

In [2]:
# Load uk_data.csv (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv("uk_data.csv")

In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['Year', 'Access to clean fuels and technologies for cooking  (% of population)', 'Access to electricity (% of population)',
       'Access to electricity, rural (% of rural population)', 'Access to electricity, urban (% of urban population)',
       'Adjusted net enrollment rate, primary (% of primary school age children)', 'Adjusted net national income (annual % growth)',
       'Adjusted net national income (constant 2010 US$)', 'Adjusted net national income (current US$)', 'Adjusted net national income per capita (annual % growth)',
       ...
       'Unemployment, youth total (% of total labor force ages 15-24) (national estimate)', 'Urban population', 'Urban population (% of total)',
       'Urban population growth (annual %)', 'Vulnerable employment, female (% of female employment) (modeled ILO estimate)',
       'Vulnerable employment, male (% of male employment) (modeled ILO estimate)', 'Vulnerable employment, total (% of total employment) (modeled ILO estimate)',
      

In [4]:
# Display the frame's dimensions as (rows, columns).
df.shape

(45, 872)

In [5]:
import re

# Summarise missing data: build one "<column> has <n> missing" message for
# every column of `df` that contains at least one null, then print the
# messages followed by how many columns are affected.
#
# `nonumeric_values` is left empty: the check for non-numeric values that
# used to fill it is disabled in this notebook.
nonumeric_values = []

print("DATA SET INFORMATION")
print("========================\n")

# Count nulls for all columns in one vectorised pass instead of looping
# over the columns one by one.
null_counts = df.isnull().sum()
missing_values = [
    "{} has {} missing".format(name, n_missing)
    for name, n_missing in null_counts[null_counts > 0].items()
]

print("Features with missing values:\n{}\n\n".format(list(missing_values)))
print(len(missing_values))



DATA SET INFORMATION

Features with missing values:
['Access to clean fuels and technologies for cooking  (% of population) has 28 missing', 'Access to electricity (% of population) has 18 missing', 'Access to electricity, rural (% of rural population) has 18 missing', 'Access to electricity, urban (% of urban population) has 18 missing', 'Adjusted net enrollment rate, primary (% of primary school age children) has 7 missing', 'Adjusted net national income (annual % growth) has 1 missing', 'Adjusted net national income (constant 2010 US$) has 1 missing', 'Adjusted net national income (current US$) has 1 missing', 'Adjusted net national income per capita (annual % growth) has 1 missing', 'Adjusted net national income per capita (constant 2010 US$) has 1 missing', 'Adjusted net national income per capita (current US$) has 1 missing', 'Adjusted net savings, excluding particulate emission damage (% of GNI) has 1 missing', 'Adjusted net savings, excluding particulate emission damage (curren

In [6]:
# cols_to_delete = df.columns[df.isnull().sum()/len(df) > .30]
# df.drop(cols_to_delete, axis = 1, inplace = True)
# df.shape

#### Keep only the columns with less than 30% missing values

In [7]:
# Keep only the columns whose null count is strictly below 30% of the rows;
# every other column is dropped from `df`.
row_count = df.shape[0]
nulls_per_column = df.isnull().sum()
has_enough_data = nulls_per_column < 0.30*row_count
df = df.loc[:, has_enough_data]

# Show the dimensions that remain after filtering.
df.shape

(45, 467)

In [8]:
import re

# Repeat the missing-data summary on the current (column-filtered) `df`:
# one "<column> has <n> missing" message per column that still contains
# nulls, followed by the number of such columns.
#
# `nonumeric_values` is left empty: the check for non-numeric values that
# used to fill it is disabled in this notebook.
nonumeric_values = []

print("DATA SET INFORMATION")
print("========================\n")

# One vectorised null count for the whole frame replaces the per-column loop.
remaining_nulls = df.isnull().sum()
missing_values = [
    "{} has {} missing".format(name, n_missing)
    for name, n_missing in remaining_nulls[remaining_nulls > 0].items()
]

print("Features with missing values:\n{}\n\n".format(list(missing_values)))
print(len(missing_values))


DATA SET INFORMATION

Features with missing values:
['Adjusted net enrollment rate, primary (% of primary school age children) has 7 missing', 'Adjusted net national income (annual % growth) has 1 missing', 'Adjusted net national income (constant 2010 US$) has 1 missing', 'Adjusted net national income (current US$) has 1 missing', 'Adjusted net national income per capita (annual % growth) has 1 missing', 'Adjusted net national income per capita (constant 2010 US$) has 1 missing', 'Adjusted net national income per capita (current US$) has 1 missing', 'Adjusted net savings, excluding particulate emission damage (% of GNI) has 1 missing', 'Adjusted net savings, excluding particulate emission damage (current US$) has 1 missing', 'Adjusted savings: carbon dioxide damage (% of GNI) has 1 missing', 'Adjusted savings: carbon dioxide damage (current US$) has 1 missing', 'Adjusted savings: consumption of fixed capital (% of GNI) has 1 missing', 'Adjusted savings: consumption of fixed capital (cu

In [11]:
# Write the current `df` to clean_uk.csv in the working directory.
# NOTE(review): to_csv defaults to index=True, so the row index is written as
# an extra, unnamed first column -- pass index=False if readers of this file
# do not expect it. Confirm against whatever loads clean_uk.csv.
df.to_csv("clean_uk.csv")

http://www.economicsonline.co.uk/Global_economics/Economic_development.html

The Human Development Index (HDI)
The HDI was introduced in 1990 as part of the United Nations Development Programme (UNDP) to provide a means of measuring economic development in three broad areas - per capita income, health and education. The HDI tracks changes in the level of development of countries over time.

Each year, the UNDP produces a development report, which provides an update of changes during the year, along with a report on a special theme, such as global warming and development, and migration and development.

The introduction of the index was an explicit acceptance that development is a considerably broader concept than growth, and should include a range of social and economic factors.

The HDI has two main features:
A scale from 0 (no development) to 1 (complete development).

An index, which is based on three equally weighted components:

Longevity, measured by life expectancy at birth
Knowledge,  measured by adult literacy and number of years children are enrolled at school
Standard of living,  measured by real GDP per capita at purchasing power parity
What the figures mean:
An index of 0 – 0.49 means low development - for example, Nigeria was 0.42 in 2010.

An index of 0.5 – 0.69 means medium development – for example, Indonesia was 0.6.

An index of 0.7 to 0.79 means high development – for example, Romania was 0.76.

Above 0.8 means very high development – Finland was 0.87 in 2010.

The HDI is a very useful means of comparing the level of development of countries. GDP per capita alone is clearly too narrow an indicator of economic development and fails to indicate other aspects of development, such as enrolment in school and longevity. Hence, the HDI is a broader and more encompassing indicator of development than GDP, though GDP still provides one third of the index.

Life expectancy
A variety of factors may contribute to differences in life expectancy, including:

The stability of food supplies

War

The incidence of disease and natural disasters

According to World Bank figures, life expectancy at birth in developing countries over the past 40 years has increased by 20 years. However, these increases were not evenly distributed. Indeed, in many countries in sub-Saharan Africa, life expectancy is falling due to the AIDS epidemic.

Adult literacy
The percentage of those aged 15 and above who are able to read and write a simple statement on their everyday life.

More extensive definitions of literacy include those based on the International Adult Literacy Survey. This survey tests the ability to understand text, interpret documents and perform basic arithmetic.

GDP per capita
GDP per capita is the commonest indicator of material standards of living, and hence is included in the index of development. It is found by measuring Gross Domestic Product in a year and dividing it by the population.

https://www.bbc.com/bitesize/guides/zs7wrdm/revision/3
    
### Economic development indicators
To assess the economic development of a country, geographers use economic indicators including:

* Gross Domestic Product (GDP) is the total value of goods and services produced by a country in a year.
* Gross National Product (GNP) measures the total economic output of a country, including earnings from foreign investments.
* GNP per capita is a country's GNP divided by its population. (Per capita means per person.)
* Economic growth measures the annual increase in GDP, GNP, GDP per capita, or GNP per capita.
* Inequality of wealth is the gap in income between a country's richest and poorest people. It can be measured in many ways, (eg the proportion of a country's wealth owned by the richest 10 per cent of the population, compared with the proportion owned by the remaining 90 per cent).
* Inflation measures how much the prices of goods, services and wages increase each year. High inflation (above a few percent) can be a bad thing, and suggests a government lacks control over the economy.
* Unemployment is the number of people who cannot find work.
* Economic structure shows the division of a country's economy between primary, secondary and tertiary industries.
* Demographics studies population growth and structure. It compares birth rates to death rates, life expectancy and urban and rural ratios. Many LEDCs have a younger, faster-growing population than MEDCs, with more people living in the countryside than in towns. The birth rate in the UK is 11 per 1,000, whereas in Kenya it is 40.


### Human development indicators
Development often takes place in an uneven way. A country may have a very high GDP - derived, for example, from the exploitation of rich oil reserves - while segments of the population live in poverty and lack access to basic education, health and decent housing.

Hence the importance of human development indicators, measuring the non-economic aspects of a country's development.

###### Human development indicators include:

* Life expectancy - the average age to which a person lives, eg this is 79 in the UK and 48 in Kenya.
* Infant mortality rate - counts the number of babies, per 1000 live births, who die under the age of one. This is 5 in the UK and 61 in Kenya.
* Poverty - indices count the percentage of people living below the poverty level, or on very small incomes (eg under £1 per day).
* Access to basic services - the availability of services necessary for a healthy life, such as clean water and sanitation.
* Access to healthcare - takes into account statistics such as how many doctors there are for every patient.
* Risk of disease - calculates the percentage of people with diseases such as AIDS, malaria and tuberculosis.
* Access to education - measures how many people attend primary school, secondary school and higher education.
* Literacy rate - is the percentage of adults who can read and write. This is 99 per cent in the UK, 85 per cent in Kenya and 60 per cent in India.
* Access to technology - includes statistics such as the percentage of people with access to phones, mobile phones, television and the internet.
* Male/female equality - compares statistics such as the literacy rates and employment between the sexes.
* Government spending priorities - compares health and education expenditure with military expenditure and paying off debts.

##### Matplotlib

In [None]:
# Line chart of total population by year, drawn with plain matplotlib.
# NOTE(review): `dfip` is not defined in the visible part of this notebook;
# it is expected to hold 'Year' and 'Total Population' columns -- confirm
# where it is loaded before running this cell.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (100, 50)

# Use an explicit figure/axes pair (sized from the rcParams value set above)
# rather than relying on pyplot's implicit "current axes".
fig, ax = plt.subplots()
ax.plot(dfip['Year'], dfip['Total Population'], color='blue')

# Enlarge the (major) tick labels on both axes.
ax.tick_params(axis='both', labelsize=30)

ax.set_title('Total Population in Ireland from 1960 -2017', fontsize=15)
ax.set_xlabel('Year', fontsize=15)
ax.set_ylabel('Total Population', fontsize=15)

# Request the grid explicitly. Calling grid() without a visibility argument
# toggles the current state, which under the ggplot style (grid already on)
# would switch the major grid lines off.
ax.grid(True, which='both')
plt.show()

In [None]:
# Histogram of the 'Total Population' column.
# NOTE(review): `dfip` is not defined in the visible part of this notebook --
# confirm where it is loaded.
# Default figure size; the explicit figsize passed to hist() below is the one
# used for the histogram itself.
plt.rcParams['figure.figsize'] = (20, 10)
dfip.hist(column="Total Population",        # Column to plot
              figsize=(20,20),         # Plot size
              color="blue",          # Plot color
              bins= 5)                 # Use 5 bins
              # (disabled) range=(0, 3.5) would limit the x-axis range

###### Seaborn

In [None]:
# Line chart of total population by year, drawn with seaborn.
# NOTE(review): `dfip` is not defined in the visible part of this notebook;
# it is expected to hold 'Year' and 'Total Population' columns -- confirm
# where it is loaded before running this cell.
sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(30, 4))

# Plot the total population over time on the axes created above; passing
# `ax` explicitly avoids depending on pyplot's implicit current axes.
sns.set_color_codes("deep")
sns.lineplot(x="Year", y="Total Population", data=dfip, color="r", ax=ax)

ax.set_title('Total Population in Ireland from 1960 -2017', fontsize=15)
ax.set_xlabel('Year', fontsize=15)
ax.set_ylabel('Total Population', fontsize=15)
plt.show()

https://stackoverflow.com/questions/48225888/matplotlib-finance-candlestick-ohlc-plot-intraday-1min-bar-data-with-time-breaks

https://jakevdp.github.io/PythonDataScienceHandbook/04.08-multiple-subplots.html



In [None]:

# Two stacked panels in one figure: a box plot of 'Total Population' on top
# and its year-by-year line chart underneath.
# NOTE(review): `dfip` is not defined in the visible part of this notebook --
# confirm where it is loaded.
f, axes = plt.subplots(nrows=2, figsize=(50, 10))
ax1, ax2 = axes

sns.boxplot(x="Total Population", data=dfip, ax=ax1)
sns.lineplot(x="Year", y="Total Population", data=dfip, ax=ax2)

In [None]:
# Three stacked KDE panels, one per column, coloured by the "PROC_FLAGS" hue.
#
# NOTE(review): the column names ('COV_ALLW_CONTR_AMT_sum',
# 'chrg_allw_sum_diff', 'PROC_FLAGS') and the claim/dollar wording in the
# titles and labels do not match the population data plotted in the other
# cells -- presumably this cell was carried over from another analysis.
# Confirm that `dfip` really has these columns before running it.
# NOTE(review): `shade`, `bw` and `kernel` are legacy kdeplot arguments that
# newer seaborn releases deprecate -- check against the installed version.

# Passed as `y=` to every set_title() call below.
y_title_margin = 1.2
sns.set(style="white", palette='muted', color_codes=True)

# Set up the matplotlib figure: three rows sharing one x-axis.
f, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(12, 10), sharex=True)

# Subplot geometry handed to subplots_adjust().
left = 0.125    # the left side of the subplots of the figure
right = 0.9     # the right side of the subplots of the figure
bottom = 0.1    # the bottom of the subplots of the figure
top = 0.9       # the top of the subplots of the figure
wspace = .5     # the amount of width reserved for blank space between subplots
hspace = 1.1    # the amount of height reserved for white space between subplots

plt.subplots_adjust(left=left, bottom=bottom, right=right, top=top,
                    wspace=wspace, hspace=hspace)
sns.despine(left=True)

# The FacetGrid is used only to split the data by hue; each map() call is
# directed at one of the axes created above via `ax=`.
facet = sns.FacetGrid(dfip, hue="PROC_FLAGS", palette="Set1")
facet.map(sns.kdeplot, 'Total Population', shade=True, bw=30, kernel='gau', ax=ax1)
facet.map(sns.kdeplot, 'COV_ALLW_CONTR_AMT_sum', shade=True, bw=20, ax=ax2)
facet.map(sns.kdeplot, 'chrg_allw_sum_diff', shade=True, bw=30, ax=ax3)

# Panel titles, applied top to bottom.
panel_titles = (
    "Sum of Charge amount",
    "Sum of Allowed amount",
    "Difference between 'Sum of Charge amount' and 'Sum of Allowed amount'",
)
for panel, heading in zip((ax1, ax2, ax3), panel_titles):
    panel.set_title(heading, y=y_title_margin, fontsize=14)

# Every panel gets the same axis labels, limits, legend title and legend
# entries, so apply them in one loop (same order of calls as writing the
# three panels out by hand).
new_title = 'Type of Claim'
new_labels = ['Non-resubmitted', 'Resubmitted']
for panel in (ax1, ax2, ax3):
    panel.set_xlabel("Dollars")
    panel.set_ylabel("Probability Distribution")
    panel.set_xlim(-200, 300)
    panel.set_ylim(0, 0.018)
    panel.legend_.set_title(new_title)
    # replace labels
    for t, l in zip(panel.legend_.texts, new_labels):
        t.set_text(l)

plt.tight_layout()
# Closes the current figure -- presumably the FacetGrid's own (otherwise
# empty) figure, since it was created last; confirm that is the intent.
plt.close()

