## Example 1-1. Training and running a linear model using Scikit-Learn

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

In [16]:
# Load the two raw CSV exports used in this example.
# thousands=',' makes pandas parse figures written like "1,234" as numbers.
oecd_bli = pd.read_csv(
    'dataset/better_life/BLI_25082018120323618.csv',
    thousands=',',
)
# NOTE(review): latin1 is presumably needed because this export is not UTF-8 — confirm.
gdp_per_cap = pd.read_csv(
    'dataset/better_life/WEO_Data.csv',
    thousands=',',
    encoding='latin1',
)

In [3]:
# Preview the first five rows of the raw GDP-per-capita frame
gdp_per_cap.head()

Unnamed: 0,Country,Subject Descriptor,Units,Scale,Country/Series-specific Notes,2015,Estimates Start After
0,Afghanistan,"Gross domestic product per capita, current prices",U.S. dollars,Units,"See notes for: Gross domestic product, curren...",599.994,2013.0
1,Albania,"Gross domestic product per capita, current prices",U.S. dollars,Units,"See notes for: Gross domestic product, curren...",3995.38,2010.0
2,Algeria,"Gross domestic product per capita, current prices",U.S. dollars,Units,"See notes for: Gross domestic product, curren...",4318.14,2014.0
3,Angola,"Gross domestic product per capita, current prices",U.S. dollars,Units,"See notes for: Gross domestic product, curren...",4100.32,2014.0
4,Antigua and Barbuda,"Gross domestic product per capita, current prices",U.S. dollars,Units,"See notes for: Gross domestic product, curren...",14414.3,2011.0


In [4]:
# Non-null count per column: a column whose count is below the number of rows has missing values
gdp_per_cap.count()

Country                          190
Subject Descriptor               189
Units                            189
Scale                            189
Country/Series-specific Notes    188
2015                             187
Estimates Start After            188
dtype: int64

In [5]:
# Show the rows whose '2015' value is missing
gdp_per_cap.loc[gdp_per_cap['2015'].isna()]

Unnamed: 0,Country,Subject Descriptor,Units,Scale,Country/Series-specific Notes,2015,Estimates Start After
88,Kosovo,"Gross domestic product per capita, current prices",U.S. dollars,Units,,,
163,Syria,"Gross domestic product per capita, current prices",U.S. dollars,Units,"See notes for: Gross domestic product, curren...",,2010.0
189,,,,,,,
190,"International Monetary Fund, World Economic Ou...",,,,,,


In [6]:
# Drop the rows that lack a country name or a 2015 figure (the blank/footer rows and the
# countries with no 2015 value shown above). Restricting dropna to the columns that are
# used afterwards avoids discarding a country only because a descriptive column
# (e.g. the notes) happens to be empty.
gdp = gdp_per_cap.dropna(subset=['Country', '2015'])
# Non-null count per column of the cleaned frame
gdp.count()

Country                          187
Subject Descriptor               187
Units                            187
Scale                            187
Country/Series-specific Notes    187
2015                             187
Estimates Start After            187
dtype: int64

In [7]:
# Inspect the column labels of gdp to pick the ones worth keeping
gdp.columns

Index(['Country', 'Subject Descriptor', 'Units', 'Scale',
       'Country/Series-specific Notes', '2015', 'Estimates Start After'],
      dtype='object')

In [9]:
# Keep just the country name and its 2015 value; gdp is rebound to the narrower frame
gdp = gdp.loc[:, ['Country', '2015']]
gdp.head()

Unnamed: 0,Country,2015
0,Afghanistan,599.994
1,Albania,3995.38
2,Algeria,4318.14
3,Angola,4100.32
4,Antigua and Barbuda,14414.3


In [17]:
# Preview the first five rows of the Better Life Index frame
oecd_bli.head()

Unnamed: 0,LOCATION,Country,INDICATOR,Indicator,MEASURE,Measure,INEQUALITY,Inequality,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,AUS,Australia,JE_LMIS,Labour market insecurity,L,Value,TOT,Total,PC,Percentage,0,Units,,,4.3,,
1,AUT,Austria,JE_LMIS,Labour market insecurity,L,Value,TOT,Total,PC,Percentage,0,Units,,,2.7,,
2,BEL,Belgium,JE_LMIS,Labour market insecurity,L,Value,TOT,Total,PC,Percentage,0,Units,,,4.8,,
3,CAN,Canada,JE_LMIS,Labour market insecurity,L,Value,TOT,Total,PC,Percentage,0,Units,,,3.9,,
4,CZE,Czech Republic,JE_LMIS,Labour market insecurity,L,Value,TOT,Total,PC,Percentage,0,Units,,,1.8,,


In [18]:
# Show the DataFrame's column labels
oecd_bli.columns

Index(['LOCATION', 'Country', 'INDICATOR', 'Indicator', 'MEASURE', 'Measure',
       'INEQUALITY', 'Inequality', 'Unit Code', 'Unit', 'PowerCode Code',
       'PowerCode', 'Reference Period Code', 'Reference Period', 'Value',
       'Flag Codes', 'Flags'],
      dtype='object')

In [20]:
# List the distinct values found in the 'Unit' column
oecd_bli['Unit'].unique()

array(['Percentage', 'Average score', 'Ratio', 'US Dollar', 'Years',
       'Micrograms per cubic metre', 'Hours'], dtype=object)

In [25]:
# Keep only the rows whose unit of measure is 'Average score'.
# NOTE(review): this rebinds oecd_bli, so the unfiltered frame is no longer available to
# later cells. The count that follows shows 498 rows remaining — presumably several rows
# per country (different indicators / inequality groups); confirm before treating
# 'Value' as a single score per country.
oecd_bli = oecd_bli[oecd_bli['Unit'] == 'Average score']

In [26]:
# Non-null count per column of the filtered frame; an entirely empty column shows 0
oecd_bli.count()

LOCATION                 498
Country                  498
INDICATOR                498
Indicator                498
MEASURE                  498
Measure                  498
INEQUALITY               498
Inequality               498
Unit Code                498
Unit                     498
PowerCode Code           498
PowerCode                498
Reference Period Code      0
Reference Period           0
Value                    498
Flag Codes                84
Flags                     84
dtype: int64

In [27]:
# Build the analysis frame from the country name and measured value only,
# renumbering the rows from 0 because the filter above left gaps in the index
oecd_df = oecd_bli.loc[:, ['Country', 'Value']].reset_index(drop=True)
oecd_df.head()

Unnamed: 0,Country,Value
0,Australia,2.7
1,Austria,1.3
2,Belgium,2.2
3,Canada,3.0
4,Czech Republic,2.6


In [28]:
# Non-null count for each of the two retained columns
oecd_df.count()

Country    498
Value      498
dtype: int64

### Merge the DataFrames

In [None]:
# Prepare the data
# NOTE(review): prepare_country_stats is not defined or imported in the cells shown here —
# confirm it exists before this cell runs, otherwise the call raises NameError.
# NOTE(review): oecd_bli was rebound earlier to the 'Average score' rows only, and
# gdp_per_cap is still the raw frame (the cleaned `oecd_df` / `gdp` frames are not the
# ones passed) — confirm these are the inputs the helper expects.
country_stats = prepare_country_stats(oecd_bli,gdp_per_cap)

In [None]:
# Print the (rows, columns) shape of each frame, one per line
print(oecd_bli.shape, gdp_per_cap.shape, sep='\n')

In [None]:
# Number of missing values in each column of oecd_bli
oecd_bli.isna().sum()