# ESA4 

Load the countries.csv directly via URL import into your panda data frame!

In [16]:
import numpy as np
import pandas as pd

In [17]:
url = "https://raw.githubusercontent.com/edlich/eternalrepo/master/DS-WAHLFACH/countries.csv"
np_data = np.genfromtxt(url, delimiter=",", dtype=(object))
pd_data = pd.read_csv(url)

Display some basic information as rows, columns and some basic statistical info.

In [18]:
np.info(np_data)

class:  ndarray
shape:  (6, 5)
strides:  (40, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x555c6efe0160
byteorder:  little
byteswap:  False
type: object


In [19]:
print(pd_data.describe())

             People          Area         BIP
count  5.000000e+00  5.000000e+00     5.00000
mean   1.027863e+08  3.907400e+06  2716.20000
std    6.759970e+07  4.904957e+06  1457.86083
min    3.650310e+07  3.013380e+05  1529.00000
25%    6.050172e+07  3.573850e+05  1798.00000
50%    8.252165e+07  3.778350e+05  1850.00000
75%    1.260450e+08  8.515770e+06  3466.00000
max    2.083600e+08  9.984670e+06  4938.00000


Show the last 4 rows of the data_frame

In [20]:
np_data[-4: , :]

array([[b'Japan', b'126045000', b'377835', b'4938', b'YEN'],
       [b'Canada', b'36503097', b'9984670', b'1529', b'CAD'],
       [b'Italy', b'60501718', b'301338', b'1850', b'EUR'],
       [b'Brazilia', b'208360000', b'8515770', b'1798', b'REAL']],
      dtype=object)

Let's cut off the nasty header

In [21]:
country_data = np.genfromtxt(url, delimiter=",", skip_header=1, dtype=(object))
country_data

array([[b'Germany', b'82521653', b'357385', b'3466', b'EUR'],
       [b'Japan', b'126045000', b'377835', b'4938', b'YEN'],
       [b'Canada', b'36503097', b'9984670', b'1529', b'CAD'],
       [b'Italy', b'60501718', b'301338', b'1850', b'EUR'],
       [b'Brazilia', b'208360000', b'8515770', b'1798', b'REAL']],
      dtype=object)

Show all the row of countries who have the EURO

In [22]:
with_euro = country_data[:,4].astype(str) == 'EUR'
country_data[with_euro, :]

array([[b'Germany', b'82521653', b'357385', b'3466', b'EUR'],
       [b'Italy', b'60501718', b'301338', b'1850', b'EUR']], dtype=object)

Show only name and Currency in a new data frame

In [23]:
country_data[:,[0,4]]

array([[b'Germany', b'EUR'],
       [b'Japan', b'YEN'],
       [b'Canada', b'CAD'],
       [b'Italy', b'EUR'],
       [b'Brazilia', b'REAL']], dtype=object)

Show only the rows/countries that have more than 2000 BIP (it is in Milliarden USD Bruttoinlandsprodukt)

In [24]:
bip_over_2000 = country_data[:,3].astype(int) > 2000
country_data[bip_over_2000, :]

array([[b'Germany', b'82521653', b'357385', b'3466', b'EUR'],
       [b'Japan', b'126045000', b'377835', b'4938', b'YEN']], dtype=object)

Select all countries where with inhabitants between 50 and 150 Mio

In [25]:
inhabitants = country_data[:,1].astype(int) 
inhabitants_between_50_150_mio = (inhabitants >= 50000000) & (inhabitants <=150000000)
country_data[inhabitants_between_50_150_mio, :]

array([[b'Germany', b'82521653', b'357385', b'3466', b'EUR'],
       [b'Japan', b'126045000', b'377835', b'4938', b'YEN'],
       [b'Italy', b'60501718', b'301338', b'1850', b'EUR']], dtype=object)

Change BIP to Bip

In [26]:
np.where(np_data.astype(str) == "BIP", "Bip", np_data)

array([[b'Name', b'People', b'Area', 'Bip', b'Currency'],
       [b'Germany', b'82521653', b'357385', b'3466', b'EUR'],
       [b'Japan', b'126045000', b'377835', b'4938', b'YEN'],
       [b'Canada', b'36503097', b'9984670', b'1529', b'CAD'],
       [b'Italy', b'60501718', b'301338', b'1850', b'EUR'],
       [b'Brazilia', b'208360000', b'8515770', b'1798', b'REAL']],
      dtype=object)

Calculate the Bip sum

In [27]:
country_data[:,3].astype(int).sum()

13581

Calculate the average people of all countries

In [28]:
country_data[:,1].astype(int).mean()

102786293.6

Sort by name alphabetically

In [29]:
country_data[:,0].sort()
country_data

array([[b'Brazilia', b'82521653', b'357385', b'3466', b'EUR'],
       [b'Canada', b'126045000', b'377835', b'4938', b'YEN'],
       [b'Germany', b'36503097', b'9984670', b'1529', b'CAD'],
       [b'Italy', b'60501718', b'301338', b'1850', b'EUR'],
       [b'Japan', b'208360000', b'8515770', b'1798', b'REAL']],
      dtype=object)

Create a new data frame from the original where the area is changed as follows: all countries with > 1000000 get BIG and <= 1000000 get SMALL in the cell replaced!

In [30]:
country_data_area_mod = country_data.copy()
all_small = country_data_area_mod[:,2].astype(int) <= 1000000

country_data_area_mod[all_small, 2] = "SMALL"
country_data_area_mod[all_small == False, 2] = "BIG"

country_data_area_mod


array([[b'Brazilia', b'82521653', 'SMALL', b'3466', b'EUR'],
       [b'Canada', b'126045000', 'SMALL', b'4938', b'YEN'],
       [b'Germany', b'36503097', 'BIG', b'1529', b'CAD'],
       [b'Italy', b'60501718', 'SMALL', b'1850', b'EUR'],
       [b'Japan', b'208360000', 'BIG', b'1798', b'REAL']], dtype=object)