## Анализ набора данных с использованием `pandas`

### Загрузка данных и первичный обзор

В этом задании мы будем работать с набором данных `gapminder`, который содержит информацию о различных странах и их показателях развития. Мы будем использовать библиотеку Pandas для базового анализа данных.

In [2]:
import pandas as pd
# Load the gapminder table from the relative path 'gapminder.csv'.
gapminder_data = pd.read_csv('gapminder.csv')

Выведите информацию о первых 10 записях набора данных: 

In [5]:
# Show the first 10 rows of the table.
gapminder_data.head(10)

Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population
0,Afghanistan,Asia,2.03,Level 1,62.7,0.254,increase,37.2
1,Albania,Europe,13.3,Level 3,78.4,1.59,increase,2.88
2,Algeria,Africa,11.6,Level 3,76.0,3.69,increase,42.2
3,Andorra,Europe,58.3,Level 4,82.1,6.12,decrease,0.077
4,Angola,Africa,6.93,Level 2,64.6,1.12,decrease,30.8
5,Antigua and Barbuda,Americas,21.0,Level 3,76.2,5.88,increase,0.0963
6,Argentina,Americas,22.7,Level 3,76.5,4.41,decrease,44.4
7,Armenia,Europe,12.7,Level 3,75.6,1.89,decrease,2.95
8,Australia,Asia,49.0,Level 4,82.9,16.9,decrease,24.9
9,Austria,Europe,55.3,Level 4,82.1,7.75,decrease,8.89


Выведите информацию о количестве записей и столбцов в наборе данных: 

In [6]:
# `shape` is a (number of rows, number of columns) tuple.
gapminder_data.shape

(193, 8)

Сгруппируйте данные по региону и сохраните в переменной `region_means` усреднённые значения показателей `income`, `life_exp`, `co2`, `population`:

In [8]:
# Per-region averages of the four numeric indicators. The task statement asks
# for the result to be stored under the name `region_means`.
region_means = gapminder_data.groupby('region')[['income', 'life_exp', 'co2', 'population']].mean()
region_mean = region_means  # alias so the previously used name keeps working
region_means

Unnamed: 0_level_0,income,life_exp,co2,population
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,5.733426,65.277778,1.213459,23.597094
Americas,18.022857,75.24,4.204057,28.597837
Asia,20.477627,72.779661,6.282186,76.264483
Europe,38.12,78.866667,6.344,18.812778


Отобразите список уникальных регионов, присутствующих в данных: 

In [9]:
# Distinct values of the `region` column, returned as a NumPy array.
gapminder_data['region'].unique()

array(['Asia', 'Europe', 'Africa', 'Americas'], dtype=object)

Выведите список стран, входящих в регион 'Europe':

In [11]:
# Pick the `country` column for the rows whose region is 'Europe' in one
# `.loc` call (row mask + column label) rather than chained `[...][...]`
# indexing.
gapminder_data.loc[gapminder_data['region'] == 'Europe', 'country']

1                     Albania
3                     Andorra
7                     Armenia
9                     Austria
10                 Azerbaijan
15                    Belarus
16                    Belgium
21     Bosnia and Herzegovina
25                   Bulgaria
42                    Croatia
44                     Cyprus
45             Czech Republic
46                    Denmark
55                    Estonia
59                    Finland
60                     France
63                    Georgia
64                    Germany
66                     Greece
75                    Hungary
76                    Iceland
81                    Ireland
83                      Italy
93                     Latvia
98                  Lithuania
99                 Luxembourg
105                     Malta
111                   Moldova
113                Montenegro
120               Netherlands
126           North Macedonia
127                    Norway
137                    Poland
138       

Найдите медианное значение продолжительности жизни ('life_exp') по региону 'Africa':

In [13]:
# Median of `life_exp` over the rows whose region is 'Africa'.
in_africa = gapminder_data['region'] == 'Africa'
gapminder_data.loc[in_africa, 'life_exp'].median()

np.float64(64.7)

Найдите страну с наибольшим количеством выбросов CO2 ('co2').

In [16]:
# Order all rows by CO2 emissions, largest first; the first row of the result
# is the country with the highest `co2` value.
gapminder_data.sort_values('co2', ascending=False)

Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population
139,Qatar,Asia,91.000,Level 4,76.1,38.0000,decrease,2.78
175,Trinidad and Tobago,Americas,26.300,Level 4,74.8,31.3000,decrease,1.39
90,Kuwait,Asia,50.500,Level 4,81.6,23.7000,decrease,4.14
182,United Arab Emirates,Asia,67.000,Level 4,73.7,21.4000,decrease,9.63
12,Bahrain,Asia,46.400,Level 4,77.0,19.8000,decrease,1.57
...,...,...,...,...,...,...,...,...
33,Chad,Africa,1.580,Level 1,59.9,0.0656,decrease,15.50
32,Central African Republic,Africa,0.933,Level 1,51.5,0.0651,increase,4.67
27,Burundi,Africa,0.762,Level 1,63.4,0.0467,increase,11.20
154,Somalia,Africa,1.130,Level 1,57.9,0.0466,decrease,15.00


Создайте новый столбец 'income_category', который будет содержать значения 'low', 'medium' и 'high' в зависимости от уровня дохода ('income').

In [18]:
# Cut points: the 30th and 70th percentiles of `income` separate the
# 'low', 'medium' and 'high' bands.
income = gapminder_data['income']
low_limit = income.quantile(0.3)
med_limit = income.quantile(0.7)
print(low_limit, med_limit)

# Vectorized labelling instead of a per-row `apply`: start from 'medium'
# (low_limit <= income <= med_limit) and overwrite the two outer bands.
income_category = pd.Series('medium', index=income.index, dtype=object)
income_category[income < low_limit] = 'low'
income_category[income > med_limit] = 'high'

# A missing income fails every comparison above; keep its category missing
# as well instead of letting it fall through to 'high'.
gapminder_data['income_category'] = income_category.where(income.notna())
gapminder_data

5.276 23.799999999999955


Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population,income_category
0,Afghanistan,Asia,2.03,Level 1,62.7,0.254,increase,37.200,low
1,Albania,Europe,13.30,Level 3,78.4,1.590,increase,2.880,medium
2,Algeria,Africa,11.60,Level 3,76.0,3.690,increase,42.200,medium
3,Andorra,Europe,58.30,Level 4,82.1,6.120,decrease,0.077,high
4,Angola,Africa,6.93,Level 2,64.6,1.120,decrease,30.800,medium
...,...,...,...,...,...,...,...,...,...
188,Venezuela,Americas,12.40,Level 3,75.4,4.810,decrease,28.900,medium
189,Vietnam,Asia,7.59,Level 2,74.3,2.160,increase,95.500,medium
190,Yemen,Asia,2.66,Level 2,66.1,0.356,decrease,28.500,low
191,Zambia,Africa,3.52,Level 2,62.5,0.302,increase,17.400,low


In [20]:
import numpy as np
# Re-label `income_category` with fixed cut points: below 12.8 -> 'low',
# from 12.8 up to (but excluding) 28 -> 'medium', everything else -> 'high'.
income = gapminder_data['income']
gapminder_data['income_category'] = np.select(
    [income < 12.8, income < 28],  # the first condition that holds wins
    ['low', 'medium'],
    default='high',
)
gapminder_data

Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population,income_category
0,Afghanistan,Asia,2.03,Level 1,62.7,0.254,increase,37.200,low
1,Albania,Europe,13.30,Level 3,78.4,1.590,increase,2.880,medium
2,Algeria,Africa,11.60,Level 3,76.0,3.690,increase,42.200,low
3,Andorra,Europe,58.30,Level 4,82.1,6.120,decrease,0.077,high
4,Angola,Africa,6.93,Level 2,64.6,1.120,decrease,30.800,low
...,...,...,...,...,...,...,...,...,...
188,Venezuela,Americas,12.40,Level 3,75.4,4.810,decrease,28.900,low
189,Vietnam,Asia,7.59,Level 2,74.3,2.160,increase,95.500,low
190,Yemen,Asia,2.66,Level 2,66.1,0.356,decrease,28.500,low
191,Zambia,Africa,3.52,Level 2,62.5,0.302,increase,17.400,low


Посчитайте общее население (`population`) по региону `Americas`:

In [21]:
# Sum of `population` over the rows whose region is 'Americas'.
in_americas = gapminder_data['region'] == 'Americas'
gapminder_data.loc[in_americas, 'population'].sum()

np.float64(1000.9243)

In [22]:
# Sum of `population` for the 'Americas' rows; `query` does the row filtering
# before the column is selected.
gapminder_data.query("region == 'Americas'")['population'].sum()

np.float64(1000.9243)