## Анализ набора данных с использованием `pandas`

### Загрузка данных и первичный обзор

В этом задании мы будем работать с набором данных `gapminder`, который содержит информацию о различных странах и их показателях развития. Мы будем использовать библиотеку Pandas для базового анализа данных.

In [2]:
import pandas as pd
# Load the gapminder country indicators; the CSV is looked up relative to the
# notebook's working directory.
gapminder_data = pd.read_csv(filepath_or_buffer='gapminder.csv')

Выведите информацию о первых 10 записях набора данных: 

In [3]:
# Preview the first ten rows of the data set.
gapminder_data.iloc[:10]

Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population
0,Afghanistan,Asia,2.03,Level 1,62.7,0.254,increase,37.2
1,Albania,Europe,13.3,Level 3,78.4,1.59,increase,2.88
2,Algeria,Africa,11.6,Level 3,76.0,3.69,increase,42.2
3,Andorra,Europe,58.3,Level 4,82.1,6.12,decrease,0.077
4,Angola,Africa,6.93,Level 2,64.6,1.12,decrease,30.8
5,Antigua and Barbuda,Americas,21.0,Level 3,76.2,5.88,increase,0.0963
6,Argentina,Americas,22.7,Level 3,76.5,4.41,decrease,44.4
7,Armenia,Europe,12.7,Level 3,75.6,1.89,decrease,2.95
8,Australia,Asia,49.0,Level 4,82.9,16.9,decrease,24.9
9,Austria,Europe,55.3,Level 4,82.1,7.75,decrease,8.89


Выведите информацию о количестве записей и столбцов в наборе данных: 

In [4]:
# (number of rows, number of columns)
(len(gapminder_data.index), len(gapminder_data.columns))

(193, 8)

In [5]:
# Report the row and column counts, read from the frame's shape tuple.
print('Строки', gapminder_data.shape[0], 'Колонки', gapminder_data.shape[1])

Строки 193 Колонки 8


Сгруппируйте данные по региону и сохраните результат в переменной `region_means`, содержащей усреднённые значения показателей `income`, `life_exp`, `co2`, `population`:

In [6]:
# Average income per region: group the income column by the region key.
gapminder_data['income'].groupby(gapminder_data['region']).mean()

region
Africa       5.733426
Americas    18.022857
Asia        20.477627
Europe      38.120000
Name: income, dtype: float64

In [7]:
# Per-region averages of the requested indicators. The task asks for the
# result to be kept in `region_means`, so assign it before displaying it.
indicator_columns = ['income', 'life_exp', 'co2', 'population']
region_means = gapminder_data.groupby('region')[indicator_columns].mean()
region_means

Unnamed: 0_level_0,income,life_exp,co2,population
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,5.733426,65.277778,1.213459,23.597094
Americas,18.022857,75.24,4.204057,28.597837
Asia,20.477627,72.779661,6.282186,76.264483
Europe,38.12,78.866667,6.344,18.812778


Отобразите список уникальных регионов, присутствующих в данных: 

In [8]:
# Distinct region names, in order of first appearance.
pd.unique(gapminder_data['region'])

array(['Asia', 'Europe', 'Africa', 'Americas'], dtype=object)

Выведите список стран, входящих в регион 'Europe':

In [9]:
# Countries whose region is 'Europe', selected with a single .loc lookup
# (row mask + column label) rather than chained indexing.
is_europe = gapminder_data['region'] == 'Europe'
gapminder_data.loc[is_europe, 'country'].unique()

array(['Albania', 'Andorra', 'Armenia', 'Austria', 'Azerbaijan',
       'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria',
       'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia',
       'Finland', 'France', 'Georgia', 'Germany', 'Greece', 'Hungary',
       'Iceland', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
       'Malta', 'Moldova', 'Montenegro', 'Netherlands', 'North Macedonia',
       'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'Serbia',
       'Slovak Republic', 'Slovenia', 'Spain', 'Sweden', 'Switzerland',
       'Turkey', 'Ukraine', 'United Kingdom'], dtype=object)

Найдите медианное значение продолжительности жизни ('life_exp') по региону 'Africa':

In [10]:
# Median life expectancy across the rows whose region is 'Africa'.
is_africa = gapminder_data['region'] == 'Africa'
gapminder_data.loc[is_africa, 'life_exp'].median()


64.7

Найдите страну с наибольшим количеством выбросов CO2 (`co2`):

In [14]:
# Select the country name(s) of every row whose CO2 value equals the column maximum.
peak_co2 = gapminder_data['co2'].max()
gapminder_data.loc[gapminder_data['co2'] == peak_co2, 'country']

139    Qatar
Name: country, dtype: object

In [15]:
# idxmax gives the index label of the first row with the largest CO2 value;
# look up that row's country name.
top_emitter_label = gapminder_data['co2'].idxmax()
gapminder_data.loc[top_emitter_label, 'country']

'Qatar'

Создайте новый столбец 'income_category', который будет содержать значения 'low', 'medium' и 'high' в зависимости от уровня дохода ('income').

In [17]:
# Split countries into three income bands at the terciles of `income`.
low = gapminder_data['income'].quantile(1 / 3)
high = gapminder_data['income'].quantile(2 / 3)

# `between` is inclusive on both ends, so a country whose income equals a
# tercile exactly is labelled 'medium' instead of being left without a category.
gapminder_data.loc[gapminder_data['income'].between(low, high), 'income_category'] = 'medium'
gapminder_data.loc[gapminder_data['income'] < low, 'income_category'] = 'low'
gapminder_data.loc[gapminder_data['income'] > high, 'income_category'] = 'high'

# Show a random sample of 40 rows, ordered by income, to spot-check the labels.
gapminder_data.sample(40).sort_values('income')


Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population,income_category
154,Somalia,Africa,1.13,Level 1,57.9,0.0466,decrease,15.0,low
157,South Sudan,Africa,2.01,Level 1,63.0,0.171,increase,11.0,low
173,Togo,Africa,2.06,Level 1,64.5,0.434,increase,7.89,low
142,Rwanda,Africa,2.09,Level 1,68.3,0.0913,increase,12.3,low
89,Kiribati,Asia,2.29,Level 1,59.3,0.601,decrease,0.116,low
170,Tanzania,Africa,2.59,Level 1,66.7,0.222,increase,56.3,low
146,Senegal,Africa,3.31,Level 2,68.0,0.739,increase,15.9,low
169,Tajikistan,Asia,3.42,Level 2,69.2,0.603,increase,9.1,low
192,Zimbabwe,Africa,3.92,Level 2,60.6,0.85,increase,14.4,low
28,Cambodia,Asia,4.16,Level 2,69.7,0.64,increase,16.2,low


In [18]:
# Thresholds: the 38th and 62nd percentiles of income.
income_values = gapminder_data['income']
q_low = income_values.quantile(0.38)
q_hight = income_values.quantile(0.62)

# Build the labels in a separate series: start everyone at 'medium', then
# overwrite the rows strictly below / strictly above the two thresholds.
income_labels = pd.Series('medium', index=gapminder_data.index)
income_labels[income_values < q_low] = 'low'
income_labels[income_values > q_hight] = 'high'
gapminder_data['income_category'] = income_labels

display(gapminder_data)

Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population,income_category
0,Afghanistan,Asia,2.03,Level 1,62.7,0.254,increase,37.200,low
1,Albania,Europe,13.30,Level 3,78.4,1.590,increase,2.880,medium
2,Algeria,Africa,11.60,Level 3,76.0,3.690,increase,42.200,medium
3,Andorra,Europe,58.30,Level 4,82.1,6.120,decrease,0.077,high
4,Angola,Africa,6.93,Level 2,64.6,1.120,decrease,30.800,low
...,...,...,...,...,...,...,...,...,...
188,Venezuela,Americas,12.40,Level 3,75.4,4.810,decrease,28.900,medium
189,Vietnam,Asia,7.59,Level 2,74.3,2.160,increase,95.500,low
190,Yemen,Asia,2.66,Level 2,66.1,0.356,decrease,28.500,low
191,Zambia,Africa,3.52,Level 2,62.5,0.302,increase,17.400,low


In [19]:
# Bucket `income` into fixed bands: [0, 30] -> low, (30, 70] -> medium,
# above 70 -> high. The top band is open-ended so that any income above 100
# still gets a label instead of falling outside the bins and becoming NaN.
gapminder_data['income_category'] = pd.cut(
    gapminder_data['income'],
    bins=[0, 30, 70, float('inf')],
    labels=['low', 'medium', 'high'],
    include_lowest=True,
)

In [20]:
# Display the frame, which now includes the `income_category` column.
gapminder_data

Unnamed: 0,country,region,income,income_level,life_exp,co2,co2_change,population,income_category
0,Afghanistan,Asia,2.03,Level 1,62.7,0.254,increase,37.200,low
1,Albania,Europe,13.30,Level 3,78.4,1.590,increase,2.880,low
2,Algeria,Africa,11.60,Level 3,76.0,3.690,increase,42.200,low
3,Andorra,Europe,58.30,Level 4,82.1,6.120,decrease,0.077,medium
4,Angola,Africa,6.93,Level 2,64.6,1.120,decrease,30.800,low
...,...,...,...,...,...,...,...,...,...
188,Venezuela,Americas,12.40,Level 3,75.4,4.810,decrease,28.900,low
189,Vietnam,Asia,7.59,Level 2,74.3,2.160,increase,95.500,low
190,Yemen,Asia,2.66,Level 2,66.1,0.356,decrease,28.500,low
191,Zambia,Africa,3.52,Level 2,62.5,0.302,increase,17.400,low


Посчитайте общее население (`population`) по региону `Americas`:

In [21]:
# Total population over the rows whose region is 'Americas'.
is_americas = gapminder_data['region'] == 'Americas'
gapminder_data.loc[is_americas, 'population'].sum()

1000.9243