In [13]:
import pandas as pd
import numpy as np

## Series

Series представляет собой объект, похожий на одномерный массив; его отличительная черта — наличие индексов. При выводе индекс отображается слева, а сам элемент — справа.

### Синтаксис создания:

- pandas.Series(input_data, index, data_type)
- input_data: ввод в виде списка, константы, массива NumPy, Dict и т. д.
- index: значения индексов.
- data_type (опционально): тип данных.

In [14]:
# Build a Series with explicit string labels instead of the default integer index.
labels = ['one', 'two', 'three', 'four', 'five']
values = [4, 7, 6, 3, 9]
a = pd.Series(data=values, index=labels)
a

one      4
two      7
three    6
four     3
five     9
dtype: int64

In [15]:
# Without an explicit index pandas falls back to a default integer index.
# Note: this rebinds `a`, so the labelled Series built earlier is gone.
a = pd.Series([4, 7, 6, 3, 9])
a

0    4
1    7
2    6
3    3
4    9
dtype: int64

In [16]:
# Index object of the Series (here the default RangeIndex).
a.index

RangeIndex(start=0, stop=5, step=1)

In [17]:
# Underlying data as a NumPy array.
a.values

array([4, 7, 6, 3, 9], dtype=int64)

In [18]:
# With the default index the label 0 refers to the first element.
a[0]

4

In [19]:
# Label 1 refers to the second element.
a[1]

7

In [20]:
# `a` now has the default integer index, so the label 'one' does not exist.
# The KeyError is caught and printed so that "Run All" is not interrupted here.
try:
    a['one']
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: 'one'

## DataFrame

Объект DataFrame является табличной структурой данных. В любой таблице всегда присутствуют строки и столбцы. При этом в столбцах можно хранить данные разных типов данных. Столбцами в объекте DataFrame выступают объекты Series, строки которых являются их элементами.

## Синтаксис создания:

### pandas.DataFrame(input_data, index)

- input_data: ввод в виде Dict, 2D массива NumPy, Series и т. д.
- index: значения индексов.

In [None]:
# Each dictionary key becomes a column name and each list becomes that column's values.
ages = [46, 37, 44, 42, 42]
countries = ['Spain', 'Spain', 'Germany', 'Germany', 'France']
genders = ['Female', 'Female', 'Male', 'Male', 'Male']
df = pd.DataFrame({'Age': ages, 'Country': countries, 'Gender': genders})
df

In [None]:
# Select a single column by name; the result is a Series.
df['Age']

In [None]:
# Attribute-style access to the 'Country' column.
df.Country

In [None]:
# A list of names selects several columns, in the listed order, as a DataFrame.
df[['Country', 'Age']]

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Row labels of the frame.
df.index

In [None]:
# Same toy table as before, but with custom row labels passed through `index`.
columns_data = {
    'Age': [46, 37, 44, 42, 42],
    'Country': ['Spain', 'Spain', 'Germany', 'Germany', 'France'],
    'Gender': ['Female', 'Female', 'Male', 'Male', 'Male'],
}
row_labels = [5, 4, 6, 3, 2]
df = pd.DataFrame(columns_data, index=row_labels)

df

In [None]:
# Assigning to .index replaces the row labels in place;
# the list must contain exactly one label per row.
df.index = [101, 102, 103, 104, 105]
df

## Считывание данных

В целом, pandas поддерживает все самые популярные форматы хранения данных: csv, excel, sql, html и многое другое, но чаще всего приходится работать именно с csv файлами (comma separated values).

Будем работать с датасетом по оттоку клиентов из банка https://www.kaggle.com/datasets/shubh0799/churn-modelling.

#### Характеристики каждого клиента:

1. RowNumber - Номер строки
2. CustomerId - Уникальный идентификатор клиента
3. Surname - Фамилия клиента
4. CreditScore - Кредитная оценка клиента
5. Geography - Из какой страны клиент
6. Gender - Пол клиента
7. Age - Возраст клиента
8. Tenure - Сколько лет человек является клиентом банка
9. Balance - Баланс счета
10. NumOfProducts - Количество открытых продуктов
11. HasCrCard - Есть ли у клиента кредитная карта
12. IsActiveMember - Является ли клиент активным участником
13. EstimatedSalary - Предположительная зарплата клиента
14. Exited - Уйдет ли человек в отток

In [None]:
# Load the churn dataset from the current working directory.
# `df` is rebound here: from now on it is the churn table, not the toy frame above.
df = pd.read_csv('./Churn_Modelling.csv')
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [None]:
# header=1 uses the second line of the file (0-based numbering) as the column names.
pd.read_csv('./Churn_Modelling.csv', header=1)

In [None]:
# Explicit field separator; ',' is also the default of read_csv.
pd.read_csv('./Churn_Modelling.csv', sep=',')

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [None]:
# A bare DataFrame shows only the first and last rows of a large table.
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [None]:
# First five rows (default n=5).
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# Last five rows (default n=5).
df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [None]:
# One randomly chosen row (default n=1); no random_state is set,
# so a different row is returned on each run.
df.sample()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2402,2403,15646539,Liao,531,France,Male,31,3,96288.26,1,1,0,56794.73,0


In [None]:
# frac=1 returns all rows in random order, i.e. a shuffled copy.
df.sample(frac=1)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6892,6893,15705009,Cartwright,649,France,Female,56,8,156974.26,1,1,0,89405.26,1
2902,2903,15804002,Kovalev,691,France,Female,33,1,128306.83,1,1,1,113580.79,0
6887,6888,15662641,Amadi,850,France,Male,19,8,0.00,1,1,1,68569.89,0
6844,6845,15601627,Siciliano,587,France,Male,33,8,148163.57,1,0,0,122925.40,0
6628,6629,15584967,Chiganu,596,Spain,Male,57,6,0.00,2,1,1,72402.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3915,3916,15672246,Jefferies,686,Germany,Male,43,2,134896.03,1,1,1,97847.05,0
1717,1718,15684269,Gray,707,Spain,Female,35,3,56674.48,1,1,0,17987.40,1
1059,1060,15812197,Kline,850,France,Male,38,7,80293.98,1,0,0,126555.74,0
3509,3510,15729958,Wilkinson,777,France,Male,37,1,0.00,1,1,1,126837.72,0


In [22]:
# frac=0.5 returns a random half of the rows.
df.sample(frac=0.5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5920,5921,15640648,Howe,698,France,Male,36,6,0.00,2,0,1,19231.98,0
4012,4013,15726814,Walton,554,Spain,Male,46,4,0.00,2,0,1,57320.92,0
5745,5746,15623649,Ogle,629,Spain,Male,32,3,0.00,2,1,1,15404.64,0
6101,6102,15588695,Su,833,Spain,Male,32,6,0.00,1,1,1,44323.22,1
9091,9092,15630195,Johnstone,745,France,Female,40,6,131184.67,1,1,1,49815.62,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522,1523,15720662,Sholes,787,France,Female,35,1,106266.80,1,1,1,16607.15,0
4704,4705,15754113,Li,588,France,Female,35,0,0.00,2,1,1,155485.24,0
6388,6389,15582033,Manfrin,753,Germany,Male,44,3,138076.47,1,1,0,15523.09,1
3546,3547,15653753,Chiemenam,542,Spain,Male,43,6,113567.94,1,1,0,89543.25,0


In [23]:
# (number of rows, number of columns).
df.shape

(10000, 14)

## Первичный анализ данных
#### Типы данных:

- int: целочисленные значения. Пример: 9, 56, 30
- float: вещественные значения (с плавающей точкой). Пример: 7.3, 9.0, 45.334
- object/str: строковые значения. Пример: ‘hello, world’, ‘50 000’

In [21]:
# Summary of column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


#### Выводятся значения:

- Count - количество непропущенных объектов (там, где нет nan значений)
- mean - арифметическое среднее
- std - стандартное отклонение
- min - минимальное значение
- 25% - квантиль 25 процентов
- 50% - квантиль 50 процентов или же медиана
- 75% - квантиль 75 процентов
- max - максимальное значение

In [24]:
# Descriptive statistics; by default only numeric columns are included.
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [25]:
# Smallest value in the Age column.
df['Age'].min()

18

In [26]:
# Largest value in the Balance column.
df['Balance'].max()

250898.09

In [27]:
# Column-wise mean for several columns at once; the result is a Series indexed by column name.
df[['CreditScore', 'Age', 'Tenure']].mean()

CreditScore    650.5288
Age             38.9218
Tenure           5.0128
dtype: float64

#### Получаем 4 значения:

- count - количество непропущенных объектов
- unique - количество уникальных значений
- top - самое частотное значение (мода)
- freq - частота появления самого частотного значения

In [28]:
# Statistics for string (object) columns: count, unique, top, freq.
df.describe(include=['object'])

Unnamed: 0,Surname,Geography,Gender
count,10000,10000,10000
unique,2932,3,2
top,Smith,France,Male
freq,32,5014,5457


In [29]:
# dtype of every column.
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [30]:
# dtype of a single column.
df['Age'].dtype

dtype('int64')

In [31]:
# astype returns a converted copy; df itself is not changed by this cell.
df['HasCrCard'].astype('bool')

0        True
1       False
2        True
3       False
4        True
        ...  
9995     True
9996     True
9997    False
9998     True
9999     True
Name: HasCrCard, Length: 10000, dtype: bool

In [32]:
# Store the converted column back into the frame (0/1 become False/True).
df['HasCrCard'] = df['HasCrCard'].astype('bool')

In [33]:
# Confirm that the column is now boolean.
df['HasCrCard'].dtype

dtype('bool')

In [34]:
# Distinct values, in order of first appearance.
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [35]:
# Number of distinct values.
df['Geography'].nunique()

3

In [36]:
# Row count per value, sorted from most to least frequent.
df['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [37]:
# normalize=True returns shares (fractions of the total) instead of counts.
df['Geography'].value_counts(normalize=True)

Geography
France     0.5014
Germany    0.2509
Spain      0.2477
Name: proportion, dtype: float64

### Фильтрация
#### Фильтрация в pandas основывается на булевых масках.

##### Булева маска — набор значений True/False, с помощью которого из структуры данных выбираются определённые объекты.

In [38]:
# Element-wise comparison produces a boolean Series (the mask).
df['Gender'] == 'Male'

0       False
1       False
2       False
3       False
4       False
        ...  
9995     True
9996     True
9997    False
9998     True
9999    False
Name: Gender, Length: 10000, dtype: bool

In [39]:
# Indexing with the mask keeps only the rows where it is True.
male = df[df['Gender'] == 'Male']
male

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,True,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
8,9,15792365,He,501,France,Male,44,4,142051.07,2,False,1,74940.50,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,True,1,71725.73,0
10,11,15767821,Bearce,528,France,Male,31,6,102016.72,2,False,0,80181.12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,9993,15657105,Chukwualuka,726,Spain,Male,36,2,0.00,1,True,0,195192.40,0
9993,9994,15569266,Rahman,644,France,Male,28,7,155060.41,1,True,0,29179.52,0
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0


## Логическое И
##### При операторе & нужно, чтобы выполнялось два условия одновременно:

In [40]:
# Each condition needs its own parentheses: & binds tighter than == and >=.
df[(df['Gender'] == 'Female') & (df['NumOfProducts'] >= 3)]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,True,0,119346.88,1
30,31,15589475,Azikiwe,591,Spain,Female,39,3,0.00,3,True,0,140469.38,1
88,89,15622897,Sharpe,646,France,Female,46,4,0.00,3,True,0,93251.42,1
90,91,15757535,Heap,647,Spain,Female,44,5,0.00,3,True,1,174205.22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9565,9566,15752294,Long,582,France,Female,38,9,135979.01,4,True,1,76582.95,1
9747,9748,15775761,Iweobiegbunam,610,Germany,Female,69,5,86038.21,3,False,0,192743.06,1
9800,9801,15640507,Li,762,Spain,Female,35,3,119349.69,3,True,1,47114.18,1
9877,9878,15572182,Onwuamaeze,505,Germany,Female,33,3,106506.77,3,True,0,45445.78,1


## Логическое ИЛИ
##### При операторе | нужно, чтобы выполнялось хотя бы одно условие:

In [41]:
# HasCrCard was converted to bool earlier, so the column itself can serve as a mask.
df[(df['HasCrCard']) | (df['NumOfProducts'] >= 3)]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,True,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,True,1,79084.10,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,True,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,9994,15569266,Rahman,644,France,Male,28,7,155060.41,1,True,0,29179.52,0
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,True,0,92888.52,1


## Логическое НЕ
#### Оператор ~ инвертирует булеву маску: True меняется на False и наоборот:

In [42]:
# ~ inverts the mask: every row whose Geography is not 'Spain'.
df[~(df['Geography'] == 'Spain')]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,True,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,False,0,93826.63,0
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,True,0,119346.88,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,False,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,True,0,92888.52,1


In [43]:
# isin keeps rows whose value is in the given list. With only three countries
# in the data this selects the same rows as excluding 'Spain'.
df[df['Geography'].isin(['France', 'Germany'])]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,True,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,False,0,93826.63,0
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,True,0,119346.88,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,False,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,True,0,92888.52,1


## Индексация

In [44]:
# Rows for Spain, then three columns only. The original row labels are kept,
# so the index of df_small has gaps (1, 4, 5, 11, ...).
df_small = df[(df['Geography'] == 'Spain')][['Geography', 'Gender', 'Age']]
df_small.head()

Unnamed: 0,Geography,Gender,Age
1,Spain,Female,41
4,Spain,Female,43
5,Spain,Male,44
11,Spain,Male,24
14,Spain,Female,35


## loc

In [45]:
# loc selects by label: the row whose index label is 1.
df_small.loc[1]

Geography     Spain
Gender       Female
Age              41
Name: 1, dtype: object

In [None]:
# loc looks rows up by label, and df_small has no row labelled 3
# (that customer is not from Spain), so this lookup raises KeyError.
# The error is caught and printed so that "Run All" keeps going.
try:
    df_small.loc[3]
except KeyError as err:
    print(f'KeyError: {err}')

In [47]:
# A list of row labels plus a list of column names selects a sub-table.
df_small.loc[[1, 4, 5], ['Gender', 'Age']]

Unnamed: 0,Gender,Age
1,Female,41
4,Female,43
5,Male,44


## iloc

In [48]:
# Show the frame again for comparison with the positional (iloc) examples.
df_small.head()

Unnamed: 0,Geography,Gender,Age
1,Spain,Female,41
4,Spain,Female,43
5,Spain,Male,44
11,Spain,Male,24
14,Spain,Female,35


In [49]:
# iloc selects by position: the first three rows, whatever their labels are.
df_small.iloc[[0, 1, 2]]

Unnamed: 0,Geography,Gender,Age
1,Spain,Female,41
4,Spain,Female,43
5,Spain,Male,44


In [None]:
# iloc counts positions from 0, and df_small has fewer than 2501 rows
# (value_counts reports 2477 customers from Spain), so this raises IndexError.
# The error is caught and printed so that "Run All" keeps going.
try:
    df_small.iloc[2500]
except IndexError as err:
    print(f'IndexError: {err}')

In [50]:
# Row at position 0, columns at positions 0 and 2.
df_small.iloc[0, [0, 2]]

Geography    Spain
Age             41
Name: 1, dtype: object

## Сортировки

In [51]:
# Sort rows by Age in ascending order (the default); returns a new frame.
df.sort_values('Age')

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3512,3513,15657779,Boylan,806,Spain,Male,18,3,0.00,2,True,1,86994.54,0
1678,1679,15569178,Kharlamov,570,France,Female,18,4,82767.42,1,True,0,71811.90,0
3517,3518,15757821,Burgess,771,Spain,Male,18,1,0.00,2,False,0,41542.95,0
9520,9521,15673180,Onyekaozulu,727,Germany,Female,18,2,93816.70,2,True,0,126172.11,0
2021,2022,15795519,Vasiliev,716,Germany,Female,18,3,128743.80,1,False,0,197322.13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387,3388,15798024,Lori,537,Germany,Male,84,8,92242.34,1,True,1,186235.98,0
3033,3034,15578006,Yao,787,France,Female,85,10,0.00,2,True,1,116537.96,0
2458,2459,15813303,Rearick,513,Spain,Male,88,10,0.00,2,True,1,52952.24,0
6759,6760,15660878,T'ien,705,France,Male,92,1,126076.24,2,True,1,34436.83,0


In [52]:
# ascending=False reverses the order: oldest customers first.
df.sort_values('Age', ascending=False)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6443,6444,15764927,Rogova,753,France,Male,92,3,121513.31,1,False,1,195563.99,0
6759,6760,15660878,T'ien,705,France,Male,92,1,126076.24,2,True,1,34436.83,0
2458,2459,15813303,Rearick,513,Spain,Male,88,10,0.00,2,True,1,52952.24,0
3033,3034,15578006,Yao,787,France,Female,85,10,0.00,2,True,1,116537.96,0
3387,3388,15798024,Lori,537,Germany,Male,84,8,92242.34,1,True,1,186235.98,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9782,9783,15728829,Weigel,509,France,Male,18,7,102983.91,1,True,0,171770.58,0
2141,2142,15758372,Wallace,674,France,Male,18,7,0.00,2,True,1,55753.12,1
9501,9502,15634146,Hou,835,Germany,Male,18,2,142872.36,1,True,1,117632.63,0
9520,9521,15673180,Onyekaozulu,727,Germany,Female,18,2,93816.70,2,True,0,126172.11,0


In [53]:
# Sort by Age first; rows with equal Age are ordered by CreditScore.
df.sort_values(['Age', 'CreditScore'])

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9782,9783,15728829,Weigel,509,France,Male,18,7,102983.91,1,True,0,171770.58,0
1678,1679,15569178,Kharlamov,570,France,Female,18,4,82767.42,1,True,0,71811.90,0
9029,9030,15722701,Bruno,594,Germany,Male,18,1,132694.73,1,True,0,167689.56,0
7334,7335,15759133,Vaguine,616,France,Male,18,6,0.00,2,True,1,27308.58,0
9526,9527,15665521,Chiazagomekpele,642,Germany,Male,18,5,111183.53,2,False,1,10063.75,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387,3388,15798024,Lori,537,Germany,Male,84,8,92242.34,1,True,1,186235.98,0
3033,3034,15578006,Yao,787,France,Female,85,10,0.00,2,True,1,116537.96,0
2458,2459,15813303,Rearick,513,Spain,Male,88,10,0.00,2,True,1,52952.24,0
6759,6760,15660878,T'ien,705,France,Male,92,1,126076.24,2,True,1,34436.83,0


### Seminar

###  1. Скачать данные по ссылке https://www.kaggle.com/datasets/ionaskel/laptop-prices
     2. Считать данные с помощью pandas
     3. Вывести на экран первые 5 строк
     4. Посмотреть на описание признаков и на их содержание

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the laptop dataset; `df` is rebound and no longer refers to the churn table.
# The file is read as latin1 (presumably it is not valid UTF-8 — confirm against the data file).
df = pd.read_csv('./laptop_price.csv',encoding='latin1')
df 

In [None]:
# First five rows. The parentheses are required: a bare `df.head` only
# displays the bound method instead of calling it.
df.head()

### Задача 2

- Изучить типы данных
- Кол-во пропущенных ячеек в данных
- Посчитайте основные статистики по всем признакам и поизучайте их

In [None]:
# Column dtypes and non-null counts; a non-null count below the number of rows means missing cells.
df.info()

In [None]:
# Statistics for the numeric columns.
df.describe()

In [None]:
# Statistics for object (string) and float64 columns together.
df.describe(include=['object','float64'])

### Задача 3
#### 3.1 Ноутбуков какой компании больше всего в наборе данных?

In [None]:
# Number of laptops per manufacturer, most frequent first; the top entry answers the question.
x = df['Company'].value_counts()
x

#### 3.2 Какая минимальная и максимальная стоимость у ноутбуков в наборе данных ?

In [None]:
# (cheapest, most expensive) price, displayed as a tuple.
df['Price_euros'].min(),df['Price_euros'].max()

#### 3.3 Какой ноутбук самый дорогой в наборе данных?
##### Выведите все характеристики только по этому ноутбуку
##### Если таких ноутбуков несколько , то выводите их все

In [None]:
# Compare against the maximum instead of sorting and taking head(1):
# this returns every laptop that shares the highest price, as the task requires.
df[df['Price_euros'] == df['Price_euros'].max()]

In [None]:
# The top-priced laptop shown as a Series; iloc[0] takes only the first row
# after sorting, so ties for the highest price are not shown here.
df.sort_values('Price_euros',ascending=False).iloc[0]

### Задача 4 
#### Задача 4.1 Найдите ноутбук с самой маленькой диагональю
 и выведите следующие характеристики:
 1. Компания
 2. Тип ноутбука
 3. Диагональ
 4. Стоимость
  
 если таких несколько выведите их все


In [None]:
# All rows whose diagonal equals the minimum (ties included).
df [df['Inches'] == df['Inches'].min()]

In [None]:
# Same filter, restricted to the four requested columns.
df [df['Inches'] == df['Inches'].min()][['Company','TypeName','Inches','Price_euros']]

### 4.2 Сколько стоит самый дорогой ноутбук у компании HP ?

In [None]:
# Most expensive HP laptop: filter by company, sort by price descending, keep the first row.
df[df['Company'] == 'HP'].sort_values('Price_euros',ascending=False).head(1)

In [None]:
# The same row shown as a Series.
df[df['Company'] == 'HP'].sort_values('Price_euros',ascending=False).iloc[0]

### 4.3 Как много ноутбуков Ultrabook с 8GB RAM ?
Найдите, сколько таких ноутбуков с 8GB ОЗУ в процентном соотношении относительно всех ноутбуков 

In [None]:
# Ultrabooks with 8GB of RAM; Ram is compared as a string ('8GB') here.
u8 = df[(df['TypeName']=='Ultrabook')&(df['Ram']=='8GB')]
u8

In [None]:
# shape[0] is the number of matching rows.
# Note that u8 is rebound here from a DataFrame to an int.
u8 = df[(df['TypeName']=='Ultrabook')&(df['Ram']=='8GB')].shape[0]
u8

In [None]:
# Share of 8GB Ultrabooks among ALL laptops, in percent.
# The task asks for the share relative to the whole dataset, so the denominator
# is the total row count rather than the number of Ultrabooks.
u8 = df[(df['TypeName']=='Ultrabook')&(df['Ram']=='8GB')].shape[0]
total_laptops = df.shape[0]
u8 / total_laptops * 100

### Задача 5
#### Задача 5.1 Выберите ноутбук клиенту
Клиент хочет подобрать ноутбук с 8GB или 16GB ОЗУ на Windows 10 в стоимости до 500 евро, сколько у него вариантов?

In [None]:
# Three conditions combined with &; shape[0] counts the matching rows.
# The price limit is strict (< 500), so a laptop costing exactly 500 is excluded.
df[(df['OpSys'] == 'Windows 10') & (df['Ram'].isin (['8GB','16GB'])) & (df['Price_euros']< 500)].shape[0]

### 5.2 Выберите ноутбук клиенту
Клиент хочет подобрать ноутбук от MSI, с видеокартой Nvidia GeForce GTX 1050 Ti и главное не с диагональю 15.6. В какой ценовой категории вышли подобные ноутбуки?

In [None]:
# MSI laptops with the given GPU, excluding the 15.6-inch diagonal.
# Note that `x` is rebound here (it held the per-company counts earlier).
x = df[(df['Company'] =='MSI') & (df['Gpu'] == 'Nvidia GeForce GTX 1050 Ti') & (df['Inches']!= 15.6)]
x

In [None]:
# Price range (min, max) of the selected laptops.
x['Price_euros'].min(),x['Price_euros'].max()

### 5.3 Что дешевле?
В среднем дешевле ноутбуки с CPU Intel Core i7 7700HQ 2.8GHz или с Intel Core i7 7600U 2.8GHz?

In [None]:
# Mean price of laptops whose Cpu string matches exactly.
df[df['Cpu'] == 'Intel Core i7 7700HQ 2.8GHz']['Price_euros'].mean()

In [None]:
# Mean price for the second CPU, for comparison with the previous cell.
df[df['Cpu'] == 'Intel Core i7 7600U 2.8GHz']['Price_euros'].mean()

## 6 задача
Найдите самый легкий ноутбук
Но обратите внимание на тип и представление данных в признаке Weight, если что, замените в строке 'kg' на пустую строку через метод .str.replace()

In [None]:
# Weight appears to still be text at this point (values carrying a 'kg' suffix),
# so min() compares strings rather than numbers — the result is not the lightest laptop.
df['Weight'].min()

In [None]:
# Strip the unit suffix. The replacement is an empty string (not a space),
# as the task states, so that only the number remains in each cell.
df['Weight'] = df['Weight'].str.replace('kg', '')
df['Weight']

In [None]:
# Convert the cleaned strings to floating-point numbers.
df['Weight'] = df['Weight'].astype('float')
df['Weight']

In [None]:
# Now a numeric minimum: the weight of the lightest laptop.
df['Weight'].min()