# **Polars**

In [1]:
import polars as pl

## Чтение из файла train.csv

In [2]:
df_polars = pl.read_csv("train.csv")

In [3]:
df_polars.head(10)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. Jam…","""male""",,0,0,"""330877""",8.4583,,"""Q"""
7,0,1,"""McCarthy, Mr. …","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


## Основная информация о датасете

In [4]:
df_polars.shape # Размер датасета

(891, 12)

In [5]:
df_polars.dtypes # Типы данных

[Int64,
 Int64,
 Int64,
 Utf8,
 Utf8,
 Float64,
 Int64,
 Int64,
 Utf8,
 Float64,
 Utf8,
 Utf8]

In [6]:
df_polars.estimated_size() # Объём данных в байтах

122102

In [7]:
df_polars.null_count() # Количество пропусков

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,177,0,0,0,0,687,2


In [8]:
df_polars.select(pl.mean("Sex")) # Средние значения пола

Sex
str
""


In [9]:
df_polars.select(pl.mean("Age")) # Средние значения возраста

Age
f64
29.699118


In [10]:
df_polars.select(pl.mean("Fare")) # Средние значения цены

Fare
f64
32.204208


## Количество пассажиров каждого класса

In [11]:
df_polars.group_by("Pclass").agg(pl.col("PassengerId").unique().count())

Pclass,PassengerId
i64,u32
2,184
1,216
3,491


## Количество выживших

> мужчин:


In [12]:
df_polars.filter((pl.col('Sex') == "male") & (pl.col('Survived') == 1)).sum()["Survived"]

Survived
i64
109


> женщин:

In [13]:
df_polars.filter((pl.col('Sex') == "female") & (pl.col('Survived') == 1)).sum()["Survived"]

Survived
i64
233


## Данные о пассажирах, возраст которых больше 44 лет

In [14]:
df_polars.filter(pl.col('Age') > 44)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
7,0,1,"""McCarthy, Mr. …","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
12,1,1,"""Bonnell, Miss.…","""female""",58.0,0,0,"""113783""",26.55,"""C103""","""S"""
16,1,2,"""Hewlett, Mrs. …","""female""",55.0,0,0,"""248706""",16.0,,"""S"""
34,0,2,"""Wheadon, Mr. E…","""male""",66.0,0,0,"""C.A. 24579""",10.5,,"""S"""
53,1,1,"""Harper, Mrs. H…","""female""",49.0,1,0,"""PC 17572""",76.7292,"""D33""","""C"""
55,0,1,"""Ostby, Mr. Eng…","""male""",65.0,0,1,"""113509""",61.9792,"""B30""","""C"""
63,0,1,"""Harris, Mr. He…","""male""",45.0,1,0,"""36973""",83.475,"""C83""","""S"""
93,0,1,"""Chaffee, Mr. H…","""male""",46.0,1,0,"""W.E.P. 5734""",61.175,"""E31""","""S"""
95,0,3,"""Coxon, Mr. Dan…","""male""",59.0,0,0,"""364500""",7.25,,"""S"""
97,0,1,"""Goldschmidt, M…","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""


# **Ускорение работы с pandas**

In [15]:
import pandas as pd

## Чтение из файла train.csv

In [16]:
df_pandas = pd.read_csv("train.csv")

## Cредний возраст пассажиров и его стандартное отклонение (исп. bottleneck)

In [17]:
!pip install bottleneck
import bottleneck as bn

Collecting bottleneck
  Downloading Bottleneck-1.3.7-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (354 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.0/354.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bottleneck
Successfully installed bottleneck-1.3.7


In [18]:
avg_age = bn.move_mean(df_pandas["Age"], window = 3)

In [19]:
avg_age # Средний возраст

array([        nan,         nan, 28.66666667, 33.        , 32.        ,
               nan,         nan,         nan, 27.66666667, 14.33333333,
       15.        , 25.33333333, 27.33333333, 39.        , 24.33333333,
       36.        , 23.66666667,         nan,         nan,         nan,
               nan,         nan, 28.        , 25.66666667, 17.        ,
       24.66666667,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
       45.33333333,         nan,         nan,         nan, 17.66666667,
       24.        , 27.        ,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan, 15.33333333, 25.66666667, 33.        , 47.66666667,
               nan,         nan,         nan, 18.16666667, 14.83333333,
       12.66666667, 23.66666667, 35.        , 29.        ,         nan,
               nan,         nan,         nan, 21.66666667, 20.66

In [20]:
std_deviation = bn.nanstd(df_pandas["Age"], ddof=0)

In [21]:
std_deviation # Стандартное отклонение

14.516321150817317

## Увеличение стоимости билетов — Fare_new

In [22]:
np_arr = df_pandas.to_numpy()

> Стоимость в начале:

In [23]:
df_pandas[['Pclass', 'Fare']]

Unnamed: 0,Pclass,Fare
0,3,7.2500
1,1,71.2833
2,3,7.9250
3,1,53.1000
4,3,8.0500
...,...,...
886,2,13.0000
887,1,30.0000
888,3,23.4500
889,1,30.0000


> Увеличение в зависимости от класса каюты:

In [24]:
res = [row[9] * 1.3 if (row[2] == 1 or row[2] == 2) else row[9] * 1.1 for row in np_arr]

> Добавление нового столбца:

In [25]:
df_pandas.insert(10, 'Fare_new', res)

In [26]:
df_pandas[['Pclass', 'Fare', 'Fare_new']]

Unnamed: 0,Pclass,Fare,Fare_new
0,3,7.2500,7.97500
1,1,71.2833,92.66829
2,3,7.9250,8.71750
3,1,53.1000,69.03000
4,3,8.0500,8.85500
...,...,...,...
886,2,13.0000,16.90000
887,1,30.0000,39.00000
888,3,23.4500,25.79500
889,1,30.0000,39.00000


# **Оптимизация типов pandas**

## Чтение из файла Housing.csv



In [27]:
df_housing = pd.read_csv("Housing.csv")

In [28]:
df_housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


## Оптимальные с точки зрения потребления памяти данные

In [29]:
df_housing.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

> **price**, **area** — *int64*, оптимальный тип, т.к. числа большие и могут заметно отличаться;

> **bedrooms**, **bathrooms**, **stories**, **parking** — *int64*, неоптимальный тип, т.к. числа маленькие повторяются.

> Подходящий тип — *category*;

> **mainroad**, **guestroom**, **basement**, **hotwaterheating**, **airconditioning**, **prefarea** — *object*, неоптимальный тип, т.к. эти значения говорят не о количествах или признаках, а о наличии/отсутствии (True/False).

> Подходящий тип — *bool*.

> **furnishingstatus** — *object*, оптимальный тип, т.к. описывает признак и может иметь необычные значения различной длины.

## Изменение типов столбцов + сравнение по памяти

> ***int64*** → ***category***



In [33]:
df_housing[['bedrooms', 'bathrooms', 'stories', 'parking']].info(memory_usage=bool)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   bedrooms   545 non-null    int64
 1   bathrooms  545 non-null    int64
 2   stories    545 non-null    int64
 3   parking    545 non-null    int64
dtypes: int64(4)
memory usage: 17.2 KB


In [34]:
df_housing[['bedrooms', 'bathrooms', 'stories', 'parking']].astype('category').info(memory_usage=bool)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   bedrooms   545 non-null    category
 1   bathrooms  545 non-null    category
 2   stories    545 non-null    category
 3   parking    545 non-null    category
dtypes: category(4)
memory usage: 3.1 KB


> ***object*** → ***bool***

In [35]:
df_housing[['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']].info(memory_usage=bool)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   mainroad         545 non-null    object
 1   guestroom        545 non-null    object
 2   basement         545 non-null    object
 3   hotwaterheating  545 non-null    object
 4   airconditioning  545 non-null    object
 5   prefarea         545 non-null    object
dtypes: object(6)
memory usage: 25.7+ KB


In [36]:
df_housing[['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']].astype('bool').info(memory_usage=bool)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   mainroad         545 non-null    bool 
 1   guestroom        545 non-null    bool 
 2   basement         545 non-null    bool 
 3   hotwaterheating  545 non-null    bool 
 4   airconditioning  545 non-null    bool 
 5   prefarea         545 non-null    bool 
dtypes: bool(6)
memory usage: 3.3 KB
