# Polars (первая часть)

Импорт модуля

In [1]:
import polars as pl

Чтение датасета

In [2]:
# Read train.csv from the working directory into a Polars DataFrame.
df = pl.read_csv("train.csv")

Информация о датасете

In [3]:
# Convert the Polars frame to pandas and print its .info() summary
# (column dtypes, non-null counts, memory estimate).
(
    df
    .to_pandas()
    .info()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Создание описательных статистик

In [4]:
# Convert the Polars frame to pandas and show descriptive statistics
# of its numeric columns.
(
    df
    .to_pandas()
    .describe()
)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Количество пассажиров каждого класса

In [5]:
# Number of passengers in each ticket class, ordered by class.
# The non-null PassengerId count is named 'Count' directly via alias.
(
    df
    .group_by('Pclass')
    .agg(pl.col('PassengerId').count().alias('Count'))
    .sort('Pclass')
)

Pclass,Count
i64,u32
1,216
2,184
3,491


Количество выживших мужчин и женщин

In [6]:
# Number of survivors (Survived == 1) per sex.
# group_by does not guarantee the order of the resulting groups, so the
# result is sorted explicitly to make the displayed table reproducible
# between runs.
(
    df
    .filter(pl.col('Survived') == 1)
    .group_by('Sex')
    .agg(pl.col('PassengerId').count().alias('Count'))
    .sort('Count')
)

Sex,Count
str,u32
"""male""",109
"""female""",233


Пассажиры старше 44 лет

In [7]:
# Keep only the rows where Age is strictly greater than 44.
df.filter(pl.col('Age').gt(44))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
7,0,1,"""McCarthy, Mr. …","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
12,1,1,"""Bonnell, Miss.…","""female""",58.0,0,0,"""113783""",26.55,"""C103""","""S"""
16,1,2,"""Hewlett, Mrs. …","""female""",55.0,0,0,"""248706""",16.0,,"""S"""
34,0,2,"""Wheadon, Mr. E…","""male""",66.0,0,0,"""C.A. 24579""",10.5,,"""S"""
53,1,1,"""Harper, Mrs. H…","""female""",49.0,1,0,"""PC 17572""",76.7292,"""D33""","""C"""
55,0,1,"""Ostby, Mr. Eng…","""male""",65.0,0,1,"""113509""",61.9792,"""B30""","""C"""
63,0,1,"""Harris, Mr. He…","""male""",45.0,1,0,"""36973""",83.475,"""C83""","""S"""
93,0,1,"""Chaffee, Mr. H…","""male""",46.0,1,0,"""W.E.P. 5734""",61.175,"""E31""","""S"""
95,0,3,"""Coxon, Mr. Dan…","""male""",59.0,0,0,"""364500""",7.25,,"""S"""
97,0,1,"""Goldschmidt, M…","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""


# Ускорение работы с Pandas (вторая часть)

Установка пакета

In [116]:
!pip install bottleneck



Импорт библиотек

In [23]:
import numpy as np
import pandas as pd
import bottleneck as bn

Чтение датасета 

In [22]:
# Read train.csv again, this time as a pandas DataFrame; this rebinds `df`,
# which held the Polars frame in the first part.
df = pd.read_csv("train.csv")

Средний возраст и стандартное отклонение 

In [24]:
# NaN-aware mean and standard deviation of Age computed with bottleneck.
# NOTE: bn.nanstd is called with its default ddof, so the printed value
# (14.516...) differs from the sample std reported by describe() (14.526...).
age_mean = bn.nanmean(df['Age'])
age_std = bn.nanstd(df['Age'])

print(f'Средний возраст пассажиров\t {age_mean}')
print(f'Стандартное отклонение возраста\t {age_std}')

Средний возраст пассажиров	 29.69911764705882
Стандартное отклонение возраста	 14.516321150817317


Создание нового столбца Fare_new

In [11]:
# Fare multiplier per ticket class, computed for the whole column at once:
# 1.3 for classes below 3, 1.1 for class 3, and 0.0 for any other value.
fare_factor = np.select(
    [df['Pclass'] < 3, df['Pclass'] == 3],
    [1.3, 1.1],
    default=0.0,
)

# Make the cell safe to re-run: remove a previously inserted 'Fare_new'
# instead of allowing a duplicate column to be added.
if 'Fare_new' in df.columns:
    df = df.drop(columns='Fare_new')

# Place the new column at position 10, i.e. right after 'Fare'.
df.insert(10, 'Fare_new', df['Fare'] * fare_factor)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Fare_new,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,7.97500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,92.66829,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,8.71750,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,69.03000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,8.85500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,16.90000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,39.00000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,25.79500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,39.00000,C148,C


# Оптимизация типов Pandas (третья часть)

Чтение датасета

In [12]:
# Read Housing.csv from the working directory; `df` now refers to the
# housing table instead of the passenger table used above.
df = pd.read_csv("Housing.csv")

Просмотр данных и общей информации о них

In [13]:
# Display the housing frame as the cell output.
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


Получение краткой информации

In [14]:
# Print column dtypes, non-null counts and the memory estimate before the
# dtype optimisation.
# NOTE(review): the "+" in the reported memory usage marks a lower bound for
# frames with object columns; df.info(memory_usage='deep') would give the
# exact figure.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


Сводка показателей

In [15]:
# Descriptive statistics of the numeric columns; the min/max values are what
# the dtype choices in the next section are based on.
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


Оптимизация использования памяти для различных типов данных:

* Чтобы сократить потребление памяти, строковые столбцы с небольшим числом уникальных значений (mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea, furnishingstatus) рекомендуется преобразовать в тип category.

* Для числовых параметров, учитывая их физический смысл, следует использовать беззнаковые типы, так как они не могут принимать отрицательные значения.

* Параметры bedrooms, bathrooms, stories, parking являются целочисленными и ограничены в пределах [0;6], поэтому для них рекомендуется использовать тип uint8.

* Значения параметра area также являются целочисленными и ограничены в пределах [1650;16200], поэтому для их хранения рекомендуется использовать тип uint16.

* Значения параметра price также являются целочисленными и ограничены в пределах [1.75e+6;1.33e+7], поэтому для их хранения рекомендуется использовать тип uint32.

Оптимизация хранения данных

In [16]:
# Target dtype for every column: the smallest unsigned integer type that
# covers the observed value range for numeric columns, and 'category' for
# the string columns.
compact_dtypes = {
    'price': 'uint32',
    'area': 'uint16',
    **dict.fromkeys(
        ['bedrooms', 'bathrooms', 'stories', 'parking'],
        'uint8',
    ),
    **dict.fromkeys(
        [
            'mainroad', 'guestroom', 'basement', 'hotwaterheating',
            'airconditioning', 'prefarea', 'furnishingstatus',
        ],
        'category',
    ),
}

# Convert the columns one by one, assigning each back into the same frame.
for column_name, target_dtype in compact_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

In [17]:
# Print the summary again to check the new dtypes and the memory estimate
# after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   price             545 non-null    uint32  
 1   area              545 non-null    uint16  
 2   bedrooms          545 non-null    uint8   
 3   bathrooms         545 non-null    uint8   
 4   stories           545 non-null    uint8   
 5   mainroad          545 non-null    category
 6   guestroom         545 non-null    category
 7   basement          545 non-null    category
 8   hotwaterheating   545 non-null    category
 9   airconditioning   545 non-null    category
 10  parking           545 non-null    uint8   
 11  prefarea          545 non-null    category
 12  furnishingstatus  545 non-null    category
dtypes: category(7), uint16(1), uint32(1), uint8(4)
memory usage: 10.0 KB


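В результате оптимизации типов потребление памяти сократилось примерно в 5,5 раза (с 55.5+ КБ до 10.0 КБ). Знак «+» в исходной оценке означает, что pandas не учитывает фактический размер строк в столбцах типа object, поэтому реальный выигрыш ещё больше; точные значения даёт `df.info(memory_usage='deep')`.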