# Constructor

In [1]:
import pandas as pd

Create DF without indexes

In [8]:
df = pd.DataFrame({
    'country': ['France', 'USA', 'Poland'],
    'capital': ['Paris', 'Washington', 'Warsaw'],
    'population': [66.99, 328.2, 32.17]
})
df

Unnamed: 0,country,capital,population
0,France,Paris,66.99
1,USA,Washington,328.2
2,Poland,Warsaw,32.17


Create DF with indexes

In [9]:
df2 = pd.DataFrame({
    'country': ['France', 'USA', 'Poland'],
    'capital': ['Paris', 'Washington', 'Warsaw'],
    'population': [66.99, 328.2, 32.17]
}, index = ['FR', 'US', 'PL'])
df2

Unnamed: 0,country,capital,population
FR,France,Paris,66.99
US,USA,Washington,328.2
PL,Poland,Warsaw,32.17


Add indexes for existing DF

In [11]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [12]:
df.index = ['FR', 'US', 'PL']

In [13]:
df.index

Index(['FR', 'US', 'PL'], dtype='object')

In [14]:
df

Unnamed: 0,country,capital,population
FR,France,Paris,66.99
US,USA,Washington,328.2
PL,Poland,Warsaw,32.17


#### Get values from DF

Agruments order: **column / row**

In [15]:
df['capital']['PL']

'Warsaw'

In [17]:
df.capital.US

'Washington'

In [19]:
df.capital[0]

'Paris'

Agruments order: **row / column**

In [22]:
df.at['PL', 'capital']

'Warsaw'

In [23]:
df.iat[1, 1]

'Washington'

In [24]:
df.loc['US']

country              USA
capital       Washington
population         328.2
Name: US, dtype: object

In [25]:
df.loc['US', 'capital']

'Washington'

In [26]:
df.iloc[1, 1]

'Washington'

Get list of columns

In [27]:
df.columns

Index(['country', 'capital', 'population'], dtype='object')

# Data read

#### Read from TXT file

In [32]:
txt_df = pd.read_csv('test_data.txt', names=['id', 'city', 'population'])
txt_df

Unnamed: 0,id,city,population
0,1,Warsaw,1.86
1,2,Cracow,0.8
2,3,Wroclaw,0.67


In [37]:
cities_df = pd.read_csv('city.csv',
                     sep = ';',
                     names = ['ID', 'City', 'Code', 'District', 'Population'],
                     header = 0)
cities_df

Unnamed: 0,ID,City,Code,District,Population
0,1,Kabul,AFG,Kabol,1780000
1,2,Qandahar,AFG,Qandahar,237500
2,3,Herat,AFG,Herat,186800
3,4,Mazar-e-Sharif,AFG,Balkh,127800
4,5,Amsterdam,NLD,Noord-Holland,731200
...,...,...,...,...,...
4074,4075,Khan Yunis,PSE,Khan Yunis,123175
4075,4076,Hebron,PSE,Hebron,119401
4076,4077,Jabaliya,PSE,North Gaza,113901
4077,4078,Nablus,PSE,Nablus,100231


# Get info about DataFrame

In [3]:
cities_df = pd.read_csv('city.csv', sep = ';')
cities_df

Unnamed: 0,ID,Name,CountryCode,District,Population
0,1,Kabul,AFG,Kabol,1780000
1,2,Qandahar,AFG,Qandahar,237500
2,3,Herat,AFG,Herat,186800
3,4,Mazar-e-Sharif,AFG,Balkh,127800
4,5,Amsterdam,NLD,Noord-Holland,731200
...,...,...,...,...,...
4074,4075,Khan Yunis,PSE,Khan Yunis,123175
4075,4076,Hebron,PSE,Hebron,119401
4076,4077,Jabaliya,PSE,North Gaza,113901
4077,4078,Nablus,PSE,Nablus,100231


#### Show more data in preview: `head` and `tail`
Default: 5,
Max: 60

In [4]:
cities_df.head(10)

Unnamed: 0,ID,Name,CountryCode,District,Population
0,1,Kabul,AFG,Kabol,1780000
1,2,Qandahar,AFG,Qandahar,237500
2,3,Herat,AFG,Herat,186800
3,4,Mazar-e-Sharif,AFG,Balkh,127800
4,5,Amsterdam,NLD,Noord-Holland,731200
5,6,Rotterdam,NLD,Zuid-Holland,593321
6,7,Haag,NLD,Zuid-Holland,440900
7,8,Utrecht,NLD,Utrecht,234323
8,9,Eindhoven,NLD,Noord-Brabant,201843
9,10,Tilburg,NLD,Noord-Brabant,193238


In [5]:
cities_df.tail(10)

Unnamed: 0,ID,Name,CountryCode,District,Population
4069,4070,Chitungwiza,ZWE,Harare,274912
4070,4071,Mount Darwin,ZWE,Harare,164362
4071,4072,Mutare,ZWE,Manicaland,131367
4072,4073,Gweru,ZWE,Midlands,128037
4073,4074,Gaza,PSE,Gaza,353632
4074,4075,Khan Yunis,PSE,Khan Yunis,123175
4075,4076,Hebron,PSE,Hebron,119401
4076,4077,Jabaliya,PSE,North Gaza,113901
4077,4078,Nablus,PSE,Nablus,100231
4078,4079,Rafah,PSE,Rafah,92020


In [7]:
len(cities_df)

4079

In [8]:
cities_df.shape

(4079, 5)

In [9]:
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4079 entries, 0 to 4078
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           4079 non-null   int64 
 1   Name         4079 non-null   object
 2   CountryCode  4079 non-null   object
 3   District     4075 non-null   object
 4   Population   4079 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 159.5+ KB


Select all records with no district

In [10]:
cities_df[cities_df['District'].isna()]

Unnamed: 0,ID,Name,CountryCode,District,Population
3284,3285,Taiping,TWN,,165524
3292,3293,Taliao,TWN,,115897
3293,3294,Kueishan,TWN,,112195
3562,3563,Ciudad Losada,VEN,,134501


Get stats for DataFrame

In [11]:
cities_df.describe()

Unnamed: 0,ID,Population
count,4079.0,4079.0
mean,2040.0,350468.2
std,1177.650203,723775.7
min,1.0,42.0
25%,1020.5,114789.0
50%,2040.0,167051.0
75%,3059.5,310638.5
max,4079.0,10500000.0


Get unique values

In [18]:
cities_df.CountryCode.unique()

array(['AFG', 'NLD', 'ANT', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA',
       'ATG', 'ARE', 'ARG', 'ARM', 'ABW', 'AUS', 'AZE', 'BHS', 'BHR',
       'BGD', 'BRB', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BIH',
       'BWA', 'BRA', 'GBR', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'CYM',
       'CHL', 'COK', 'CRI', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV',
       'ERI', 'ESP', 'ZAF', 'ETH', 'FLK', 'FJI', 'PHL', 'FRO', 'GAB',
       'GMB', 'GEO', 'GHA', 'GIB', 'GRD', 'GRL', 'GLP', 'GUM', 'GTM',
       'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'SJM', 'IDN', 'IND',
       'IRQ', 'IRN', 'IRL', 'ISL', 'ISR', 'ITA', 'TMP', 'AUT', 'JAM',
       'JPN', 'YEM', 'JOR', 'CXR', 'YUG', 'KHM', 'CMR', 'CAN', 'CPV',
       'KAZ', 'KEN', 'CAF', 'CHN', 'KGZ', 'KIR', 'COL', 'COM', 'COG',
       'COD', 'CCK', 'PRK', 'KOR', 'GRC', 'HRV', 'CUB', 'KWT', 'CYP',
       'LAO', 'LVA', 'LSO', 'LBN', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX',
       'ESH', 'MAC', 'MDG', 'MKD', 'MWI', 'MDV', 'MYS', 'MLI', 'MLT',
       'MAR', 'MHL',

In [19]:
cities_df.CountryCode.unique().size

232

In [21]:
cities_df.columns

Index(['ID', 'Name', 'CountryCode', 'District', 'Population'], dtype='object')

Group, sort and calculate values

In [22]:
cities_df.CountryCode.value_counts()

CHN    363
IND    341
USA    274
BRA    250
JPN    248
      ... 
GIB      1
GRD      1
PNG      1
PLW      1
LUX      1
Name: CountryCode, Length: 232, dtype: int64

Get the same in %

In [23]:
cities_df.CountryCode.value_counts(normalize=True)

CHN    0.088992
IND    0.083599
USA    0.067173
BRA    0.061290
JPN    0.060799
         ...   
GIB    0.000245
GRD    0.000245
PNG    0.000245
PLW    0.000245
LUX    0.000245
Name: CountryCode, Length: 232, dtype: float64

# DataFrame options & settings

In [24]:
pd.get_option('display.max_rows')

60

In [27]:
pd.set_option('display.max_rows', 100)

In [29]:
cities_df.head(100)

Unnamed: 0,ID,Name,CountryCode,District,Population
0,1,Kabul,AFG,Kabol,1780000
1,2,Qandahar,AFG,Qandahar,237500
2,3,Herat,AFG,Herat,186800
3,4,Mazar-e-Sharif,AFG,Balkh,127800
4,5,Amsterdam,NLD,Noord-Holland,731200
5,6,Rotterdam,NLD,Zuid-Holland,593321
6,7,Haag,NLD,Zuid-Holland,440900
7,8,Utrecht,NLD,Utrecht,234323
8,9,Eindhoven,NLD,Noord-Brabant,201843
9,10,Tilburg,NLD,Noord-Brabant,193238


In [30]:
pd.get_option('min_rows')

10

In [31]:
pd.set_option('min_rows', 20)

In [32]:
cities_df

Unnamed: 0,ID,Name,CountryCode,District,Population
0,1,Kabul,AFG,Kabol,1780000
1,2,Qandahar,AFG,Qandahar,237500
2,3,Herat,AFG,Herat,186800
3,4,Mazar-e-Sharif,AFG,Balkh,127800
4,5,Amsterdam,NLD,Noord-Holland,731200
5,6,Rotterdam,NLD,Zuid-Holland,593321
6,7,Haag,NLD,Zuid-Holland,440900
7,8,Utrecht,NLD,Utrecht,234323
8,9,Eindhoven,NLD,Noord-Brabant,201843
9,10,Tilburg,NLD,Noord-Brabant,193238


Set default values

In [34]:
pd.reset_option('all')

  pd.reset_option('all')
  pd.reset_option('all')
: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.

  pd.reset_option('all')


Show more symbols in a column

In [39]:
pd.get_option('max_colwidth')

50

In [41]:
pd.set_option('max_colwidth', 100)

Show less numbers after a comma

In [50]:
pd.get_option('display.precision')

6

In [49]:
pd.set_option('display.precision', 6)

Show more columns

In [58]:
pd.get_option('display.max_columns')

20

In [57]:
pd.set_option('display.max_columns', 30)

Generate dummy data

In [61]:
pd.get_dummies(list(range(11)), prefix='column')

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10
0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,1,0
