# Pandas
Import library

In [4]:
import pandas as pd

Create dataframe by dict

In [9]:
frame = pd.DataFrame({'numbers':range(10), 'chars':['a']*10})

In [10]:
frame

Unnamed: 0,chars,numbers
0,a,0
1,a,1
2,a,2
3,a,3
4,a,4
5,a,5
6,a,6
7,a,7
8,a,8
9,a,9


Create dataframe by data from file

In [19]:
frame = pd.read_csv('dataset.tsv', header=0, sep='\t')

In [20]:
frame

Unnamed: 0,Name,Birth,City,Position
0,Ivanov A.A.,22.03.1980,Moscow,
1,Sorokin I.V.,07.08.1965,Volgograd,Engineer
2,Белов М.М.,13.02.1980,Rostov,Manager
3,Мельникова Д.С.,15.04.1985,Rostov,
4,Rybina E.P.,19.11.1985,Moscow,Engineer
5,Kostrov C.O.,31.05.1985,Moscow,Intern


Get frame columns

In [21]:
frame.columns

Index(['Name', 'Birth', 'City', 'Position'], dtype='object')

Get dataframe size (rows and cols)

In [22]:
frame.shape

(6, 4)

Add new line to data frame

In [23]:
new_line = {'Name':'Perov', 'Birth':'22.03.1990', 'City':'Penza'}
frame = frame.append(new_line, ignore_index=True)

In [24]:
frame

Unnamed: 0,Name,Birth,City,Position
0,Ivanov A.A.,22.03.1980,Moscow,
1,Sorokin I.V.,07.08.1965,Volgograd,Engineer
2,Белов М.М.,13.02.1980,Rostov,Manager
3,Мельникова Д.С.,15.04.1985,Rostov,
4,Rybina E.P.,19.11.1985,Moscow,Engineer
5,Kostrov C.O.,31.05.1985,Moscow,Intern
6,Perov,22.03.1990,Penza,


Add column to data frame

In [25]:
frame['IsStudent'] = [False]*5 + [True]*2

In [26]:
frame

Unnamed: 0,Name,Birth,City,Position,IsStudent
0,Ivanov A.A.,22.03.1980,Moscow,,False
1,Sorokin I.V.,07.08.1965,Volgograd,Engineer,False
2,Белов М.М.,13.02.1980,Rostov,Manager,False
3,Мельникова Д.С.,15.04.1985,Rostov,,False
4,Rybina E.P.,19.11.1985,Moscow,Engineer,False
5,Kostrov C.O.,31.05.1985,Moscow,Intern,True
6,Perov,22.03.1990,Penza,,True


Delete lines from data frame

In [27]:
frame = frame.drop([5,6], axis=0)

In [28]:
frame

Unnamed: 0,Name,Birth,City,Position,IsStudent
0,Ivanov A.A.,22.03.1980,Moscow,,False
1,Sorokin I.V.,07.08.1965,Volgograd,Engineer,False
2,Белов М.М.,13.02.1980,Rostov,Manager,False
3,Мельникова Д.С.,15.04.1985,Rostov,,False
4,Rybina E.P.,19.11.1985,Moscow,Engineer,False


Delete columns from data frame

In [29]:
frame.drop('IsStudent', axis=1, inplace=True)

Write changed dataset to file

In [30]:
frame.to_csv('updated_dataset.csv', sep=',', header=True, index=False)

Get file content via Bash

In [31]:
!cat updated_dataset.csv

Name,Birth,City,Position
Ivanov A.A.,22.03.1980,Moscow,
Sorokin I.V.,07.08.1965,Volgograd,Engineer
Белов М.М.,13.02.1980,Rostov,Manager
Мельникова Д.С.,15.04.1985,Rostov,
Rybina E.P.,19.11.1985,Moscow,Engineer


Data frame colums types

In [33]:
frame.dtypes

Name        object
Birth       object
City        object
Position    object
dtype: object

Change column type

In [34]:
frame.Birth = frame.Birth.apply(pd.to_datetime)

In [36]:
frame.dtypes

Name                object
Birth       datetime64[ns]
City                object
Position            object
dtype: object

Data frame info

In [37]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 4 columns):
Name        5 non-null object
Birth       5 non-null datetime64[ns]
City        5 non-null object
Position    3 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 200.0+ bytes


Fill missed values with default value

In [40]:
frame.fillna('Missed', inplace=True)

In [41]:
frame

Unnamed: 0,Name,Birth,City,Position
0,Ivanov A.A.,1980-03-22,Moscow,Missed
1,Sorokin I.V.,1965-07-08,Volgograd,Engineer
2,Белов М.М.,1980-02-13,Rostov,Manager
3,Мельникова Д.С.,1985-04-15,Rostov,Missed
4,Rybina E.P.,1985-11-19,Moscow,Engineer


In [42]:
frame.Position

0      Missed
1    Engineer
2     Manager
3      Missed
4    Engineer
Name: Position, dtype: object

In [43]:
frame[['Position']]

Unnamed: 0,Position
0,Missed
1,Engineer
2,Manager
3,Missed
4,Engineer


In [44]:
frame[['Name', 'Position']]

Unnamed: 0,Name,Position
0,Ivanov A.A.,Missed
1,Sorokin I.V.,Engineer
2,Белов М.М.,Manager
3,Мельникова Д.С.,Missed
4,Rybina E.P.,Engineer


Select first N rows

In [45]:
frame[:3]

Unnamed: 0,Name,Birth,City,Position
0,Ivanov A.A.,1980-03-22,Moscow,Missed
1,Sorokin I.V.,1965-07-08,Volgograd,Engineer
2,Белов М.М.,1980-02-13,Rostov,Manager


Select last N rows

In [46]:
frame[-3:]

Unnamed: 0,Name,Birth,City,Position
2,Белов М.М.,1980-02-13,Rostov,Manager
3,Мельникова Д.С.,1985-04-15,Rostov,Missed
4,Rybina E.P.,1985-11-19,Moscow,Engineer


In [47]:
frame.loc[[0,1,2], ["Name", "City"]]

Unnamed: 0,Name,City
0,Ivanov A.A.,Moscow
1,Sorokin I.V.,Volgograd
2,Белов М.М.,Rostov


In [49]:
frame.iloc[[1,3], [0,1]]

Unnamed: 0,Name,Birth
1,Sorokin I.V.,1965-07-08
3,Мельникова Д.С.,1985-04-15


In [51]:
frame.ix[[0,1], ["Name", "City"]]

Unnamed: 0,Name,City
0,Ivanov A.A.,Moscow
1,Sorokin I.V.,Volgograd


In [52]:
frame.ix[[0,1,2], [0,1]]

Unnamed: 0,Name,Birth
0,Ivanov A.A.,1980-03-22
1,Sorokin I.V.,1965-07-08
2,Белов М.М.,1980-02-13


Selects

In [53]:
frame[frame.Birth >= pd.datetime(1985,1,1)]

Unnamed: 0,Name,Birth,City,Position
3,Мельникова Д.С.,1985-04-15,Rostov,Missed
4,Rybina E.P.,1985-11-19,Moscow,Engineer


In [54]:
frame[(frame.Birth >= pd.datetime(1985,1,1)) &
      (frame.City != 'Moscow')]

Unnamed: 0,Name,Birth,City,Position
3,Мельникова Д.С.,1985-04-15,Rostov,Missed


In [55]:
frame[(frame.Birth >= pd.datetime(1985,1,1)) |
      (frame.City == 'Volgograd')]

Unnamed: 0,Name,Birth,City,Position
1,Sorokin I.V.,1965-07-08,Volgograd,Engineer
3,Мельникова Д.С.,1985-04-15,Rostov,Missed
4,Rybina E.P.,1985-11-19,Moscow,Engineer
