# Getting Started with pandas

In [1]:
import pandas as pd
# Column-oriented input: every key becomes a column label and every list holds
# that column's values, one entry per row.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    # The label must be exactly 'year' (no stray whitespace); a key such as
    # ' year' would create a column that frame['year'] cannot find.
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,year,pop,state
0,2000,1.5,Ohio
1,2001,1.7,Ohio
2,2002,3.6,Ohio
3,2001,2.4,Nevada
4,2002,2.9,Nevada
5,2003,3.2,Nevada


### DataFrame

In [2]:
import pandas as pd
# Nested mapping: the outer keys end up as column labels and the inner keys as
# row labels; a (row, column) pair with no entry shows up as NaN.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
frame = pd.DataFrame(pop)
frame

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [7]:
# Supplying the index explicitly fixes the row labels to 2000-2003; a label with
# no entry in the nested dict (2003) yields a row of NaN.
frame = pd.DataFrame(pop, index=list(range(2000, 2004)))
frame

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [10]:
 pdata = {'Ohio': frame['Ohio'][:-1],
          'Nevada': frame['Nevada'][:2]}
frame1 = pd.DataFrame(pdata)
frame1

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,,3.6


## Dropping Entries from an Axis

### Row delete
The `drop` method returns a new object with the indicated value or values deleted from
an axis:

In [21]:
import pandas as pd
import numpy as np
# Demo frame: the integers 0-15 laid out 4x4, rows labelled by state and
# columns by ordinal name.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [3]:
# drop() returns a new frame without the 'Utah' row; axis=0 (the default)
# means the label is looked up among the row labels.
new_obj = data.drop('Utah', axis=0)
new_obj

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,12,13,14,15


In [4]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Note that `drop` returned a new object, so the original DataFrame `data` is unchanged.
#### এখন আমরা কলাম ডিলিট করতে চাই 
You can drop values from the columns by passing axis=1 or axis='columns':

In [5]:
# Same call spelled with the named axis: remove the 'two' and 'four' columns
# and display the resulting frame.
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [6]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Many functions, like drop, which modify the size or shape of a Series or DataFrame,
can manipulate an object in-place without returning a new object:

In [22]:
# With inplace=True the 'Utah' row is removed from `data` itself rather than
# from a copy, and the call displays nothing; the next cell shows the result.
data.drop('Utah', inplace = True)

In [23]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,12,13,14,15


### Indexing, Selection, and Filtering

In [36]:
# Fresh 4x4 integer frame (values 0-15) used for the boolean-mask examples.
row_major_values = np.arange(16).reshape(4, -1)
frame1 = pd.DataFrame(row_major_values,
                      index=['Ohio', 'Colorado', 'Utah', 'New York'],
                      columns=['one', 'two', 'three', 'four'])
frame1

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [26]:
frame1<5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [34]:
# Boolean-mask assignment: every cell whose value is below 5 is overwritten
# with 0; this changes frame1 itself.
frame1[frame1.lt(5)] = 0

In [35]:
frame1

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Selection with loc and iloc
Select a subset of rows and columns using either axis labels (`loc`) or integer positions (`iloc`).

In [37]:
import pandas as pd
import numpy as np
# Rebuild the 4x4 demo frame (values 0-15) so the loc/iloc examples start from
# the complete set of rows.
data = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [39]:
data.loc['Utah',['two','four']]

two      9
four    11
Name: Utah, dtype: int64

In [40]:
 data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [42]:
data.iloc[3]

one      12
two      13
three    14
four     15
Name: New York, dtype: int64

In [43]:
# Both indexing functions work with slices in addition to single labels or lists of labels:
data.loc[:'Utah','two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [44]:
# Rows where column 'three' exceeds 5, restricted to the columns up to and
# including 'three' (a label slice includes its end point).
data.loc[data['three'] > 5, :'three']

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


## Arithmetic and Data Alignment
When you are adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs.

In [2]:
import pandas as pd
import numpy as np
# Two Series whose labels only partly overlap ('a', 'c', 'e' are shared), used
# below to show how arithmetic aligns on the index.
s1 = pd.Series({'a': 7.3, 'c': -2.5, 'd': 3.4, 'e': 1.5})
s2 = pd.Series({'a': -2.1, 'c': 3.6, 'e': -1.5, 'f': 4, 'g': 3.1})

In [3]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [4]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [5]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [10]:
# 3x4 float frame holding 0.0-11.0 under the column labels 'a'-'d'.
df1 = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [8]:
# 4x5 float frame holding 0.0-19.0; it has one more row and one more column
# ('e') than df1.
df2 = pd.DataFrame(
    np.arange(20, dtype=float).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [11]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [12]:
# fill_value=0: where only one of the two frames has a value for a cell, the
# missing side is treated as 0 instead of the result becoming NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [13]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [14]:
# same as 1/df1
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [15]:
 df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


#### fill_value এই ফ্ল্যাগটার মানে হচ্ছে যদি কোন ভেলু থাকে তাহলে ঐ ভেলুটাই ডিফল্ট হিসেবে বসে যাবে আর যদি না থাকে তাহলে fill_value ফ্ল্যাগ এ যে মান টা দেওয়া থাকবে ঐটাই বসে যাবে 

### Function Application and Mapping

In [3]:
import pandas as pd
import numpy as np
# Seed NumPy's global generator so the random frame, and every result derived
# from it in the following cells, is identical on each Restart & Run All.
# Re-run the cells below after applying this so their stored outputs match.
np.random.seed(42)
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,1.742873,0.009234,0.165569
Ohio,-0.907444,0.650609,-1.190678
Texas,-0.655171,2.730932,0.522033
Oregon,0.377202,0.168659,-0.114333


In [5]:
frame.abs()

Unnamed: 0,b,d,e
Utah,1.742873,0.009234,0.165569
Ohio,0.907444,0.650609,1.190678
Texas,0.655171,2.730932,0.522033
Oregon,0.377202,0.168659,0.114333


In [7]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()
frame.apply(f)

b    2.650317
d    2.721699
e    1.712712
dtype: float64

If you pass axis='columns' to apply, the function will be invoked once per row instead:

In [8]:
frame.apply(f, axis = 'columns')

Utah      1.733639
Ohio      1.841287
Texas     3.386103
Oregon    0.491535
dtype: float64

The function passed to apply need not return a scalar value; it can also return a Series with multiple values:

In [9]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.907444,0.009234,-1.190678
max,1.742873,2.730932,0.522033


In [13]:
# Element wise operation 

def formate(x):
    """Render ``x`` as a string rounded to two decimal places.

    NOTE(review): the space flag in '% .2f' puts a leading blank in front of
    non-negative values (' 0.65' vs '-0.91') -- confirm this is intended and
    not a typo for '%.2f'.
    """
    return '% .2f' % x
frame['d'].map(formate)

Utah       0.01
Ohio       0.65
Texas      2.73
Oregon     0.17
Name: d, dtype: object

In [14]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [15]:
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [16]:
obj.mean()

2.7142857142857144

In [17]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [18]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

### Axis Indexes with Duplicate Labels

In [21]:
# Series with repeated index labels: 'a' and 'b' each appear twice.
obj = pd.Series(range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [23]:
obj.index.is_unique

False

In [24]:
obj['a']

a    0
a    1
dtype: int64

In [25]:
# Four rows of standard-normal draws under default integer column labels; the
# row labels 'a' and 'b' each appear twice.
df = pd.DataFrame(np.random.standard_normal((4, 3)), index=list('aabb'))
df

Unnamed: 0,0,1,2
a,-1.495782,-1.13382,-1.50313
a,-2.114399,0.122557,-1.090489
b,0.05966,-1.134746,2.774982
b,1.60075,1.235816,0.920051


In [27]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.05966,-1.134746,2.774982
b,1.60075,1.235816,0.920051


### 5.3 Summarizing and Computing Descriptive Statistics

In [28]:
# Small frame with deliberately missing entries (row 'c' is entirely NaN) for
# the descriptive-statistics examples.
df = pd.DataFrame(
    [[1.4, np.nan],
     [7.1, -4.5],
     [np.nan, np.nan],
     [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


Calling DataFrame’s sum method returns a Series containing column sums:

In [29]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [31]:
# Passing axis='columns' or axis=1 sums across the columns instead:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [36]:
df.mean(axis='columns', skipna=True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:

In [39]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [38]:
df.idxmax()

one    b
two    d
dtype: object

In [41]:
# Some methods, such as cumsum, are accumulations: each cell holds the running
# total of its column down to that row.
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [42]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


#### On non-numeric data, describe produces alternative summary statistics:

In [43]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [44]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

### Unique Values, Value Counts, and Membership

In [3]:
import pandas as pd
obj2 = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj2

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [4]:
obj2.unique()

array(['c', 'a', 'd', 'b'], dtype=object)