# Basic functions in the Pandas library

In [1]:
import pandas as pd
import numpy as np

# Build the sample frame used throughout the notebook: one row per customer.
df1 = pd.DataFrame(
    data=dict(
        customer=['Allianz', 'Generali', 'Union'],
        clients=[120000, 200000, 150000],
        color=['blue', 'orange', 'red'],
        value=[2000000, 15000000, 3000000],
    )
)

# Plain-text rendering of the whole frame.
print(df1)

   customer  clients   color     value
0   Allianz   120000    blue   2000000
1  Generali   200000  orange  15000000
2     Union   150000     red   3000000


## Built-in functions

In [2]:
# Distinct values found in the 'customer' column, returned as an array.
print(df1['customer'].unique())

['Allianz' 'Generali' 'Union']


In [3]:
# Column labels of the frame, as a pandas Index.
print(df1.columns)

Index(['customer', 'clients', 'color', 'value'], dtype='object')


In [4]:
# Arithmetic mean of the 'value' column.
print(df1['value'].mean())

6666666.666666667


In [5]:
# Smallest and largest entry of 'value', computed in one aggregation and
# printed with a blank line between them.
print(*df1['value'].agg(['min', 'max']), sep="\n\n")

2000000

15000000


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns; the text columns 'customer' and 'color' are left out.
print(df1.describe())

             clients         value
count       3.000000  3.000000e+00
mean   156666.666667  6.666667e+06
std     40414.518843  7.234178e+06
min    120000.000000  2.000000e+06
25%    135000.000000  2.500000e+06
50%    150000.000000  3.000000e+06
75%    175000.000000  9.000000e+06
max    200000.000000  1.500000e+07


In [7]:
# First rows of the frame; df1 has only three rows, so all of them are shown.
print(df1.head())

   customer  clients   color     value
0   Allianz   120000    blue   2000000
1  Generali   200000  orange  15000000
2     Union   150000     red   3000000


### Let's see it as a pretty-formatted table

Use **display()** instead of print():

In [9]:
# Last rows of the frame, rendered as a rich table by display() instead of
# the plain text produced by print().
display(df1.tail())

Unnamed: 0,customer,clients,color,value
0,Allianz,120000,blue,2000000
1,Generali,200000,orange,15000000
2,Union,150000,red,3000000


### Histogram

In [63]:
# Draw 10 integers from 0..6 (upper bound 7 is exclusive). A fixed seed makes
# the counts below identical on every "Restart Kernel -> Run All".
s = pd.Series(np.random.default_rng(42).integers(0, 7, size=10))
# Number of occurrences of each distinct value, most frequent first.
s.value_counts()

6    4
5    3
0    2
1    1
dtype: int64

## User-defined functions

### Conditional filtering

In [11]:
# Keep only the rows whose 'value' is below 15,000,000 (boolean-mask selection).
newdf = df1.loc[df1['value'] < 15000000]
display(newdf)

Unnamed: 0,customer,clients,color,value
0,Allianz,120000,blue,2000000
2,Union,150000,red,3000000


In [26]:
# Combine two row conditions. Use the element-wise `&` operator (a single
# ampersand, not Python's `and`) and wrap each comparison in parentheses,
# because `&` binds more tightly than `<` and `!=`.
newdf = df1.loc[(df1['value'] < 15000000) & (df1['color'] != 'blue')]
display(newdf)

Unnamed: 0,customer,clients,color,value
2,Union,150000,red,3000000


### Another pretty-print solution

In [27]:
# Configure IPython so that every top-level expression in a cell displays its
# value, not only the last one. The setting applies to all cells run afterwards.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [28]:
# Two bare expressions in one cell: both frames are rendered because
# ast_node_interactivity is set to "all".
df1
newdf

Unnamed: 0,customer,clients,color,value
0,Allianz,120000,blue,2000000
1,Generali,200000,orange,15000000
2,Union,150000,red,3000000


Unnamed: 0,customer,clients,color,value
2,Union,150000,red,3000000


## Define a custom function

In [29]:
def profit(s):
    """Return half of ``s``.

    ``s`` may be any object that supports multiplication by a float
    (a plain number, or a pandas Series for element-wise scaling).
    """
    share = 0.5
    return s * share

In [30]:
# Call profit() on every element of the 'value' column; the result is a new
# float Series and df1 itself is not modified.
df1['value'].apply(profit)

0    1000000.0
1    7500000.0
2    1500000.0
Name: value, dtype: float64

In [31]:
# Re-display df1 to check whether the apply() call above modified it.
df1

Unnamed: 0,customer,clients,color,value
0,Allianz,120000,blue,2000000
1,Generali,200000,orange,15000000
2,Union,150000,red,3000000


The original df1 has not been changed: apply() returns a new Series instead of modifying the column in place.

In [33]:
# apply() also accepts built-in functions: here len() gives the number of
# characters in each customer name.
clength = df1['customer'].apply(len)
clength

0    7
1    8
2    5
Name: customer, dtype: int64

In [34]:
# Select only the two numeric columns; passing a list of labels returns a DataFrame.
df2 = df1[['clients', 'value']]
df2

Unnamed: 0,clients,value
0,120000,2000000
1,200000,15000000
2,150000,3000000


In [35]:
# applymap() calls profit() once for every individual cell of the frame.
# NOTE: pandas 2.1 deprecated DataFrame.applymap in favour of DataFrame.map.
df3 = df2.applymap(profit)
df3

Unnamed: 0,clients,value
0,60000.0,1000000.0
1,100000.0,7500000.0
2,75000.0,1500000.0


### Using lambdas

Same as profit(), but written as a lambda expression.

In [36]:
# Element-wise halving again, this time with an anonymous function in place
# of a named one.
# NOTE: pandas 2.1 deprecated DataFrame.applymap in favour of DataFrame.map.
df4 = df2.applymap(lambda x: x*0.5)
df4

Unnamed: 0,clients,value
0,60000.0,1000000.0
1,100000.0,7500000.0
2,75000.0,1500000.0


### Difference between apply() and applymap()

In [37]:
def col_sum(co):
    """Return the total of all items in the iterable ``co``.

    Raises TypeError when ``co`` is not iterable (e.g. a single int).
    """
    total = sum(co)
    return total

# apply() hands each whole column (a Series) to col_sum, so summing works and
# the result holds one total per column.
df5 = df2.apply(col_sum)
df5

# applymap() hands col_sum one scalar at a time, and sum() cannot iterate over
# a single int. The TypeError is the point of this demonstration, so it is
# caught and printed instead of being left to abort "Run All".
try:
    df6 = df2.applymap(col_sum)
except TypeError as err:
    print(f"TypeError: {err}")

clients      470000
value      20000000
dtype: int64

TypeError: 'int' object is not iterable

**apply() passes a Series object** to the function, so col_sum calculates the sum of that Series. But **applymap() passes the data element-wise**, and we cannot calculate the sum of a single _int_. This is the reason for the error above.

## Sort and group DataFrames

In [38]:
# Rows ordered by 'value', ascending; this returns a sorted copy and leaves
# df1 in its original order.
df1.sort_values(by="value")

Unnamed: 0,customer,clients,color,value
0,Allianz,120000,blue,2000000
2,Union,150000,red,3000000
1,Generali,200000,orange,15000000


### Group by a column

In [39]:
# Raw purchase records: the customer who bought, and the amounts recorded for
# two products. One list position per purchase.
mydict = dict(
    customer=['cust1', 'cust1', 'cust2', 'cust2', 'cust3'],
    product1=[100, 200, 200, 150, 100],
    product2=[50, 80, 60, 120, 90],
)

In [40]:
# One row per purchase, labelled with an explicit string index.
purchases = pd.DataFrame(
    data=mydict,
    index=['Purchase1', 'Purchase2', 'Purchase3', 'Purchase4', 'Purchase5'],
)

In [48]:
# Show the purchases frame with its custom row labels.
purchases

Unnamed: 0,customer,product1,product2
Purchase1,cust1,100,50
Purchase2,cust1,200,80
Purchase3,cust2,200,60
Purchase4,cust2,150,120
Purchase5,cust3,100,90


In [47]:
# groupby() on its own only returns a DataFrameGroupBy object (see its repr
# in the output); nothing is aggregated yet.
purchases.groupby('customer')
# Calling an aggregation such as describe() produces per-customer statistics
# for each product column.
purchases.groupby('customer').describe()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000027034360910>

Unnamed: 0_level_0,product1,product1,product1,product1,product1,product1,product1,product1,product2,product2,product2,product2,product2,product2,product2,product2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
customer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
cust1,2.0,150.0,70.710678,100.0,125.0,150.0,175.0,200.0,2.0,65.0,21.213203,50.0,57.5,65.0,72.5,80.0
cust2,2.0,175.0,35.355339,150.0,162.5,175.0,187.5,200.0,2.0,90.0,42.426407,60.0,75.0,90.0,105.0,120.0
cust3,1.0,100.0,,100.0,100.0,100.0,100.0,100.0,1.0,90.0,,90.0,90.0,90.0,90.0,90.0


In [53]:
# Keep the grouping in a variable so several aggregations can reuse it.
grouped_data = purchases.groupby('customer')
# Mean of each product column per customer.
grouped_data.mean()
# Standard deviation per customer; cust3 has a single purchase, so its
# result is NaN.
grouped_data.std()

Unnamed: 0_level_0,product1,product2
customer,Unnamed: 1_level_1,Unnamed: 2_level_1
cust1,150,65
cust2,175,90
cust3,100,90


Unnamed: 0_level_0,product1,product2
customer,Unnamed: 1_level_1,Unnamed: 2_level_1
cust1,70.710678,21.213203
cust2,35.355339,42.426407
cust3,,


## Save and load dataframes

In [50]:
# Write the frame to purchases.csv in the current working directory;
# index=True also stores the row labels (Purchase1..Purchase5) as the first column.
purchases.to_csv('purchases.csv', index=True)

In [51]:
# Read the file back; index_col=0 turns the first CSV column (the saved row
# labels) into the index again.
# NOTE: this rebinds `newdf`, which earlier held a filtered copy of df1.
newdf = pd.read_csv('purchases.csv', index_col=0)

In [52]:
# The reloaded frame should match `purchases` as displayed earlier.
newdf

Unnamed: 0,customer,product1,product2
Purchase1,cust1,100,50
Purchase2,cust1,200,80
Purchase3,cust2,200,60
Purchase4,cust2,150,120
Purchase5,cust3,100,90


# Some experiments

In [54]:
# A plain 2-D array built from rows that mix text and floats. NumPy has to
# pick a single dtype, so every element ends up stored as a string.
x = np.array(
    [
        ['Rex', 9.81, '80'],
        ['Fox', 11.2, '75'],
        ['Crux', 15.6, 'shit'],
    ]
)

In [55]:
# Inspect the array: the float entries now appear as strings in the repr.
x

array([['Rex', '9.81', '80'],
       ['Fox', '11.2', '75'],
       ['Crux', '15.6', 'shit']], dtype='<U4')

In [56]:
# display() on an ndarray shows the same text repr as the bare expression
# above (compare the two outputs).
display(x)

array([['Rex', '9.81', '80'],
       ['Fox', '11.2', '75'],
       ['Crux', '15.6', 'shit']], dtype='<U4')

In [58]:
# Raises AttributeError: a NumPy ndarray has no `names` attribute.
# NOTE(review): field names exist only for structured dtypes, via
# `x.dtype.names` — confirm what this experiment was meant to show.
x.names()

AttributeError: 'numpy.ndarray' object has no attribute 'names'