In [44]:
import pandas as pd
# Demo frame used throughout the notebook: repeated keys (k1), numbers (col1)
# and two-letter state codes (col2). Rows 2 and 3 are identical, which the
# drop_duplicates() cell further down relies on.
df_one = pd.DataFrame(
    data={
        'k1': ['A', 'A', 'B', 'B', 'C', 'C'],
        'col1': [100, 200, 300, 300, 400, 500],
        'col2': ['NY', 'CA', 'WA', 'WA', 'AK', 'NV'],
    }
)

In [45]:
df_one

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
3,B,300,WA
4,C,400,AK
5,C,500,NV


## Information on Unique Values 

In [46]:
df_one['col2'].unique()

array(['NY', 'CA', 'WA', 'AK', 'NV'], dtype=object)

In [47]:
df_one['k1'].unique()

array(['A', 'B', 'C'], dtype=object)

In [48]:
df_one['k1'].nunique()

3

In [49]:
df_one['col2'].value_counts()

WA    2
NY    1
CA    1
AK    1
NV    1
Name: col2, dtype: int64

In [50]:
df_one

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
3,B,300,WA
4,C,400,AK
5,C,500,NV


In [51]:
df_one.drop_duplicates()

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
4,C,400,AK
5,C,500,NV


### Creating New Columns with Operations and Functions

We already know we can easily create new columns through basic arithmetic operations:

In [52]:
df_one

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
3,B,300,WA
4,C,400,AK
5,C,500,NV


In [53]:
# Arithmetic on a column is applied to every row at once: each col1 value is
# multiplied by 30 and the result is stored as a new column on df_one.
df_one['New col'] = df_one['col1']*30

In [54]:
df_one

Unnamed: 0,k1,col1,col2,New col
0,A,100,NY,3000
1,A,200,CA,6000
2,B,300,WA,9000
3,B,300,WA,9000
4,C,400,AK,12000
5,C,500,NV,15000


But we can also create new columns by applying any custom function we want. As you can imagine, the function can be as complex as we need, which gives us great flexibility.

Step 1: Define the function that will operate on every row entry in a column

In [55]:
def grab_firs_letter(state):
    """Return the first element of ``state`` (the leading character of a string).

    The value is read by indexing, so ``state`` must be subscriptable and
    non-empty: an empty string raises IndexError and a non-subscriptable
    value such as an int raises TypeError.
    """
    leading = state[0]
    return leading

In [56]:
grab_firs_letter('NY')

'N'

In [57]:
df_one['col2'].apply(grab_firs_letter)

0    N
1    C
2    W
3    W
4    A
5    N
Name: col2, dtype: object

In [58]:
# Run grab_firs_letter on every value of col2 and keep the results as a new column.
df_one['first letter']=df_one['col2'].apply(grab_firs_letter)

In [59]:
df_one

Unnamed: 0,k1,col1,col2,New col,first letter
0,A,100,NY,3000,N
1,A,200,CA,6000,C
2,B,300,WA,9000,W
3,B,300,WA,9000,W
4,C,400,AK,12000,A
5,C,500,NV,15000,N


In [60]:
def complex_letter(state):
    """Label a value according to its first element.

    Returns 'Washington' when ``state[0]`` equals "W" and 'Error' for every
    other value. Only the first element is inspected, so any value starting
    with "W" gets the 'Washington' label. ``state`` must support indexing;
    passing an int raises TypeError.
    """
    starts_with_w = state[0] == "W"
    return 'Washington' if starts_with_w else 'Error'

In [61]:
# Apply complex_letter to every value of col2; the returned labels
# ('Washington' or 'Error') become the new 'State Check' column.
df_one['State Check'] = df_one['col2'].apply(complex_letter)

In [62]:
df_one

Unnamed: 0,k1,col1,col2,New col,first letter,State Check
0,A,100,NY,3000,N,Error
1,A,200,CA,6000,C,Error
2,B,300,WA,9000,W,Washington
3,B,300,WA,9000,W,Washington
4,C,400,AK,12000,A,Error
5,C,500,NV,15000,N,Error


In [63]:
# WATCH OUT FOR DATA TYPE ERRORS!
# You can't index numbers: col1 holds ints and complex_letter evaluates state[0].
# The failure is the point of this cell, so the TypeError is caught and its
# message printed instead of letting the exception abort a Restart & Run All.
try:
    df_one['col1'].apply(complex_letter)
except TypeError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

TypeError: 'int' object is not subscriptable

# Mapping

In [65]:
df_one['k1']

0    A
1    A
2    B
3    B
4    C
5    C
Name: k1, dtype: object

In [66]:
df_one

Unnamed: 0,k1,col1,col2,New col,first letter,State Check
0,A,100,NY,3000,N,Error
1,A,200,CA,6000,C,Error
2,B,300,WA,9000,W,Washington
3,B,300,WA,9000,W,Washington
4,C,400,AK,12000,A,Error
5,C,500,NV,15000,N,Error


In [67]:
# Translate each k1 label to a number through a lookup dict (A -> 1, B -> 2, C -> 3)
# and store the result in a new column.
df_one['new']=df_one['k1'].map({'A':1, 'B':2 , 'C':3})

In [68]:
df_one

Unnamed: 0,k1,col1,col2,New col,first letter,State Check,new
0,A,100,NY,3000,N,Error,1
1,A,200,CA,6000,C,Error,1
2,B,300,WA,9000,W,Washington,2
3,B,300,WA,9000,W,Washington,2
4,C,400,AK,12000,A,Error,3
5,C,500,NV,15000,N,Error,3


In [71]:
df_one['col1'].max()

500

In [72]:
df_one['col1'].idxmax()

5

In [70]:
df_one['col1'].min()

100

In [73]:
df_one['col1'].idxmin()

0

### Get column and index names:

In [74]:
df_one.columns

Index(['k1', 'col1', 'col2', 'New col', 'first letter', 'State Check', 'new'], dtype='object')

In [75]:
df_one.index

RangeIndex(start=0, stop=6, step=1)

In [78]:
# Rename all columns in one go: one new name per existing column, in order
# (df_one has seven columns at this point). This changes df_one itself, so
# every later cell must use the new names (e.g. 'C1', 'c3').
df_one.columns = ['C1','c2','c3','c4','c5','c6','C7']

In [79]:
df_one

Unnamed: 0,C1,c2,c3,c4,c5,c6,C7
0,A,100,NY,3000,N,Error,1
1,A,200,CA,6000,C,Error,1
2,B,300,WA,9000,W,Washington,2
3,B,300,WA,9000,W,Washington,2
4,C,400,AK,12000,A,Error,3
5,C,500,NV,15000,N,Error,3


### Sorting and Ordering a DataFrame:


In [80]:
df_one

Unnamed: 0,C1,c2,c3,c4,c5,c6,C7
0,A,100,NY,3000,N,Error,1
1,A,200,CA,6000,C,Error,1
2,B,300,WA,9000,W,Washington,2
3,B,300,WA,9000,W,Washington,2
4,C,400,AK,12000,A,Error,3
5,C,500,NV,15000,N,Error,3


In [81]:
df_one.sort_values('c3')

Unnamed: 0,C1,c2,c3,c4,c5,c6,C7
4,C,400,AK,12000,A,Error,3
1,A,200,CA,6000,C,Error,1
5,C,500,NV,15000,N,Error,3
0,A,100,NY,3000,N,Error,1
2,B,300,WA,9000,W,Washington,2
3,B,300,WA,9000,W,Washington,2


# Combining DataFrames

In [82]:
# Two small frames with the same number of rows (five each), used below to
# show column-wise concatenation.
features = pd.DataFrame(
    data={
        'A': [100, 200, 300, 400, 500],
        'B': [12, 13, 14, 15, 16],
    }
)
predictions = pd.DataFrame(data={'pred': [0, 1, 1, 0, 1]})

In [83]:
features

Unnamed: 0,A,B
0,100,12
1,200,13
2,300,14
3,400,15
4,500,16


In [84]:
predictions

Unnamed: 0,pred
0,0
1,1
2,1
3,0
4,1


In [88]:
pd.concat([features,predictions],axis=1)

Unnamed: 0,A,B,pred
0,100,12,0
1,200,13,1
2,300,14,1
3,400,15,0
4,500,16,1


In [89]:
pd.concat([predictions,features],axis=1)

Unnamed: 0,pred,A,B
0,0,100,12
1,1,200,13
2,1,300,14
3,0,400,15
4,1,500,16


In [90]:
df_one

Unnamed: 0,C1,c2,c3,c4,c5,c6,C7
0,A,100,NY,3000,N,Error,1
1,A,200,CA,6000,C,Error,1
2,B,300,WA,9000,W,Washington,2
3,B,300,WA,9000,W,Washington,2
4,C,400,AK,12000,A,Error,3
5,C,500,NV,15000,N,Error,3


In [91]:
df_one['C1']

0    A
1    A
2    B
3    B
4    C
5    C
Name: C1, dtype: object

In [92]:
# One-hot encode C1: one indicator column per distinct label (A, B, C).
# dtype=int pins the indicators to 0/1 integers; without it, newer pandas
# releases return booleans (True/False) instead of the 0/1 shown below.
pd.get_dummies(df_one['C1'], dtype=int)

Unnamed: 0,A,B,C
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,0,1
5,0,0,1
