In [1]:
import numpy as np
import pandas as pd


In [4]:
# Build a DataFrame from a list of rows. No labels are supplied, so pandas
# numbers both the columns and the rows starting from 0.
tech = [
    ["Spark", 2000, "30days"],
    ["pandas", 3000, "40days"],
]
df = pd.DataFrame(data=tech)
df

Unnamed: 0,0,1,2
0,Spark,2000,30days
1,pandas,3000,40days


In [5]:
# Rebuild the frame with explicit labels: one name per column and one label per row.
rowlabel = ["a", "b"]
colname = ["Course", "Fee", "Duration"]

df = pd.DataFrame(data=tech, index=rowlabel, columns=colname)


In [6]:
# Display the frame to check its column names and row labels.
df

Unnamed: 0,Course,Fee,Duration
a,Spark,2000,30days
b,pandas,3000,40days


In [14]:
# Inspect the dtype of each column.
df.dtypes

Course      object
Fee          int64
Duration    object
dtype: object

In [16]:
# Cast columns to explicit types: Fee is converted to float, while Course and
# Duration are cast with str.
types = dict(Course=str, Fee=float, Duration=str)
df = df.astype(dtype=types)
df

Unnamed: 0,Course,Fee,Duration
a,Spark,2000.0,30days
b,pandas,3000.0,40days


In [17]:
# Re-check the column dtypes after the astype() conversion.
df.dtypes

Course       object
Fee         float64
Duration     object
dtype: object

In [18]:
# Create a DataFrame from a dictionary: each key becomes a column name and
# each list supplies that column's values.
technologies = dict(
    Courses=["Spark", "PySpark", "Hadoop"],
    Fee=[20000, 25000, 26000],
    Duration=['30day', '40days', '35days'],
    Discount=[1000, 2300, 1500],
)
df2 = pd.DataFrame(data=technologies)
df2

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,20000,30day,1000
1,PySpark,25000,40days,2300
2,Hadoop,26000,35days,1500


In [20]:
# Create a DataFrame that contains missing values (a None course, a NaN fee
# and an empty-string duration) to use in the examples that follow.
# Rows are labelled 'r0'..'r7' instead of the default integer index.
technologies = {
    'Courses': ["Spark", "PySpark", "Hadoop", "Python", "Pandas", None, "Spark", "Python"],
    'Fee': [22000, 25000, 23000, 24000, np.nan, 25000, 25000, 22000],
    'Duration': ['30day', '50days', '55days', '40days', '60days', '35day', '', '50days'],
    'Discount': [1000, 2300, 1000, 1200, 2500, 1300, 1400, 1600],
}
row_labels = ['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7']
df = pd.DataFrame(technologies, index=row_labels)

df

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [21]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(8, 4)

In [22]:
# Total number of cells in the frame, i.e. rows * columns.
df.size

32

In [23]:
# Column labels of the frame, returned as a pandas Index.
df.columns

Index(['Courses', 'Fee', 'Duration', 'Discount'], dtype='object')

In [24]:
# Column names from the header as a NumPy array
# (call .tolist() on the Index instead if a plain Python list is needed).
a = df.columns.to_numpy()
a

array(['Courses', 'Fee', 'Duration', 'Discount'], dtype=object)

In [25]:
# Row labels of the frame, returned as a pandas Index.
df.index

Index(['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7'], dtype='object')

In [26]:
# Row labels as a NumPy array rather than an Index.
df.index.values

array(['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7'], dtype=object)

In [27]:
# Select a single column by name; the result is a Series.
df['Fee']

r0    22000.0
r1    25000.0
r2    23000.0
r3    24000.0
r4        NaN
r5    25000.0
r6    25000.0
r7    22000.0
Name: Fee, dtype: float64

In [28]:
# Select multiple columns by passing a list of names; the result is a DataFrame.
df[['Fee','Duration']]

Unnamed: 0,Fee,Duration
r0,22000.0,30day
r1,25000.0,50days
r2,23000.0,55days
r3,24000.0,40days
r4,,60days
r5,25000.0,35day
r6,25000.0,
r7,22000.0,50days


In [29]:
# Filter rows with a boolean mask: keep only the rows whose Fee equals 22000.
df2 = df.loc[df['Fee'].eq(22000)]
df2

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r7,Python,22000.0,50days,1600


In [31]:
# Select rows by position: everything from position 6 to the end of the frame.
df2 = df.iloc[6:]
df2

Unnamed: 0,Courses,Fee,Duration,Discount
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [32]:
# Fetch the fourth value of the Duration column by position.
# The rows are labelled 'r0'..'r7', so the integer 3 is not a label. Indexing a
# Series with a bare integer as a position is deprecated in pandas 2.x;
# .iloc states the positional intent explicitly and returns the same value.
df['Duration'].iloc[3]

'40days'

In [33]:
# Positional lookup on the column's underlying array (position 3 is the fourth row).
df["Duration"].values[3]

'40days'

In [34]:
# Element-wise arithmetic: 500 is subtracted from every Fee; a missing Fee stays NaN.
df['Fee']-500

r0    21500.0
r1    24500.0
r2    22500.0
r3    23500.0
r4        NaN
r5    24500.0
r6    24500.0
r7    21500.0
Name: Fee, dtype: float64

In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.

df.describe()

Unnamed: 0,Fee,Discount
count,7.0,8.0
mean,23714.285714,1537.5
std,1380.131119,570.557372
min,22000.0,1000.0
25%,22500.0,1150.0
50%,24000.0,1350.0
75%,25000.0,1775.0
max,25000.0,2500.0


In [38]:
# Keep only the named columns; `items` matches column labels exactly.
df2 = df.filter(items=['Courses', 'Fee'], axis='columns')
df2

Unnamed: 0,Courses,Fee
r0,Spark,22000.0
r1,PySpark,25000.0
r2,Hadoop,23000.0
r3,Python,24000.0
r4,Pandas,
r5,,25000.0
r6,Spark,25000.0
r7,Python,22000.0


In [39]:
# `items` matches labels exactly; use `like` to keep every column whose name
# contains the given substring.
df.filter(like='ration', axis='columns')

Unnamed: 0,Duration
r0,30day
r1,50days
r2,55days
r3,40days
r4,60days
r5,35day
r6,
r7,50days


In [42]:
# Substring match on column names: keeps the columns whose name contains 'ee'.
df.filter(like='ee', axis='columns')

Unnamed: 0,Fee
r0,22000.0
r1,25000.0
r2,23000.0
r3,24000.0
r4,
r5,25000.0
r6,25000.0
r7,22000.0


In [40]:
# Use `regex` to filter columns with a regular expression; the pattern 'e$'
# keeps the columns whose name ends with the character "e".
df.filter(regex='e$', axis='columns')

Unnamed: 0,Fee
r0,22000.0
r1,25000.0
r2,23000.0
r3,24000.0
r4,
r5,25000.0
r6,25000.0
r7,22000.0


Unnamed: 0,Courses,Fee,Duration,Discount


In [44]:
# Display the current contents of df.
df

Unnamed: 0,Courses,Fee,Duration,Discount
r0,Spark,22000.0,30day,1000
r1,PySpark,25000.0,50days,2300
r2,Hadoop,23000.0,55days,1000
r3,Python,24000.0,40days,1200
r4,Pandas,,60days,2500
r5,,25000.0,35day,1300
r6,Spark,25000.0,,1400
r7,Python,22000.0,50days,1600


In [50]:
# Filter rows by index label: axis='index' makes `items` match row labels.
# NOTE(review): the recorded output below lists only Courses/Fee/Duration (no
# Discount column), so it looks like this cell was last run after `df` had been
# rebound to a different frame. Re-run the notebook top to bottom to refresh it.
df2 = df.filter(items=['r3', 'r4'], axis='index')
print(df2)

Empty DataFrame
Columns: [Courses, Fee, Duration]
Index: []


In [47]:
# Fresh example frame with the default integer index (0..5), used to
# demonstrate filtering rows by index.
technologies = {
    'Courses': ["Spark", "PySpark", "Spark", "Java", "PySpark", "PHP"],
    'Fee': [22000, 25000, 23000, 24000, 26000, 27000],
    'Duration': ['30days', '50days', '30days', '60days', '35days', '30days'],
}
df = pd.DataFrame(technologies)
print(df)

   Courses    Fee Duration
0    Spark  22000   30days
1  PySpark  25000   50days
2    Spark  23000   30days
3     Java  24000   60days
4  PySpark  26000   35days
5      PHP  27000   30days


In [48]:
# pandas filter() by index:
# with axis='index', `items` is matched against the row labels, so this keeps
# the rows whose index is 3 and 5.
df2 = df.filter(items=[3, 5], axis='index')
print(df2)

  Courses    Fee Duration
3    Java  24000   60days
5     PHP  27000   30days
