### Loading Libraries

In [1]:
import numpy as np
import pandas as pd

### 1. Create DataFrame

In [3]:
# Column-oriented sample records: each key will become a DataFrame column
# and each list holds that column's values, one entry per row.
data = dict(
    roll_no=[3, 2, 7, 11],
    ppr_id=[34, 21, 10, 11],
    marks=[30, 23, 17, 27],
)

In [4]:
# Echo the dict to check its contents before it is turned into a DataFrame.
data

{'roll_no': [3, 2, 7, 11],
 'ppr_id': [34, 21, 10, 11],
 'marks': [30, 23, 17, 27]}

In [8]:
# Build a DataFrame from the dict: keys become column names,
# lists become the column values.
df1 = pd.DataFrame(data=data)

In [10]:
# Show the frame: no index was supplied, so the rows carry the default labels 0-3.
df1

Unnamed: 0,roll_no,ppr_id,marks
0,3,34,30
1,2,21,23
2,7,10,17
3,11,11,27


### 2. Setting Index

In [13]:
# Same data, but with explicit string row labels in place of the
# default integer index (one label per row, in order).
df2 = pd.DataFrame(
    data=data,
    index=['ab', 'ef', 'xy', 'uv'],
)

In [15]:
# Display the frame; the rows are now labelled 'ab', 'ef', 'xy', 'uv'.
df2

Unnamed: 0,roll_no,ppr_id,marks
ab,3,34,30
ef,2,21,23
xy,7,10,17
uv,11,11,27


### 3. Extracting Info

In [18]:
# .loc looks rows up by index label; a single label returns that row
# as a Series whose entries are keyed by column name.
df2.loc['xy']

roll_no     7
ppr_id     10
marks      17
Name: xy, dtype: int64

In [20]:
# Last column, selected by position: every row (:) of column -1.

df2.iloc[:, -1]

ab    30
ef    23
xy    17
uv    27
Name: marks, dtype: int64

In [22]:
# Positional selection of a sub-table: the first two rows and the
# third column only (the list selector keeps the result a DataFrame).

df2.iloc[:2, [2]]

Unnamed: 0,marks
ab,30
ef,23


## Working on CSV

### 1. Loading Data

In [26]:
# Read the iris measurements from a CSV file; the relative path means the
# file must sit in the kernel's current working directory.
df = pd.read_csv('iris_dataset.csv')


In [28]:
# Preview the first rows; head() with no argument returns five.

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [30]:
# view of data
# NOTE(review): this cell repeats the previous df.head() call verbatim; consider removing it.

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [32]:
# Last ten rows of the frame.
df.tail(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
140,6.7,3.1,5.6,2.4
141,6.9,3.1,5.1,2.3
142,5.8,2.7,5.1,1.9
143,6.8,3.2,5.9,2.3
144,6.7,3.3,5.7,2.5
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


### 2. Data Info

In [35]:
# Summarise the frame: row count, column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


### 3. Data Description

In [38]:
# Preview the first six rows.
df.head(6)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4


In [40]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### 4. Data Selection

In [45]:
# Single brackets with one column name return a pandas Series;
# head(10) keeps its first ten entries.

df['sepal_width'].head(10)

0    3.5
1    3.0
2    3.2
3    3.1
4    3.6
5    3.9
6    3.4
7    3.4
8    2.9
9    3.1
Name: sepal_width, dtype: float64

In [47]:
# Double brackets (a list of column names) return a pandas DataFrame,
# even for a single column; head(10) keeps its first ten rows.

df[['sepal_width']].head(10)

Unnamed: 0,sepal_width
0,3.5
1,3.0
2,3.2
3,3.1
4,3.6
5,3.9
6,3.4
7,3.4
8,2.9
9,3.1


In [49]:
# Pick the two width columns by name (all rows), then preview the first five rows.
df.loc[:, ['sepal_width', 'petal_width']].head()

Unnamed: 0,sepal_width,petal_width
0,3.5,0.2
1,3.0,0.2
2,3.2,0.2
3,3.1,0.2
4,3.6,0.2


In [51]:
# Same two columns chosen by position (1 and 3), limited to the first ten rows.
df.iloc[:, [1, 3]].head(10)

Unnamed: 0,sepal_width,petal_width
0,3.5,0.2
1,3.0,0.2
2,3.2,0.2
3,3.1,0.2
4,3.6,0.2
5,3.9,0.4
6,3.4,0.3
7,3.4,0.2
8,2.9,0.2
9,3.1,0.1


### 5. Missing Values

#### 5.1 NULL Values

In [59]:
# Same sample as before, except that the first `marks` entry is missing (NaN).
data = dict(
    roll_no=[3, 2, 7, 11],
    ppr_id=[34, 21, 10, 11],
    marks=[np.nan, 23, 17, 27],
)

In [61]:
# Rebuild the frame from the new dict; the NaN makes `marks` a float column.
df1 = pd.DataFrame(data=data)
df1

Unnamed: 0,roll_no,ppr_id,marks
0,3,34,
1,2,21,23.0
2,7,10,17.0
3,11,11,27.0


#### 5.2 isnull()

In [64]:
# Detection of NULL values: the result has the same shape as df1,
# with True wherever a value is missing.

df1.isnull()

Unnamed: 0,roll_no,ppr_id,marks
0,False,False,True
1,False,False,False
2,False,False,False
3,False,False,False


In [68]:
# Number of NULL values per column: each True counts as 1 when summed.

df1.isnull().sum()

roll_no    0
ppr_id     0
marks      1
dtype: int64

#### 5.3 fillna()

In [27]:
# Filling NULL values: every missing entry is replaced by 20.
# fillna() returns a new frame, so df1 keeps its NaN.
df2 = df1.fillna(value=20)
df2

Unnamed: 0,roll_no,ppr_id,marks
0,3,34,20.0
1,2,21,23.0
2,7,10,17.0
3,11,11,27.0


#### 5.4 Dropping NULL values

In [28]:
# df1 itself still holds the NaN: fillna() returned a new frame instead of editing df1.
df1

Unnamed: 0,roll_no,ppr_id,marks
0,3,34,
1,2,21,23.0
2,7,10,17.0
3,11,11,27.0


In [29]:
# Dropping NULL values: with no arguments, dropna() removes every row that
# contains at least one NULL and returns the result as a new frame.

a = df1.dropna()
a

Unnamed: 0,roll_no,ppr_id,marks
1,2,21,23.0
2,7,10,17.0
3,11,11,27.0


In [70]:
# df1 is unchanged by dropna(); the row with the missing mark is still present.
df1

Unnamed: 0,roll_no,ppr_id,marks
0,3,34,
1,2,21,23.0
2,7,10,17.0
3,11,11,27.0


In [72]:
# Row-wise drop spelled out: 'index' is the named form of axis 0,
# so every row containing a NULL is removed.
a = df1.dropna(axis='index')
a

Unnamed: 0,roll_no,ppr_id,marks
1,2,21,23.0
2,7,10,17.0
3,11,11,27.0


In [74]:
# Column-wise drop: 'columns' is the named form of axis 1,
# so every column containing a NULL is removed (here, `marks`).
a = df1.dropna(axis='columns')
a

Unnamed: 0,roll_no,ppr_id
0,3,34
1,2,21
2,7,10
3,11,11


#### 5.5 Creating data with non-NULL values

In [81]:
# Boolean mask over `marks`: True where a value is present, False where it is NULL.
a = df1['marks'].notnull()
a

0    False
1     True
2     True
3     True
Name: marks, dtype: bool

In [79]:
# Keep only the rows whose mask entry is True, i.e. rows where `marks` is not NULL.
df1.loc[a]

Unnamed: 0,roll_no,ppr_id,marks
1,2,21,23.0
2,7,10,17.0
3,11,11,27.0


### 6. Statistics

In [87]:
# Fresh sample for the statistics section: `marks` is complete this time,
# while `ppr_id` carries the missing value (NaN).
data = dict(
    roll_no=[3, 3, 7, 11],
    ppr_id=[23, np.nan, 10, 11],
    marks=[30, 22, 17, 27],
)

# Keys become the column names, in insertion order.
df1 = pd.DataFrame(data=data)

In [89]:
# `ppr_id` is shown as float because it contains NaN; the other columns stay integer.
df1

Unnamed: 0,roll_no,ppr_id,marks
0,3,23.0,30
1,3,,22
2,7,10.0,17
3,11,11.0,27


In [91]:
# Total marks: sum of the `marks` column over all rows.
df1['marks'].sum()

96

In [93]:
# Average marks: arithmetic mean of the `marks` column.
df1['marks'].mean()

24.0

In [95]:
# Cumulative sum: running total of `marks` in row order.
df1['marks'].cumsum()

0    30
1    52
2    69
3    96
Name: marks, dtype: int64

In [97]:
# Count of non-null entries in the column (missing values would not be counted).

df1['marks'].count()

4

In [99]:
# Smallest value in `marks`.
df1['marks'].min()

17

In [101]:
# Smallest `ppr_id`; the NaN entry is skipped by default, and the result is a
# float because the column is float.
df1['ppr_id'].min()

10.0

In [103]:
# Largest value in `marks`.
df1['marks'].max()

30

In [105]:
# Sample variance of `marks`: pandas divides by n - 1 by default (ddof=1).
df1['marks'].var()

32.666666666666664

In [107]:
# Sample standard deviation of `marks`: square root of the variance, also with ddof=1.
df1['marks'].std()

5.715476066494082

In [109]:
# Pairwise Pearson correlation between the columns. Missing values are excluded
# pair by pair, so the NaN in `ppr_id` only affects the entries involving `ppr_id`.
df1.corr()

Unnamed: 0,roll_no,ppr_id,marks
roll_no,1.0,-0.829396,-0.060914
ppr_id,-0.829396,1.0,0.727698
marks,-0.060914,0.727698,1.0
