In [1]:
import numpy as np
import pandas as pd


In [20]:
# Small demo frame with deliberate gaps: columns A, C and D contain NaN,
# column B is complete.
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [7, 2, 3, 4, 5],
    'C': [1, 2, 3, np.nan, np.nan],
    'D': [1, np.nan, np.nan, np.nan, 5],
}
df = pd.DataFrame(data)

In [21]:
# Display the freshly built frame; missing entries are rendered as NaN.
df


Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [22]:
# Boolean mask with the same shape as df: True wherever a cell is missing.
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [23]:
# Summing the boolean mask counts the missing values in each column.
df.isna().sum()

A    1
B    0
C    2
D    3
dtype: int64

In [24]:
# Per column: does it contain at least one missing value?
df.isna().any()

A     True
B    False
C     True
D     True
dtype: bool

Removing Missing Data - works row-wise by default (rows containing missing values are dropped)

In [25]:
# Show the original frame again as the reference point for dropna().
df

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [26]:
# Drop every row that has at least one missing value; the result is a new
# frame (it is not assigned here, so df itself keeps its NaNs).
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0


In [28]:
# Keep only the rows that have at least 3 non-null values.
df.dropna(thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,
4,5.0,5,,5.0


Filling the missing data

In [29]:
# The dropna() calls above were not assigned back, so df still has its NaNs.
df

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [30]:
# Replace every missing value with the same constant (0); returns a new frame.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,0.0
2,0.0,3,3.0,0.0
3,4.0,4,0.0,0.0
4,5.0,5,0.0,5.0


In [31]:
# Column-wise fill: each key names a column, each value is the replacement
# used for that column's missing entries. B has no NaN, so its entry is unused.
values = {
    'A': 0,
    'B': 100,
    'C': 300,
    'D': 400,
}
df.fillna(value=values)

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,400.0
2,0.0,3,3.0,400.0
3,4.0,4,300.0,400.0
4,5.0,5,300.0,5.0


In [32]:
# fillna() above returned new frames without being assigned; df is unchanged.
df

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [33]:
# Mean imputation: df.mean() yields one mean per column (computed over the
# non-missing values), and fillna() uses it to fill that column's gaps.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,7,1.0,1.0
1,2.0,2,2.0,3.0
2,3.0,3,3.0,3.0
3,4.0,4,2.0,3.0
4,5.0,5,2.0,5.0


# Merging, joining and concatenation

In [36]:
import numpy as np
import pandas as pd

In [37]:
# Employee master records, one row per employee (IDs 101-104).
employee_data = pd.DataFrame(
    [
        (101, 'Alice', 'HR'),
        (102, 'Bob', 'IT'),
        (103, 'Charlie', 'Sales'),
        (104, 'David', 'Marketing'),
    ],
    columns=['Employee_ID', 'Name', 'Department'],
)

# Salary records; ID 105 has no matching row in employee_data, which is what
# makes the different merge types below produce different results.
salary_data = pd.DataFrame(
    [
        (101, 60000),
        (102, 95000),
        (103, 70000),
        (104, 105000),
        (105, 62000),
    ],
    columns=['Employee_ID', 'Salary'],
)

In [38]:
# Display the employee table (4 rows).
employee_data

Unnamed: 0,Employee_ID,Name,Department
0,101,Alice,HR
1,102,Bob,IT
2,103,Charlie,Sales
3,104,David,Marketing


In [39]:
# Display the salary table (5 rows; includes ID 105, absent from employee_data).
salary_data 

Unnamed: 0,Employee_ID,Salary
0,101,60000
1,102,95000
2,103,70000
3,104,105000
4,105,62000


In [41]:
# Inner merge: keep only the Employee_IDs present in BOTH tables (105 is dropped).
employee_data.merge(salary_data, on='Employee_ID', how='inner')

Unnamed: 0,Employee_ID,Name,Department,Salary
0,101,Alice,HR,60000
1,102,Bob,IT,95000
2,103,Charlie,Sales,70000
3,104,David,Marketing,105000


In [42]:
# Outer merge: keep the Employee_IDs from EITHER table; 105 has no employee
# record, so its Name and Department come back missing.
employee_data.merge(salary_data, on='Employee_ID', how='outer')

Unnamed: 0,Employee_ID,Name,Department,Salary
0,101,Alice,HR,60000
1,102,Bob,IT,95000
2,103,Charlie,Sales,70000
3,104,David,Marketing,105000
4,105,,,62000


In [43]:
# Left merge: keep every row of employee_data (the left table) and attach the
# salary where the Employee_ID matches; 105 exists only on the right, so it is absent.
employee_data.merge(salary_data, on='Employee_ID', how='left')

Unnamed: 0,Employee_ID,Name,Department,Salary
0,101,Alice,HR,60000
1,102,Bob,IT,95000
2,103,Charlie,Sales,70000
3,104,David,Marketing,105000


In [44]:
# Right merge: keep every row of salary_data (the right table); 105 has no
# employee record, so its Name and Department come back missing.
employee_data.merge(salary_data, on='Employee_ID', how='right')

Unnamed: 0,Employee_ID,Name,Department,Salary
0,101,Alice,HR,60000
1,102,Bob,IT,95000
2,103,Charlie,Sales,70000
3,104,David,Marketing,105000
4,105,,,62000


Concatenation

In [45]:
# Side-by-side concatenation (axis=1) aligns rows on the index, NOT on
# Employee_ID: both Employee_ID columns are kept, and index 4 exists only in
# salary_data, so the employee columns are NaN there (which also turns the
# first Employee_ID column into floats). Use a merge to combine on a key.
pd.concat([employee_data,salary_data ], axis = 1)

Unnamed: 0,Employee_ID,Name,Department,Employee_ID.1,Salary
0,101.0,Alice,HR,101,60000
1,102.0,Bob,IT,102,95000
2,103.0,Charlie,Sales,103,70000
3,104.0,David,Marketing,104,105000
4,,,,105,62000


Joining 2 data frames

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Two single-column frames whose indexes only partly overlap (labels 2 and 3
# are shared); join() below aligns on these index labels.
df1 = pd.DataFrame(
    data={'name': ['Alice', 'Bob', 'Charlie']},
    index=[1, 2, 3],
)

df2 = pd.DataFrame(
    data={'score': [85, 90, 75]},
    index=[2, 3, 4],
)

In [6]:
# join() matches rows by index label and defaults to a left join: the result
# keeps df2's index (2, 3, 4); label 4 has no match in df1, so name is missing.
df2.join(df1)

Unnamed: 0,score,name
2,85,Bob
3,90,Charlie
4,75,


In [7]:
# Outer join on the index: keep the union of both indexes (1-4) and leave
# gaps where one side has no row for that label.
df1.join(df2, how='outer')

Unnamed: 0,name,score
1,Alice,
2,Bob,85.0
3,Charlie,90.0
4,,75.0
