In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Small practice table; None marks a value that is deliberately missing.
data = dict(
    Name=['Ujwal', 'Singh', 'Ayush', 'Agrahari'],
    Age=[21, 21, None, 29],
    Salary=[70000, None, None, 50000],
)
df = pd.DataFrame(data)
print(df)

       Name   Age   Salary
0     Ujwal  21.0  70000.0
1     Singh  21.0      NaN
2     Ayush   NaN      NaN
3  Agrahari  29.0  50000.0


In [None]:
# Show only the rows in which no column is missing. The result is displayed
# but not assigned, so df itself is left untouched.
df.dropna(axis=0, how='any')

Unnamed: 0,Name,Gender,Passed,City
0,Aman,male,yes,Mumbai
1,Priya,female,yes,Banglore
2,Rahul,male,no,Delhi
3,Anjali,female,yes,Mumbai
4,Ravi,male,yes,Banglore
5,Meera,female,no,Delhi
6,Arjun,male,yes,Mumbai
7,Neha,female,yes,Banglore
8,Imran,male,no,Delhi
9,Sneha,female,yes,Mumbai


In [3]:
# Count of missing entries in each column.
n = df.isna().sum()
print(n)

Name      0
Age       1
Salary    2
dtype: int64


In [4]:
# Fraction of missing entries per column, scaled to a percentage.
print(100 * df.isna().mean())

Name       0.0
Age       25.0
Salary    50.0
dtype: float64


In [5]:
# Mean imputation: every gap in a numeric column is replaced by that
# column's own mean, and df is modified in place.
column_means = {col: df[col].mean() for col in ('Age', 'Salary')}
df.fillna(column_means, inplace=True)
print(df)

       Name        Age   Salary
0     Ujwal  21.000000  70000.0
1     Singh  21.000000  60000.0
2     Ayush  23.666667  60000.0
3  Agrahari  29.000000  50000.0


In [None]:
# Second example: a small categorical dataset, written row by row and then
# pivoted into the column-oriented dict that pandas consumes.
student_columns = ['Name', 'Gender', 'Passed', 'City']
student_rows = [
    ('Aman', 'male', 'yes', 'Mumbai'),
    ('Priya', 'female', 'yes', 'Banglore'),
    ('Rahul', 'male', 'no', 'Delhi'),
    ('Anjali', 'female', 'yes', 'Mumbai'),
    ('Ravi', 'male', 'yes', 'Banglore'),
    ('Meera', 'female', 'no', 'Delhi'),
    ('Arjun', 'male', 'yes', 'Mumbai'),
    ('Neha', 'female', 'yes', 'Banglore'),
    ('Imran', 'male', 'no', 'Delhi'),
    ('Sneha', 'female', 'yes', 'Mumbai'),
    ('Raj', 'male', 'yes', 'Banglore'),
]
data = {
    column: list(values)
    for column, values in zip(student_columns, zip(*student_rows))
}

df = pd.DataFrame(data)
# index=False keeps the row index out of the file.
df.to_csv('Trial.csv', index=False)
print("CSV file created successfully")

CSV file created successfully


In [10]:
df = pd.read_csv('Trial.csv')

# Encode on a copy so the raw frame stays available for the comparison printed below.
df_copy = df.copy()

# Fit a separate LabelEncoder per column. An encoder keeps only the classes
# from its most recent fit, so sharing one instance across columns would throw
# away the Gender mapping once Passed is fitted, and the Gender codes could no
# longer be mapped back with inverse_transform.
label_encoders = {}
for column in ['Gender', 'Passed']:
    le = LabelEncoder()
    df_copy[f'{column}_encoded'] = le.fit_transform(df_copy[column])
    label_encoders[column] = le

# One-hot encode City; dtype=int makes the indicator columns 0/1 integers
# instead of booleans (True/False).
df_encoded = pd.get_dummies(df_copy, columns=['City'], dtype=int)
print('*'*50)
print('\nOne-Hot Encoded (City)')
print(df_encoded.head())

# Print the untouched rows as well so the one-hot columns can be cross-checked.
print(f'Without Encoding Data: \n{df.head()}\n')

**************************************************

One-Hot Encoded (City)
     Name  Gender Passed  Gender_encoded  Passed_encoded  City_Banglore  \
0    Aman    male    yes               1               1              0   
1   Priya  female    yes               0               1              1   
2   Rahul    male     no               1               0              0   
3  Anjali  female    yes               0               1              0   
4    Ravi    male    yes               1               1              1   

   City_Delhi  City_Mumbai  
0           0            1  
1           0            0  
2           1            0  
3           0            1  
4           0            0  
Without Encoding Data: 
     Name  Gender Passed      City
0    Aman    male    yes    Mumbai
1   Priya  female    yes  Banglore
2   Rahul    male     no     Delhi
3  Anjali  female    yes    Mumbai
4    Ravi    male    yes  Banglore

