<a href="https://colab.research.google.com/github/dubeyshubham786/Python_libraries/blob/main/Pandas_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning Data - Casting Datatypes and Handling Missing Values

In [3]:
import pandas as pd
import numpy as np

In [14]:
# Sample records that mix real missing values (np.nan / None) with the
# placeholder strings 'NA' and 'Missing'.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    # Ages are deliberately stored as strings.
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [15]:
# Build the working frame: one column per key of the people mapping.
df = pd.DataFrame(data=people)

In [16]:
# Show the full frame before any cleaning.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [17]:
# dropna() returns a copy without the rows that contain at least one missing value.
# The strings 'NA' and 'Missing' do not count as missing, so row 6 is kept.

df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [19]:
# These are dropna()'s default arguments written out explicitly,
# so this gives the same output as the bare df.dropna() call.

df.dropna(axis='index', how='any')

# The axis argument can be set to either 'index' or 'columns'.
# With axis='index', rows that contain missing values are dropped.
# With axis='columns', columns that contain missing values are dropped instead.
# how='any' drops a row as soon as any one of its values is missing.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [20]:
# Setting how='all' drops a row only when every value in that row is missing.
# Here that removes row 4 alone, the only row with no values at all.
# With axis='columns' and how='all' nothing would be dropped,
# because no column in this frame is missing in every row.

df.dropna(axis='index', how='all')


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [21]:
# To drop rows based on missing values in particular columns only, pass subset.
# For example: drop a row when its email is missing,
# but keep it even when first, last or age is missing.
# With a single column in subset, how='all' and how='any' behave the same.

df.dropna(axis='index', how='all', subset=['email'])


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [22]:
# subset can also list several columns.
# Say we need a last name or an email address, but not necessarily both:

df.dropna(axis='index', how='all', subset=['last', 'email'])

# With how='all', a row is dropped only when both last and email are missing,
# so any row that has at least one of them is kept.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [25]:
# The frame still holds the placeholder strings 'NA' and 'Missing'.
# Swap both for real NaN in a single pass. replace() returns a new frame that is
# rebound to df; this avoids inplace=True and handles both placeholders in one call.

df = df.replace(['NA', 'Missing'], np.nan)

# 'NA' and 'Missing' are now np.nan.

In [26]:
df

# The 'NA' and 'Missing' placeholder strings are gone; those cells now hold NaN.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [27]:
# isna() returns a boolean mask with the same shape as the frame:
# True where a value is missing, False otherwise.

df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [28]:
# Sometimes missing values should be filled with a chosen value instead of dropped.
# For example, fill every missing value with the string 'MISSING'.

df.fillna('MISSING')

# Filling makes more sense with numerical data.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [31]:
# Suppose we want the average age. The ages were entered as strings,
# so the age column has dtype object rather than a numeric dtype.

df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [33]:
# Averaging fails while age still holds strings. The TypeError is the point of this
# cell, so catch and show it instead of letting it stop a "Run All".
try:
    df['age'].mean()
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: ignored

In [35]:
# Casting age to int would also make mean() possible, but astype(int) only works
# when the column has no missing values: NaN is a float and None is not a number,
# and neither can be turned into an int.
# The failure is the demonstration, so catch and show it rather than halting the notebook.
# When the cast fails the assignment never happens, so df is left unchanged.
try:
    df['age'] = df['age'].astype(int)
except (TypeError, ValueError) as err:
    # Either exception type may be raised, depending on how the missing entries
    # are stored (None vs NaN).
    print(f'{type(err).__name__}: {err}')

TypeError: ignored

In [36]:
# np.nan is a plain Python float, which is why a column holding NaN cannot be cast to int.
type(np.nan)

float

In [38]:
# One way to get a mean is to turn the missing values into 0 first,
# but those zeros would then be counted as real ages in the average.
# It is much better to cast age to float, which can hold NaN.

df['age'] = df['age'].astype(float)

In [40]:
# Check the dtypes again.

df.dtypes

# age is now float64, while the other columns are still object.

first     object
last      object
email     object
age      float64
dtype: object

In [42]:
# With a numeric dtype the average can now be computed; the NaN entries are skipped.

df['age'].mean()

46.75

In [42]:
# If an entire DataFrame holds numbers (or otherwise needs a single type),
# astype can be applied to the whole frame at once, like this: df.astype(<dtype>),
# passing the target type to it. Our frame has mixed column types, so we won't do that here.

## Now let's look at the IMDb dataset

In [2]:
# Colab-only: mount Google Drive at /content/gdrive/ so files stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [43]:
# Load the IMDb CSV from the mounted Drive.
# Note that this rebinds df, replacing the small people frame used above.
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Github/Pandas/imdb_data.csv')

In [50]:
# Peek at the first two rows to see which columns are available.
df.head(2)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435


In [57]:
# Average runtime over the dataset. runtime appears to be loaded as a float column
# already (e.g. 93.0 in the preview), so no cast is needed first.
df['runtime'].mean()

107.85657104736491

In [59]:
# Inspect the type of a stored release_date value. type('release_date') would only
# report the type of the column-name literal itself, never of the column's data.
type(df['release_date'].iloc[0])

str