In [23]:
import re
from datetime import datetime

import pandas as pd
from tqdm import tqdm
from unidecode import unidecode

In [24]:
# Read the raw CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
file_path = 'netflix.csv'
data = pd.read_csv(file_path)

In [25]:
# Text normalisation helper used on the free-text columns
def clean_text(text):
    """Stringify ``text`` and pass the result through ``unidecode``.

    Any input is converted with ``str()`` first, so non-string values are
    accepted. As a consequence a missing value (float NaN) is returned as the
    literal string ``'nan'`` instead of staying missing.
    """
    as_string = str(text)
    return unidecode(as_string)

# Whole-string M/D/YYYY pattern: month 1-12 and day 1-31 with an optional
# leading zero, year 1900-2099. [0-9] is used instead of \d so that only ASCII
# digits are accepted.
_DATE_PATTERN = re.compile(
    r'(0?[1-9]|1[0-2])/(0?[1-9]|[12][0-9]|3[01])/((?:19|20)[0-9]{2})'
)


def valid_date(date):
    """Return True if ``date`` is a real calendar date written as M/D/YYYY.

    The value is stringified and must match the pattern in full: trailing
    text (e.g. ``'1/1/2020 and more'``) and surrounding whitespace are
    rejected. Dates that match the pattern but do not exist on the calendar
    (e.g. ``'2/31/2020'``) are rejected as well.

    Parameters
    ----------
    date : any
        Value to check; converted with ``str()`` before matching, so
        ``None`` and NaN simply yield False.

    Returns
    -------
    bool
    """
    match = _DATE_PATTERN.fullmatch(str(date))
    if match is None:
        return False
    month, day, year = (int(part) for part in match.groups())
    try:
        # datetime() raises ValueError for impossible day/month combinations.
        datetime(year, month, day)
    except ValueError:
        return False
    return True

# Function to validate year format
def valid_year(year):
    """Return True if ``str(year)`` is exactly four ASCII digits.

    ``str.isdigit()`` is deliberately avoided: it also accepts superscripts
    and digits from other scripts, which are not usable as a year here.

    Parameters
    ----------
    year : any
        Value to check; converted with ``str()`` first, so an ``int`` such as
        ``2021`` is valid while a float such as ``2021.0`` is not.

    Returns
    -------
    bool
    """
    return re.fullmatch(r'[0-9]{4}', str(year)) is not None

In [26]:
# Applying text cleaning using tqdm for progress monitoring.
# Each column is rebuilt as a list and assigned back as a whole. List assignment
# is positional, so the result does not depend on `data` having a default
# 0..n-1 index (writing through `data.at[position, column]` would hit the wrong
# rows, or append new ones, once rows have been filtered out).
columns_to_clean = ['show_id', 'type', 'title', 'director', 'country', 'listed_in']
for column in columns_to_clean:
    data[column] = [
        clean_text(value)
        for value in tqdm(data[column], desc=f"Cleaning {column} Column")
    ]


def _only_digits_and_spaces(value):
    """Return True when str(value) contains nothing but digits and whitespace."""
    return all(char.isdigit() or char.isspace() for char in str(value))


def _only_numeric_symbols(value):
    """Return True when str(value) contains nothing but digits, '.', '-' or '+'."""
    return all(char.isdigit() or char in ['.', '-', '+'] for char in str(value))


# Validating 'duration', 'release_year', 'date_added', and 'rating' columns.
# A value that passes its column's check is kept untouched (original type
# included); any other value is replaced by clean_text(value).
column_validators = {
    'duration': _only_digits_and_spaces,
    'release_year': valid_year,
    'date_added': valid_date,
    'rating': _only_numeric_symbols,
}
columns_to_validate = ['duration', 'release_year', 'date_added', 'rating']
for column in columns_to_validate:
    is_valid = column_validators[column]
    data[column] = [
        value if is_valid(value) else clean_text(value)
        for value in tqdm(data[column], desc=f"Validating {column} Column")
    ]

Cleaning show_id Column: 100%|██████████| 8790/8790 [00:00<00:00, 58237.01it/s]
Cleaning type Column: 100%|██████████| 8790/8790 [00:00<00:00, 67770.69it/s]
Cleaning title Column: 100%|██████████| 8790/8790 [00:00<00:00, 59233.87it/s]
Cleaning director Column: 100%|██████████| 8790/8790 [00:00<00:00, 62378.49it/s]
Cleaning country Column: 100%|██████████| 8790/8790 [00:00<00:00, 62300.49it/s]
Cleaning listed_in Column: 100%|██████████| 8790/8790 [00:00<00:00, 63717.88it/s]
Validating duration Column: 100%|██████████| 8790/8790 [00:00<00:00, 57282.68it/s]
Validating release_year Column: 100%|██████████| 8790/8790 [00:00<00:00, 878854.16it/s]
Validating date_added Column: 100%|██████████| 8790/8790 [00:00<00:00, 474924.73it/s]
Validating rating Column: 100%|██████████| 8790/8790 [00:00<00:00, 54036.45it/s]


In [27]:
# Comment this whole cell out to keep rows whose country/director is a
# "Not Given" style placeholder.
# Cells are stripped and lower-cased before the lookup, so the mixed-case
# 'Not Given' entry below can never match on its own; 'not given' covers it.
variations_to_remove = ['', 'not given', 'notgiven', 'Not Given']


def _is_placeholder(cell):
    """Return True when the normalised cell text is one of the placeholder spellings."""
    normalised = cell.strip().lower()
    return normalised in variations_to_remove


# Filter on country first, then on director among the remaining rows.
for column in ('country', 'director'):
    placeholder_rows = data[column].apply(_is_placeholder)
    data = data[~placeholder_rows]


In [28]:
# Save the cleaned data to a new CSV file.
# index=False leaves the DataFrame index out of the file, so only the data
# columns are written.
cleaned_file_path = 'cleaned_data.csv'
data.to_csv(cleaned_file_path, index=False)