In [None]:
import pandas as pd
import numpy as np

def load_csv(file_path):
    """Read the CSV at ``file_path`` with pandas and return the DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def handle_missing_values(df):
    """
    Drop sparse columns and fill the remaining gaps.

    Columns in which more than 50% of the values are missing are dropped.
    Missing values in the remaining numeric columns are replaced with the
    column mean; missing values in object columns are replaced with the
    column's most frequent value.

    Parameters:
    - df (pd.DataFrame): The data to clean. It is not modified.

    Returns:
    - pd.DataFrame: A new DataFrame with sparse columns removed and gaps filled.
    """
    # dropna(thresh=...) keeps a column only when it has at least that many
    # non-missing values; half the row count, rounded up, means "at least 50%".
    min_non_missing = (len(df) + 1) // 2
    # Copy so the column assignments below never write into a view of the input.
    df = df.dropna(thresh=min_non_missing, axis=1).copy()

    # Fill numerical columns with the mean.
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: that form modifies a temporary Series and is not guaranteed
    # to reach the DataFrame.
    for col in df.select_dtypes(include=np.number).columns:
        df[col] = df[col].fillna(df[col].mean())

    # Fill categorical columns with the mode
    for col in df.select_dtypes(include='object').columns:
        modes = df[col].mode()
        if modes.empty:
            # No non-missing value exists, so there is nothing to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df

def remove_duplicates(df):
    """Return ``df`` without repeated rows (first occurrence is kept)."""
    deduplicated = df.drop_duplicates(keep='first')
    return deduplicated

def standardize_data(df):
    """
    Standardize column names and string values of the DataFrame.

    Column names that are strings are converted to lowercase; other labels
    (for example integer positions) are left unchanged. In every object
    column, string values have leading and trailing whitespace removed;
    non-string values in those columns are left unchanged.

    Parameters:
    - df (pd.DataFrame): The data to standardize. It is modified in place.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in.
    """
    # Convert all column names to lowercase.
    # The isinstance guard keeps non-string labels instead of failing on them.
    df.columns = df.columns.map(
        lambda name: name.lower() if isinstance(name, str) else name
    )

    # Strip leading and trailing whitespace from string columns.
    # Element-wise map is used instead of the .str accessor, which replaces
    # non-string values in a mixed column with NaN.
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    return df

def convert_data_types(df, date_columns=('date',)):
    """
    Convert the listed columns to datetime.

    Parameters:
    - df (pd.DataFrame): The data to convert. It is modified in place.
    - date_columns (str or iterable of str): Name(s) of the columns to parse
      as datetimes. Names that are not present in the DataFrame are skipped.
      Defaults to the single column 'date'.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(date_columns, str):
        date_columns = (date_columns,)

    for col in date_columns:
        if col in df.columns:
            # errors='coerce' turns unparseable values into NaT instead of raising.
            df[col] = pd.to_datetime(df[col], errors='coerce')

    return df

def handle_outliers(df, iqr_multiplier=1.5):
    """
    Remove rows containing outliers, using the IQR method on numeric columns.

    For each numeric column, a row is removed when its value lies below
    Q1 - iqr_multiplier * IQR or above Q3 + iqr_multiplier * IQR. Columns are
    processed one after another, so the quartiles of a column are computed
    on the rows that survived the filters of the columns before it.

    Parameters:
    - df (pd.DataFrame): The data to filter.
    - iqr_multiplier (float): How many IQRs beyond the quartiles a value may
      lie before it counts as an outlier. Defaults to 1.5.

    Returns:
    - pd.DataFrame: The rows of ``df`` that were not flagged as outliers.
    """
    for col in df.select_dtypes(include=np.number).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
        df = df[~is_outlier]

    return df

def save_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, leaving out the row index."""
    df.to_csv(path_or_buf=file_path, index=False)

def main(input_file, output_file):
    """Load ``input_file``, run every cleaning step in order, and save the result to ``output_file``."""
    # Each step takes a DataFrame and returns the DataFrame for the next step.
    cleaning_steps = (
        handle_missing_values,
        remove_duplicates,
        standardize_data,
        convert_data_types,
        handle_outliers,
    )

    cleaned = load_csv(input_file)
    for step in cleaning_steps:
        cleaned = step(cleaned)
    save_csv(cleaned, output_file)

if __name__ == "__main__":
    # Paths of the input CSV file and of the cleaned CSV file to write.
    input_file, output_file = 'input.csv', 'cleaned_data.csv'
    main(input_file, output_file)


To check if a CSV file has no column names, you can load the first row of the file and inspect its content. If the content of the first row looks like data rather than column names, then you can conclude that the CSV file has no column names.

Here is a function in Python that checks if a CSV file has column names using the Pandas library:

In [None]:
import pandas as pd

def has_column_names(file_path):
    """
    Guess whether the first row of the CSV file contains column names.

    The first row is treated as data (not a header) when it contains a
    repeated value or a value that can be parsed as a number. Empty cells
    are ignored by both checks.

    Parameters:
    - file_path (str): The path to the CSV file.

    Returns:
    - bool: True if the CSV file has column names, False otherwise
      (including when the file is empty).
    """
    try:
        # header=None keeps the first line as a data row. With the default
        # header handling pandas renames repeated names ("x", "x" becomes
        # "x", "x.1"), which would hide the duplicates checked below.
        # dtype=str and na_filter=False keep every cell as the text in the file.
        first_row = pd.read_csv(
            file_path, header=None, nrows=1, dtype=str, na_filter=False
        )
    except pd.errors.EmptyDataError:
        # Nothing to parse, so there cannot be a header row.
        return False

    if first_row.empty:
        return False

    cells = [cell for cell in first_row.iloc[0].tolist() if cell != ""]

    # Repeated values suggest the first row contains data rather than column names.
    if len(set(cells)) < len(cells):
        return False

    # A value that parses as a number is likely data rather than a column name.
    for cell in cells:
        try:
            float(cell)
        except ValueError:
            continue
        return False

    return True

# Usage example: report whether the placeholder file starts with a header row.
file_path = 'your_file.csv'
print(
    "The CSV file has column names."
    if has_column_names(file_path)
    else "The CSV file does not have column names."
)


Handling a CSV file that has row names (i.e., an index column) and potentially no column names requires careful reading and manipulation of the data. Here's how you can handle such a file using Python's Pandas library.

Steps:
1. Read the CSV File: Read the file into a DataFrame, specifying whether the first column should be used as the row index.
2. Check for Column Names: Check if the first row contains column names or data.
3. Process Accordingly: Adjust the DataFrame based on whether column names are present or not.
Example Code
Here is a step-by-step example to handle a CSV file with row names and potentially no column names:

import pandas as pd

def read_csv_with_row_names(file_path, index_col=0):
    """
    Read a CSV file with row names and check if it has column names.

    The first row is inspected as raw text, leaving out the index column(s).
    It is treated as data (not a header) when it contains a repeated value
    or a value that can be parsed as a number; empty cells are ignored by
    both checks. When no header is detected, the columns are named
    'Column_0', 'Column_1', ...

    Parameters:
    - file_path (str): The path to the CSV file. It is read twice, so it must
      be re-readable (a path rather than a one-shot stream).
    - index_col (int or str): Column to use as the row labels of the DataFrame.

    Returns:
    - pd.DataFrame: DataFrame with appropriate handling of row names and column names.
    """
    # header=None keeps the first line as a data row. With the default header
    # handling pandas renames repeated names ("x", "x" becomes "x", "x.1"),
    # which would hide the duplicates checked below. dtype=str and
    # na_filter=False keep every cell as the text in the file.
    raw_first_row = pd.read_csv(
        file_path, header=None, nrows=1, dtype=str, na_filter=False
    )
    first_cells = raw_first_row.iloc[0].tolist() if len(raw_first_row) else []

    # Leave the row-label column(s) out of the header check. bool is excluded
    # explicitly because it is a subclass of int.
    selectors = index_col if isinstance(index_col, (list, tuple)) else [index_col]
    index_positions = {
        s for s in selectors if isinstance(s, int) and not isinstance(s, bool)
    }
    index_labels = {s for s in selectors if isinstance(s, str)}
    cells = [
        cell
        for position, cell in enumerate(first_cells)
        if position not in index_positions
        and cell not in index_labels
        and cell != ""
    ]

    # Repeated values suggest the first row contains data rather than column names.
    header_present = len(set(cells)) == len(cells)
    if header_present:
        # A value that parses as a number is likely data rather than a column name.
        for cell in cells:
            try:
                float(cell)
            except ValueError:
                continue
            header_present = False
            break

    # Read the CSV file again, this time with the correct header setting
    if header_present:
        df = pd.read_csv(file_path, index_col=index_col)
    else:
        df = pd.read_csv(file_path, header=None, index_col=index_col)
        # Generate default column names since there were none in the file
        df.columns = [f'Column_{i}' for i, _ in enumerate(df.columns)]

    return df

# Usage example
# 'your_file.csv' is a placeholder path; point it at a real CSV file before running.
file_path = 'your_file.csv'
# index_col is left at its default of 0, so the first column supplies the row labels.
df = read_csv_with_row_names(file_path)
print(df)


In [2]:
import pandas as pd

# Load the sample CSV with pandas' default settings (first row used as the header).
data = pd.read_csv('/teamspace/studios/this_studio/data/csv-clean/sample-synthetic-healthcare.csv')
# Bare expression: as the last line of a notebook cell it displays the (rows, columns) tuple.
data.shape

(9999, 18)