<a href="https://colab.research.google.com/github/engmohamedsalah/AIMaster-Training/blob/master/DataPrep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Reading
Objective: Implement a function that detects the file type from the file path alone and reads the data from formats such as CSV, Excel, and JSON.
Tools: Use Pandas for efficient data importing.

In [None]:
import pandas as pd

def read_data(file_path):
    """
    Detects the file format from the path's extension and reads it into a Pandas DataFrame.

    The extension check is case-insensitive, and path-like objects
    (for example ``pathlib.Path``) are accepted as well as plain strings.

    Args:
      file_path: The path to the file. Supported extensions are
        ``.csv``, ``.xlsx``, ``.xls`` and ``.json``.

    Returns:
      A Pandas DataFrame containing the data from the file, or None when the
      extension is unsupported or reading fails (the error is printed).
    """
    try:
        # Normalise once so 'DATA.CSV' and Path objects match like 'data.csv'.
        normalized = str(file_path).lower()
        if normalized.endswith('.csv'):
            return pd.read_csv(file_path)
        if normalized.endswith(('.xlsx', '.xls')):
            return pd.read_excel(file_path)
        if normalized.endswith('.json'):
            return pd.read_json(file_path)
        raise ValueError("Unsupported file format.")
    except Exception as e:
        # Deliberately best-effort: callers test the result for None
        # instead of handling reader-specific exceptions.
        print(f"Error reading data from file: {e}")
        return None

# Data Summary
Objective: Create a function to print key statistical summaries of the data, including metrics like the average and most frequent values.
Tools: Utilize NumPy and Pandas to generate these summaries.

In [None]:

import numpy as np

def summarize_data(df):
    """
    Prints key statistical summaries of the DataFrame.

    Numeric columns report the average, median and standard deviation;
    all three ignore missing values. Every other column reports its most
    frequent value and its number of distinct non-missing values.

    Args:
      df: The Pandas DataFrame to summarize. Nothing is printed when it is None.
    """
    if df is None:
        return

    print("Data Summary:")
    for column in df.columns:
        series = df[column]
        print(f"\nColumn: {column}")
        if pd.api.types.is_numeric_dtype(series):
            print(f"  Average: {np.mean(series)}")
            # Series.median() skips missing values, matching what np.mean and
            # np.std already do for a Series; np.median would yield nan instead.
            print(f"  Median: {series.median()}")
            print(f"  Standard Deviation: {np.std(series)}")
        else:
            # mode() is empty when the column holds no non-missing values.
            modes = series.mode()
            most_frequent = modes.iloc[0] if not modes.empty else None
            print(f"  Most Frequent Value: {most_frequent}")
            print(f"  Unique Values: {series.nunique()}")


# Handling missing values
Objective: Create a function for addressing missing values, offering solutions to either remove or impute them based on set strategies.
Tools: Employ methods that ensure data integrity.

In [None]:

def handle_missing_values(df, strategy='remove'):
    """
    Handles missing values in the DataFrame without modifying the input.

    Args:
      df: The Pandas DataFrame.
      strategy: The strategy for handling missing values, either 'remove'
        (drop every row containing a missing value) or 'impute' (fill numeric
        columns with their mean and other columns with their most frequent
        value). Columns with no usable fill value are left untouched.

    Returns:
      A new DataFrame with missing values handled, the original DataFrame
      when the strategy is not recognised (a message is printed), or None
      when df is None.
    """
    if df is None:
        return None

    if strategy == 'remove':
        return df.dropna()

    if strategy == 'impute':
        # Collect one fill value per column and apply them in a single call.
        # Filling through df[column].fillna(..., inplace=True) is chained
        # assignment, which does not reliably update the parent frame.
        fill_values = {}
        for column in df.columns:
            series = df[column]
            if not series.isna().any():
                continue
            if pd.api.types.is_numeric_dtype(series):
                mean_value = series.mean()
                # An all-missing column has no mean to fill with.
                if not pd.isna(mean_value):
                    fill_values[column] = mean_value
            else:
                modes = series.mode()
                # mode() is empty when every value in the column is missing.
                if not modes.empty:
                    fill_values[column] = modes.iloc[0]
        if not fill_values:
            return df.copy()
        return df.fillna(value=fill_values)

    print("Invalid strategy. Choose 'remove' or 'impute'.")
    return df


# Categorical Data encoding
Objective: Design functions for encoding categorical data, allowing their conversion into numerical formats for analysis.
Tools: Implement encoding techniques effectively.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def encode_categorical_data(df, method='label'):
    """
    Encodes categorical data in the DataFrame without modifying the input.

    Args:
      df: The Pandas DataFrame.
      method: The encoding method, either 'label' (replace each string column
        with integer codes from LabelEncoder) or 'onehot' (replace object and
        category columns with one indicator column per value, appended after
        the remaining columns).

    Returns:
      A new DataFrame with encoded categorical data, the original DataFrame
      when the method is not recognised (a message is printed), or None when
      df is None.
    """
    if df is None:
        return None

    if method == 'label':
        encoded = df.copy()
        string_columns = [
            column for column in encoded.columns
            if pd.api.types.is_string_dtype(encoded[column])
        ]
        for column in string_columns:
            encoded[column] = LabelEncoder().fit_transform(encoded[column])
        return encoded

    if method == 'onehot':
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_columns) == 0:
            # Nothing to encode; skip fitting an encoder on an empty selection.
            return df.copy()
        onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded_data = onehot_encoder.fit_transform(df[categorical_columns])
        encoded_df = pd.DataFrame(
            encoded_data,
            columns=onehot_encoder.get_feature_names_out(categorical_columns),
            # Reuse the source index: concat aligns on index labels, so a fresh
            # 0..n-1 index would misalign rows after e.g. dropna().
            index=df.index,
        )
        remaining = df.drop(categorical_columns, axis=1)
        return pd.concat([remaining, encoded_df], axis=1)

    print("Invalid encoding method. Choose 'label' or 'onehot'.")
    return df


# Wrapping all functions into a class

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


class DataProcessor:
    """Bundles the data-preparation helpers: read, summarize, clean and encode.

    The class keeps no state; every method works only on its arguments, and
    the cleaning and encoding methods return new DataFrames instead of
    modifying the ones they are given.
    """

    def __init__(self):
        """Creates a processor; there is no configuration to set up."""
        pass

    def read_data(self, file_path):
        """
        Detects the file format from the path's extension and reads it into a Pandas DataFrame.

        The extension check is case-insensitive, and path-like objects
        (for example ``pathlib.Path``) are accepted as well as plain strings.

        Args:
          file_path: The path to the file. Supported extensions are
            ``.csv``, ``.xlsx``, ``.xls`` and ``.json``.

        Returns:
          A Pandas DataFrame containing the data from the file, or None when
          the extension is unsupported or reading fails (the error is printed).
        """
        try:
            # Normalise once so 'DATA.CSV' and Path objects match like 'data.csv'.
            normalized = str(file_path).lower()
            if normalized.endswith('.csv'):
                return pd.read_csv(file_path)
            if normalized.endswith(('.xlsx', '.xls')):
                return pd.read_excel(file_path)
            if normalized.endswith('.json'):
                return pd.read_json(file_path)
            raise ValueError("Unsupported file format.")
        except Exception as e:
            # Deliberately best-effort: callers test the result for None
            # instead of handling reader-specific exceptions.
            print(f"Error reading data from file: {e}")
            return None

    def summarize_data(self, df):
        """
        Prints key statistical summaries of the DataFrame.

        Numeric columns report the average, median and standard deviation;
        all three ignore missing values. Every other column reports its most
        frequent value and its number of distinct non-missing values.

        Args:
          df: The Pandas DataFrame to summarize. Nothing is printed when it is None.
        """
        if df is None:
            return

        print("Data Summary:")
        for column in df.columns:
            series = df[column]
            print(f"\nColumn: {column}")
            if pd.api.types.is_numeric_dtype(series):
                print(f"  Average: {np.mean(series)}")
                # Series.median() skips missing values, matching what np.mean
                # and np.std already do for a Series; np.median would yield nan.
                print(f"  Median: {series.median()}")
                print(f"  Standard Deviation: {np.std(series)}")
            else:
                # mode() is empty when the column holds no non-missing values.
                modes = series.mode()
                most_frequent = modes.iloc[0] if not modes.empty else None
                print(f"  Most Frequent Value: {most_frequent}")
                print(f"  Unique Values: {series.nunique()}")

    def handle_missing_values(self, df, strategy='remove'):
        """
        Handles missing values in the DataFrame without modifying the input.

        Args:
          df: The Pandas DataFrame.
          strategy: The strategy for handling missing values, either 'remove'
            (drop every row containing a missing value) or 'impute' (fill
            numeric columns with their mean and other columns with their most
            frequent value). Columns with no usable fill value are left untouched.

        Returns:
          A new DataFrame with missing values handled, the original DataFrame
          when the strategy is not recognised (a message is printed), or None
          when df is None.
        """
        if df is None:
            return None

        if strategy == 'remove':
            return df.dropna()

        if strategy == 'impute':
            # Collect one fill value per column and apply them in a single
            # call. Filling through df[column].fillna(..., inplace=True) is
            # chained assignment, which does not reliably update the parent.
            fill_values = {}
            for column in df.columns:
                series = df[column]
                if not series.isna().any():
                    continue
                if pd.api.types.is_numeric_dtype(series):
                    mean_value = series.mean()
                    # An all-missing column has no mean to fill with.
                    if not pd.isna(mean_value):
                        fill_values[column] = mean_value
                else:
                    modes = series.mode()
                    # mode() is empty when every value in the column is missing.
                    if not modes.empty:
                        fill_values[column] = modes.iloc[0]
            if not fill_values:
                return df.copy()
            return df.fillna(value=fill_values)

        print("Invalid strategy. Choose 'remove' or 'impute'.")
        return df

    def encode_categorical_data(self, df, method='label'):
        """
        Encodes categorical data in the DataFrame without modifying the input.

        Args:
          df: The Pandas DataFrame.
          method: The encoding method, either 'label' (replace each string
            column with integer codes from LabelEncoder) or 'onehot' (replace
            object and category columns with one indicator column per value,
            appended after the remaining columns).

        Returns:
          A new DataFrame with encoded categorical data, the original
          DataFrame when the method is not recognised (a message is printed),
          or None when df is None.
        """
        if df is None:
            return None

        if method == 'label':
            encoded = df.copy()
            string_columns = [
                column for column in encoded.columns
                if pd.api.types.is_string_dtype(encoded[column])
            ]
            for column in string_columns:
                encoded[column] = LabelEncoder().fit_transform(encoded[column])
            return encoded

        if method == 'onehot':
            categorical_columns = df.select_dtypes(include=['object', 'category']).columns
            if len(categorical_columns) == 0:
                # Nothing to encode; skip fitting an encoder on an empty selection.
                return df.copy()
            onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            encoded_data = onehot_encoder.fit_transform(df[categorical_columns])
            encoded_df = pd.DataFrame(
                encoded_data,
                columns=onehot_encoder.get_feature_names_out(categorical_columns),
                # Reuse the source index: concat aligns on index labels, so a
                # fresh 0..n-1 index would misalign rows after e.g. dropna().
                index=df.index,
            )
            remaining = df.drop(categorical_columns, axis=1)
            return pd.concat([remaining, encoded_df], axis=1)

        print("Invalid encoding method. Choose 'label' or 'onehot'.")
        return df


In [None]:
# prompt: Objective: provide example of how to use your class , take object , pass the data and test every function inside it .

# Create the processor that exposes all of the data-preparation steps
processor = DataProcessor()

# Location of the example dataset (swap in your own file path)
file_path = 'sample_data/california_housing_train.csv'

# Load the file; read_data returns None when loading fails
df = processor.read_data(file_path)

# Only run the remaining steps when the data was loaded
if df is not None:
    # Quick look at the size and the first rows
    print(f"DataFrame shape: {df.shape}")
    print(df.head())

    # Per-column statistics
    processor.summarize_data(df)

    # Fill in missing values, if there are any
    df_no_missing = processor.handle_missing_values(df, strategy='impute')

    # Encode categorical data only when the expected categorical column
    # ('ocean_proximity') is present in this dataset
    if 'ocean_proximity' not in df_no_missing.columns:
        print("No categorical columns to encode.")

    else:
        df_encoded = processor.encode_categorical_data(df_no_missing, method='label')
        print("\nDataFrame with encoded categorical data:")
        print(df_encoded.head())
