In [6]:
import os
os.chdir("../")

In [7]:

from src.movie_predictor.constants import *

In [16]:
import os
from box.exceptions import BoxValueError
import yaml
from src.movie_predictor import logger
import json
import joblib
from ensure import ensure_annotations
from box import ConfigBox
from pathlib import Path
from typing import Any
import pandas as pd
import numpy as np

def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """reads yaml file and returns
    Args:
        path_to_yaml (str): input is path
    Raises:
        ValueError: if yaml file is empty
        e: empty file
    Returns:
        ConfigBox: ConfigBox type
    """
    try:
        with open(path_to_yaml) as yaml_file:
            config_yaml = yaml.safe_load(yaml_file)
            logger.info(f"yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(config_yaml)
    except BoxValueError:
        raise ValueError("yaml file is empty")
    except Exception as e:
        raise e

@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """create list of directories
    Args:
        path_to_directories (list): list of path of directories
        ignore_log (bool, optional): ignore if multiple dirs is to be created. Defaults to False.
    """
    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if verbose:
            logger.info(f"created directory at: {path}")

@ensure_annotations
def save_json(path: Path, data: dict):
    """save json data
    Args:
        path (Path): path to json file
        data (dict): data to be saved in json file
    """
    with open(path, "w") as f:
        json.dump(data, f, indent=4)

    logger.info(f"json file saved at: {path}")

@ensure_annotations
def load_json(path: Path) -> ConfigBox:
    """load json files data
    Args:
        path (Path): path to json file
    Returns:
        ConfigBox: data as class attributes instead of dict
    """
    with open(path) as f:
        config_yaml = json.load(f)

    logger.info(f"json file loaded succesfully from: {path}")
    return ConfigBox(config_yaml)

@ensure_annotations
def save_object(file_path:str,obj):
    """
    file_path: str
    obj: Any sort of object
    """
    try:
        dir_path = os.path.dirname(file_path)
        os.makedirs(dir_path, exist_ok=True)
        with open(file_path, "wb") as file_obj:
            joblib.dump(obj, file_obj)
    except Exception as e:
        return e

@ensure_annotations
def load_bin(path: Path) -> Any:
    """load binary data
    Args:
        path (Path): path to binary file
    Returns:
        Any: object stored in the file
    """
    data = joblib.load(path)
    logger.info(f"binary file loaded from: {path}")
    return data


@ensure_annotations
def load_data(file_path: str, schema_file_path: str) -> pd.DataFrame:
    try:
        datatset_schema = read_yaml(schema_file_path)

        schema = datatset_schema[DATASET_SCHEMA_COLUMNS_KEY]

        dataframe = pd.read_csv(file_path)

        error_messgae = ""


        for column in dataframe.columns:
            if column in list(schema.keys()):
                dataframe[column].astype(schema[column])
            else:
                error_messgae = f"{error_messgae} \nColumn: [{column}] is not in the schema."
        if len(error_messgae) > 0:
            raise Exception(error_messgae)
        return dataframe

    except Exception as e:
        return e

@ensure_annotations
def save_numpy_array_data(file_path: Path, array: np.array):
    """
    Save numpy array data to file
    file_path: str location of file to save
    array: np.array data to save
    """
    try:
        dir_path = os.path.dirname(file_path)
        os.makedirs(dir_path, exist_ok=True)
        with open(file_path, 'wb') as file_obj:
            np.save(file_obj, array)
    except Exception as e:
        return e
        
@ensure_annotations
def get_size(path: Path) -> str:
    """get size in KB
    Args:
        path (Path): path of the file
    Returns:
        str: size in KB
    """
    size_in_kb = round(os.path.getsize(path)/1024)
    return f"~ {size_in_kb} KB"

In [9]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformConfig:
    root_dir: Path
    tranfored_train_dir: Path
    transormed_test_dir: Path
    preprocessed_file_path: Path

In [10]:

class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(DATA_VALIDATION_FILE)
        create_directories([self.config.artifacts_root])

    
    def get_data_transform_config(self) -> DataTransformConfig:
        config = self.config.data_transformation

        create_directories(config.root_dir)
        create_directories(config.transformed_train_dir)
        create_directories(config.transformed_test_dir)
        create_directories(config.preprocessed_dir)

        data_transform_config = DataTransformConfig(
            root_dir = config.root_dir,
            tranfored_train_dir = config.transformed_train_dir,
            transormed_test_dir = config.transformed_test_dir,
            preprocessed_file_path = config.preprocessed_object_file_name
        )
        return data_transform_config

In [11]:
from collections import namedtuple
DataIngestionArtifact = namedtuple("DataIngestionArtifact",
[ "train_file_path", "test_file_path", "is_ingested", "message"])

DataValidationArtifact = namedtuple("DataValidationArtifact",
[ "report_file_path","report_page_file_path","is_validated","message"])

In [12]:
DATASET_SCHEMA_COLUMNS_KEY = "columns"
NUMERICAL_COLUMN_KEY = "numerical_columns"
CATEGORICAL_COLUMN_KEY = "categorical_columns"
TARGET_COLUMN_KEY = "target_column"

In [14]:
datatset_schema = read_yaml(DATA_VALIDATION_FILE)
datatset_schema


ConfigBox({'columns': {'Unnamed': 'Integer', 'originalTitle': 'category', 'domestic_revenue': 'category', 'distributor': 'category', 'opening_revenue': 'category', 'opening_theaters': 'category', 'budget': 'category', 'MPAA': 'category', 'genres_x': 'category', 'release_days': 'float', 'tconst': 'category', 'titleType': 'category', 'isAdult': 'Float', 'startYear': 'Integer', 'runtimeMinutes': 'category', 'genres_y': 'category', 'averageRating': 'float', 'numVotes': 'float', 'ordering': 'float', 'nconst': 'category', 'category': 'category', 'job': 'category', 'characters': 'category', 'primaryName': 'category', 'birthYear': 'category', 'deathYear': 'catgory', 'primaryProfession': 'category', 'knownForTitles': 'categoy', 'world_revenue': 'category'}, 'numerical_columns': {'Unnamed': 'Integer', 'release_days': 'float', 'isAdult': 'Float', 'startYear': 'Integer', 'averageRating': 'float', 'numVotes': 'float', 'ordering': 'float'}, 'categorical_columns': {'originalTitle': 'category', 'domes

In [15]:
schema = datatset_schema[DATASET_SCHEMA_COLUMNS_KEY]
schema.keys()

dict_keys(['Unnamed', 'originalTitle', 'domestic_revenue', 'distributor', 'opening_revenue', 'opening_theaters', 'budget', 'MPAA', 'genres_x', 'release_days', 'tconst', 'titleType', 'isAdult', 'startYear', 'runtimeMinutes', 'genres_y', 'averageRating', 'numVotes', 'ordering', 'nconst', 'category', 'job', 'characters', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles', 'world_revenue'])

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Example DataFrame
df = pd.DataFrame({
    'col1': ['1,000','\\N', '2,000$', '3', '4$', '5',np.NAN],
    'col2': ['6', '7$','\\N', '8,000','\\N', '9', '10'],
    'col3': ['11$', '12','\\N',np.NAN,'13', '14,000', '15']
})

# Define a custom transformer class to replace `,` and `$` characters and convert object data types to float for specific columns
class ReplaceCharsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.columns] = X[self.columns].applymap(self._replace_chars)
        for col in self.columns:
            if X[col].dtype == 'O' or X[col].dtype == 'float':
                X[col].replace('\\N', np.nan, inplace=True)
                
            if col in ['col1','col2','col3','col4','col5','col']:
                X[col] = pd.to_numeric(X[col], errors='coerce')    
    
        return X
    
    def _replace_chars(self, cell):
        if isinstance(cell, str):
            cell_str = str(cell)
            cell_str = cell_str.replace(',', '').replace('$', '')
            return cell_str
        else:
            return cell

# Create a pipeline that applies the ReplaceCharsTransformer to the DataFrame for specific columns
pipeline = Pipeline([
    ('replace_chars', ReplaceCharsTransformer(columns=['col1','col2','col3'])),
    ('passthrough', 'passthrough')
])

# Apply the pipeline to the DataFrame
df = pipeline.fit_transform(df)

# Confirm the data type conversion
print(df)


     col1    col2     col3
0  1000.0     6.0     11.0
1     NaN     7.0     12.0
2  2000.0     NaN      NaN
3     3.0  8000.0      NaN
4     4.0     NaN     13.0
5     5.0     9.0  14000.0
6     NaN    10.0     15.0


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Create the dataframe
df = pd.DataFrame({
    'col1': ['1,000','\\N', '2,000$', '3', '4$', '5',np.NAN],
    'col2': ['6', '7$','\\N', '8,000','\\N', '9', '10'],
    'col3': ['11$', '12','\\N',np.NAN,'13', '14,000', '15']
})

# Convert col1 and col2 to numeric values
df['col1'] = pd.to_numeric(df['col1'].str.replace('[^\d.]', ''), errors='coerce')
df['col2'] = pd.to_numeric(df['col2'].str.replace('[^\d.]', ''), errors='coerce')

# Convert col3 to numeric values and fill missing values with 0
df['col3'] = pd.to_numeric(df['col3'].str.replace('[^\d.]', ''), errors='coerce')
df['col3'].fillna(0, inplace=True)

# Create the StratifiedShuffleSplit object
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

# Perform the stratified sampling
for train_index, test_index in split.split(df, df['col3']):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

# Print the sampled dataframes
print(strat_train_set)
print(strat_test_set)


  df['col1'] = pd.to_numeric(df['col1'].str.replace('[^\d.]', ''), errors='coerce')
  df['col2'] = pd.to_numeric(df['col2'].str.replace('[^\d.]', ''), errors='coerce')
  df['col3'] = pd.to_numeric(df['col3'].str.replace('[^\d.]', ''), errors='coerce')


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Example DataFrame
df = pd.read_csv('main.csv')

# Define a custom transformer class to replace `,` and `$` characters and convert object data types to float for specific columns
class ReplaceCharsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.columns] = X[self.columns].applymap(self._replace_chars)
        for col in self.columns:
            if X[col].dtype == 'O' or X[col].dtype == 'float':
                X[col].replace('\\N', np.nan, inplace=True)
                
            if col in ['budget','opening_theaters','world_revenue','runtimeMinutes']:
                X[col] = pd.to_numeric(X[col], errors='coerce')    
    
        return X
    
    def _replace_chars(self, cell):
        if isinstance(cell, str):
            cell_str = str(cell)
            cell_str = cell_str.replace(',', '').replace('$', '')
            return cell_str
        else:
            return cell

# Create a pipeline that applies the ReplaceCharsTransformer to the DataFrame for specific columns
pipeline = Pipeline([
    ('replace_chars', ReplaceCharsTransformer(columns=['budget','opening_theaters','world_revenue','runtimeMinutes','genres_y','job','characters','birthYear','deathYear','knownForTitles'])),
    ('passthrough', 'passthrough')
])

# Apply the pipeline to the DataFrame
df = pipeline.fit_transform(df)

# Confirm the data type conversion
df.head()


Unnamed: 0.1,Unnamed: 0,originalTitle,domestic_revenue,world_revenue,distributor,opening_revenue,opening_theaters,budget,MPAA,genres_x,...,ordering,nconst,category,job,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,Super 30,"$2,269,878",24701637,Reliance Big Pictures,"$871,256",317.0,,,"Biography,Drama",...,9,nm0618898,producer,producer,,Sajid Nadiadwala,1966.0,,"producer,writer,director",tt7518786tt2372222tt8366590tt7721946
1,1,Ad Astra,"$50,188,370",127461872,Twentieth Century Fox,"$19,001,398",3460.0,90000000.0,PG-13,"Adventure,Drama,Mystery,Sci-Fi,Thriller",...,9,nm1250070,producer,producer,,Jeremy Kleiner,,,"producer,miscellaneous",tt2024544tt1020072tt4975722tt7125860
2,2,The Art of Self-Defense,"$2,410,914",2414269,Bleecker Street Media,"$114,374",7.0,,R,"Action,Comedy,Crime,Drama,Mystery,Thriller",...,9,nm3442546,producer,producer,,Stephanie Whonsetler,,,"production_manager,miscellaneous,producer",tt6269368tt10962368tt7339248tt4595186
3,3,Welcome to Marwen,"$10,763,520",13061491,Universal Pictures,"$2,354,205",1911.0,,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",...,9,nm0823330,producer,producer,,Steve Starkey,,,"producer,assistant_director,editorial_department",tt0109830tt0118884tt1907668tt0162222
4,4,Welcome to Marwen,"$10,763,520",13061491,Universal Pictures,"$2,354,205",1911.0,,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",...,9,nm0823330,producer,producer,,Steve Starkey,,,"producer,assistant_director,editorial_department",tt0109830tt0118884tt1907668tt0162222


In [2]:

def check_new_line_null(feature):
    if '\\N' in df.values :    
        if (df[feature] == '\\N').sum()>0:
            count_n = (df[feature] == '\\N').sum().sum()
            
            print(feature,count_n)

        
for feature in df.columns:
    check_new_line_null(feature)
    

In [34]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class ReplaceCharsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.columns] = X[self.columns].applymap(self._replace_chars)
        for col in self.columns:
            if X[col].dtype == 'O' or X[col].dtype == 'float':
                X[col].replace('\\N', np.nan, inplace=True)
                
            if col in ['budget','opening_theaters','world_revenue','runtimeMinutes']:
                X[col] = pd.to_numeric(X[col], errors='coerce')    
    
        return X
    
    def _replace_chars(self, cell):
        if isinstance(cell, str):
            cell_str = str(cell)
            cell_str = cell_str.replace(',', '').replace('$', '')
            return cell_str
        else:
            return cell



In [60]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class FillMissingBudgets(TransformerMixin):
    def __init__(self, df3):
        self.d_p = dict(zip(df3['originalTitle'], df3['budget']))
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['budget'] = X['budget'].fillna(X['originalTitle'].map(self.d_p))
        return X

class FillnaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, dictionary):
        self.column = column
        self.dictionary = dictionary
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.column] = X[self.column].fillna(X['tconst'].map(self.dictionary))
        return X

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
class DataTransform:
    def __init__(self, data_transformation: DataTransformConfig,
                 data_ingestion_artifact: DataIngestionArtifact,
                 data_validation_artifact: DataValidationArtifact) -> None:
        try:
            
            self.data_transformation_config = data_transformation
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_artifact = data_validation_artifact

        except Exception as e:
            return e
        
    

    def load_data(file_path: str) -> pd.DataFrame:
        try:
            # reading the schema_validation.yaml file
            datatset_schema = read_yaml(DATA_VALIDATION_FILE)

            # extracting the columns info from the schema file
            schema = datatset_schema[DATASET_SCHEMA_COLUMNS_KEY]

            # reading the dataset 
            dataframe = pd.read_csv(file_path)
            error_messgae = ""
            for column in dataframe.columns:
                if column in list(schema.keys()):
                    dataframe[column].astype(schema[column])
                else:
                    error_messgae = f"{error_messgae} \nColumn: [{column}] is not in the schema."
            if len(error_messgae) > 0:
                raise Exception(error_messgae)
            return dataframe
        except Exception as e:
            return e 
        
    
    def get_data_transformer_object(self,dataframe)->ColumnTransformer:
            try:
                # train_file_path = self.data_ingestion_artifact.train_file_path
                # test_file_path = self.data_ingestion_artifact.test_file_path
                
                datatset_schema = read_yaml(DATA_VALIDATION_FILE)
                df1 = pd.read_excel('data__.xlsx',sheet_name='runtime')
                df2 = pd.read_excel('data__.xlsx',sheet_name='opening_theaters')
                df3 = pd.read_excel('manual_data.xlsx')
                d = dict(zip(df2['tconst'], df2['opening_theaters']))
                p = dict(zip(df1['tconst'], df1['runtimeMinutes']))
                
                                
                # Create a pipeline that applies the ReplaceCharsTransformer to the DataFrame for specific columns
                conversion_pipeline = Pipeline([
                    ('replace_chars', ReplaceCharsTransformer(columns=['budget','opening_theaters','world_revenue','runtimeMinutes','genres_y','job','characters','birthYear','deathYear','knownForTitles'])),
                    ('passthrough', 'passthrough')
                ])
                mpaa_missing_pipeline = Pipeline([
                    ('imputer', SimpleImputer(strategy='most_frequent'))
                ])
                

                manual_nan_pipeline = Pipeline([
                    ('fillna_runtime', FillnaTransformer(column='runtimeMinutes', dictionary=d)),
                    ('fillna_opening', FillnaTransformer(column='opening_theaters', dictionary=p)),
                    ('fill_budgets', FillMissingBudgets(df3))
                ])

                df = conversion_pipeline.fit_transform(dataframe)
                df['MPAA'] = mpaa_missing_pipeline.fit_transform(df[['MPAA']])

                new_df = manual_nan_pipeline.fit_transform(df)

                return new_df


                


            
            except Exception as e:
                return e    

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class ReplaceCharsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.columns] = X[self.columns].applymap(self._replace_chars)
        for col in self.columns:
            if X[col].dtype == 'O' or X[col].dtype == 'float':
                X[col].replace('\\N', np.nan, inplace=True)
                
            if col in ['budget','opening_theaters','world_revenue','runtimeMinutes']:
                X[col] = pd.to_numeric(X[col], errors='coerce')    
    
        return X
    
    def _replace_chars(self, cell):
        if isinstance(cell, str):
            cell_str = str(cell)
            cell_str = cell_str.replace(',', '').replace('$', '')
            return cell_str
        else:
            return cell



In [23]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

# Define a custom transformer that wraps the drop_duplicates method
class DropDuplicatesTransformer(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop_duplicates()

pipeline_duplicate = Pipeline([
    ('remove_duplicates', DropDuplicatesTransformer())
])

# Apply the pipeline to the dataframe
df_clean = pipeline_duplicate.fit_transform(df)

# Print the result
df_clean.head()   

Unnamed: 0.1,Unnamed: 0,originalTitle,domestic_revenue,world_revenue,distributor,opening_revenue,opening_theaters,budget,MPAA,genres_x,...,ordering,nconst,category,job,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,Super 30,"$2,269,878",24701637,Reliance Big Pictures,"$871,256",317.0,24701637.0,R,"Biography,Drama",...,9,nm0618898,producer,producer,,Sajid Nadiadwala,1966.0,,"producer,writer,director",tt7518786tt2372222tt8366590tt7721946
1,1,Ad Astra,"$50,188,370",127461872,Twentieth Century Fox,"$19,001,398",3460.0,90000000.0,PG-13,"Adventure,Drama,Mystery,Sci-Fi,Thriller",...,9,nm1250070,producer,producer,,Jeremy Kleiner,,,"producer,miscellaneous",tt2024544tt1020072tt4975722tt7125860
2,2,The Art of Self-Defense,"$2,410,914",2414269,Bleecker Street Media,"$114,374",7.0,2414269.0,R,"Action,Comedy,Crime,Drama,Mystery,Thriller",...,9,nm3442546,producer,producer,,Stephanie Whonsetler,,,"production_manager,miscellaneous,producer",tt6269368tt10962368tt7339248tt4595186
3,3,Welcome to Marwen,"$10,763,520",13061491,Universal Pictures,"$2,354,205",1911.0,13061491.0,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",...,9,nm0823330,producer,producer,,Steve Starkey,,,"producer,assistant_director,editorial_department",tt0109830tt0118884tt1907668tt0162222
4,4,Welcome to Marwen,"$10,763,520",13061491,Universal Pictures,"$2,354,205",1911.0,13061491.0,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",...,9,nm0823330,producer,producer,,Steve Starkey,,,"producer,assistant_director,editorial_department",tt0109830tt0118884tt1907668tt0162222


In [5]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

# Define a custom transformer that wraps the drop_duplicates method
class DropDuplicatesTransformer(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop_duplicates()

# Create a sample dataframe with duplicate rows
data = {'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David'],
        'Age': [25, 30, 35, 25, 40],
        'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago']}
df = pd.DataFrame(data)

# Define the pipeline with the custom transformer
pipeline = Pipeline([
    ('remove_duplicates', DropDuplicatesTransformer())
])

# Apply the pipeline to the dataframe
df_clean = pipeline.fit_transform(df)

# Print the result
print(df_clean)


      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
4    David   40      Chicago


In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
class DropDuplicatesTransformer(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop_duplicates()
class FillMissingBudgets(TransformerMixin):
    def __init__(self, df3):
        self.d_p = dict(zip(df3['originalTitle'], df3['budget']))
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['budget'] = X['budget'].fillna(X['originalTitle'].map(self.d_p))
        return X



class FillnaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, dictionary):
        self.column = column
        self.dictionary = dictionary
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.column] = X[self.column].fillna(X['tconst'].map(self.dictionary))
        return X    


class DropColumnsTransformer(TransformerMixin):
    """Custom transformer to drop columns from a Pandas DataFrame."""
    
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop
        
    def transform(self, X):
        X = X.drop(columns=self.columns_to_drop)
        return X
    
    def fit(self, X, y=None):
        return self
    
# Dictionaries to fill null values
df3 = pd.read_excel('budget.xlsx')
df1 = pd.read_excel('combine.xlsx',sheet_name='runtime')
df2 = pd.read_excel('combine.xlsx',sheet_name='opening_theaters')
d = dict(zip(df2['tconst'], df2['opening_theaters']))
p = dict(zip(df1['tconst'], df1['runtimeMinutes']))

# Define a custom transformer class to fill null values based on a dictionary
# Create a pipeline that applies the FillnaTransformer to fill null values in two columns
# Create a pipeline that applies the ReplaceCharsTransformer to the DataFrame for specific columns
pipeline_duplicate = Pipeline([
    ('remove_duplicates', DropDuplicatesTransformer())
])

# Apply the pipeline to the dataframe

conversion_pipeline = Pipeline([
                    ('replace_chars', ReplaceCharsTransformer(columns=['budget','opening_theaters','world_revenue','runtimeMinutes','genres_y','job','characters','birthYear','deathYear','knownForTitles'])),
                    ('passthrough', 'passthrough')
])
mpaa_missing_pipeline = Pipeline([
                    ('imputer', SimpleImputer(strategy='most_frequent'))
])
                
manual_nan_pipeline = Pipeline([
                    ('fillna_runtime', FillnaTransformer(column='runtimeMinutes', dictionary=p)),
                    ('fillna_opening', FillnaTransformer(column='opening_theaters', dictionary=d)),
                    ('fill_budgets', FillMissingBudgets(df3))
])
cols_drop = ['Unnamed: 0','genres_y','domestic_revenue','opening_revenue','nconst','deathYear','job','characters','birthYear','primaryProfession','knownForTitles','isAdult','titleType','tconst']
pipeline = Pipeline([
    ('drop_cols', DropColumnsTransformer(columns_to_drop=cols_drop))
])  
df = pd.read_csv('main.csv')

df_clean = pipeline_duplicate.fit_transform(df)
df_clean = conversion_pipeline.fit_transform(df_clean)
df_clean['MPAA'] = mpaa_missing_pipeline.fit_transform(df_clean[['MPAA']])

new_df = manual_nan_pipeline.fit_transform(df_clean)
transformed_df = pipeline.fit_transform(new_df)

# Print the head of the DataFrame to confirm the changes
transformed_df.head()


Unnamed: 0,originalTitle,world_revenue,distributor,opening_theaters,budget,MPAA,genres_x,release_days,startYear,runtimeMinutes,averageRating,numVotes,ordering,category,primaryName
0,Super 30,24701637,Reliance Big Pictures,317.0,24701637.0,R,"Biography,Drama",173,2019,154.0,7.9,33523,9,producer,Sajid Nadiadwala
1,Ad Astra,127461872,Twentieth Century Fox,3460.0,90000000.0,PG-13,"Adventure,Drama,Mystery,Sci-Fi,Thriller",105,2019,123.0,6.5,239699,9,producer,Jeremy Kleiner
2,The Art of Self-Defense,2414269,Bleecker Street Media,7.0,2414269.0,R,"Action,Comedy,Crime,Drama,Mystery,Thriller",173,2019,104.0,6.6,38502,9,producer,Stephanie Whonsetler
3,Welcome to Marwen,13061491,Universal Pictures,1911.0,13061491.0,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",376,2018,116.0,6.2,24570,9,producer,Steve Starkey
4,Welcome to Marwen,13061491,Universal Pictures,1911.0,13061491.0,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",376,2018,116.0,6.2,24570,9,producer,Steve Starkey


In [12]:

transformed_df.isnull().sum()

originalTitle       0
world_revenue       0
distributor         0
opening_theaters    6
budget              0
MPAA                0
genres_x            0
release_days        0
startYear           0
runtimeMinutes      0
averageRating       0
numVotes            0
ordering            0
category            0
primaryName         0
dtype: int64

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.labels_ordered_ = {}
        categorical_features = [feature for feature in X.columns if X[feature].dtype == 'O']
        for feature in categorical_features:
            labels_ordered = X.groupby([feature]).size().sort_values().index
            labels_ordered = {value: index for index, value in enumerate(labels_ordered, 0)}
            self.labels_ordered_[feature] = labels_ordered
            
        return self
    
    def transform(self, X, y=None):
        for feature, labels_ordered in self.labels_ordered_.items():
            X[feature] = X[feature].map(labels_ordered)
        return X.dropna()

# Create the pipeline
new_pipeline = Pipeline([
    ('encoder', CategoricalEncoder()),
    ('scaler', MinMaxScaler())
])

# Fit and transform the data
coded = new_pipeline.fit_transform(transformed_df)

In [5]:
coded

array([[6.25000000e-02, 8.76942783e-03, 3.94495413e-01, ...,
        1.00000000e+00, 1.00000000e+00, 3.99481193e-01],
       [8.07180851e-01, 4.54788780e-02, 8.99082569e-01, ...,
        1.00000000e+00, 1.00000000e+00, 1.42671855e-01],
       [7.84574468e-02, 8.07622132e-04, 9.17431193e-01, ...,
        1.00000000e+00, 1.00000000e+00, 2.72373541e-01],
       ...,
       [9.08244681e-01, 1.23819442e-02, 6.97247706e-01, ...,
        1.00000000e+00, 7.00000000e-01, 2.77561608e-01],
       [9.08244681e-01, 1.23819442e-02, 6.97247706e-01, ...,
        1.00000000e+00, 6.00000000e-01, 3.92996109e-01],
       [8.37765957e-02, 9.94185282e-02, 1.00000000e+00, ...,
        3.75000000e-01, 5.00000000e-01, 5.35667964e-01]])

In [50]:

feature_scale=[feature for feature in coded.columns if feature not in ['world_revenue']]

from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler()

scaler.fit(coded[feature_scale])

data = pd.concat([coded[['world_revenue']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(coded[feature_scale]), columns=feature_scale)],
                    axis=1)


In [51]:
data.head()

Unnamed: 0,world_revenue,originalTitle,distributor,opening_theaters,budget,MPAA,genres_x,release_days,startYear,runtimeMinutes,averageRating,numVotes,ordering,category,primaryName
0,24701637,0.0625,0.394495,0.01585,0.005729,1.0,0.97619,0.228228,0.25,0.726667,0.772152,0.025749,1.0,1.0,0.399481
1,127461872,0.807181,0.899083,0.173,0.02088,0.75,0.027211,0.126126,0.25,0.52,0.594937,0.184138,1.0,1.0,0.142672
2,2414269,0.078457,0.917431,0.00035,0.000558,1.0,0.353741,0.228228,0.25,0.393333,0.607595,0.029574,1.0,1.0,0.272374
3,13061491,0.849734,1.0,0.09555,0.003028,0.75,0.602041,0.533033,0.0,0.473333,0.556962,0.018871,1.0,1.0,0.835279
4,13061491,0.849734,1.0,0.09555,0.003028,0.75,0.602041,0.533033,0.0,0.473333,0.556962,0.018871,1.0,1.0,0.835279


In [None]:
DataTransformationArtifact = namedtuple("DataTransformationArtifact",
 ["is_transformed", "message", "transformed_train_file_path","transformed_test_file_path",
     "preprocessed_object_file_path"])
class DataTransform:
    def __init__(self, data_transformation: DataTransformConfig,
                 data_ingestion_artifact: DataIngestionArtifact,
                 data_validation_artifact: DataValidationArtifact) -> None:
        try:
            
            self.data_transformation_config = data_transformation
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_artifact = data_validation_artifact

        except Exception as e:
            return e
        
    def run_data_transformation(self)->DataTransformationArtifact:
        try:
            
            preprocessing_obj = self.get_data_transformer_object()

            #logging.info(f"Obtaining training and test file path.")
            train_file_path = self.data_ingestion_artifact.train_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path
            

            #schema_file_path = self.data_validation_artifact.schema_file_path
            datatset_schema = read_yaml(DATA_VALIDATION_FILE)
            # logging.info(f"Loading training and test data as pandas dataframe.")
            train_df = load_data(file_path=train_file_path, schema_file_path=datatset_schema)
            
            test_df = load_data(file_path=test_file_path, schema_file_path=datatset_schema)

            schema = read_yaml(file_path=datatset_schema)

            target_column_name = schema[TARGET_COLUMN_KEY]


            #logging.info(f"Splitting input and target feature from training and testing dataframe.")
            input_feature_train_df = train_df.drop(columns=[target_column_name],axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name],axis=1)
            target_feature_test_df = test_df[target_column_name]
            

            #logging.info(f"Applying preprocessing object on training dataframe and testing dataframe")
            input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)


            train_arr = np.c_[ input_feature_train_arr, np.array(target_feature_train_df)]

            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]
            # root_dir: Path
            # tranfored_train_dir: Path
            # transormed_test_dir: Path
            # preprocessed_file_path: Path
            transformed_train_dir = self.data_transformation_config.tranfored_train_dir
            transformed_test_dir = self.data_transformation_config.transormed_test_dir
            #since we have numpy array data we replace file extension in npz
            train_file_name = os.path.basename(train_file_path).replace(".csv",".npz")
            test_file_name = os.path.basename(test_file_path).replace(".csv",".npz")

            transformed_train_file_path = os.path.join(transformed_train_dir, train_file_name)
            transformed_test_file_path = os.path.join(transformed_test_dir, test_file_name)

            #logging.info(f"Saving transformed training and testing array.")
            
            save_numpy_array_data(file_path=transformed_train_file_path,array=train_arr)
            save_numpy_array_data(file_path=transformed_test_file_path,array=test_arr)

            preprocessing_obj_file_path = self.data_transformation_config.preprocessed_file_path

            logging.info(f"Saving preprocessing object.")
            save_object(file_path=preprocessing_obj_file_path,obj=preprocessing_obj)

            data_transformation_artifact = DataTransformationArtifact(is_transformed=True,
            message="Data transformation successfull.",
            transformed_train_file_path=transformed_train_file_path,
            transformed_test_file_path=transformed_test_file_path,
            preprocessed_object_file_path=preprocessing_obj_file_path

            )
            logging.info(f"Data transformationa artifact: {data_transformation_artifact}")
            return data_transformation_artifact
        except Exception as e:
            return e


In [None]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.labels_ordered_ = {}
        categorical_features = [feature for feature in X.columns if X[feature].dtype == 'O']
        for feature in categorical_features:
            labels_ordered = X.groupby([feature]).size().sort_values().index
            labels_ordered = {value: index for index, value in enumerate(labels_ordered, 0)}
            self.labels_ordered_[feature] = labels_ordered
        return self
    
    def transform(self, X, y=None):
        for feature, labels_ordered in self.labels_ordered_.items():
            X[feature] = X[feature].map(labels_ordered)
        return X.dropna()

all_columns = dataset_schema[DATASET_SCHEMA_COLUMNS_KEY]
encoding_scaling_pipeline = Pipeline([
                    ('encoder', CategoricalEncoder()),
                     ('scaler', 
                      )
            ])
preprocessing = ColumnTransformer([
            ('encoding_scaling_pipeline', encoding_scaling_pipeline, all_columns)

            ])

error message: encoding_scaling_pipeline [not enough values to unpack (expected 2, got 1)]