In [1]:
import os
os.chdir("../")

In [2]:
ls

[0m[01;34martifacts[0m/     logistic_regression_scratch.ipynb  setup.cfg
[01;34mbuild[0m/         [01;34mlogs[0m/                              setup.py
[01;34mconfigs[0m/       params.yaml                        [01;34msrc[0m/
[01;34mdist[0m/          pyproject.toml                     template.py
dvc.yaml       README.md                          [01;34mtests[0m/
[01;34menv[0m/           requirements_dev.txt               tox.ini
init_setup.sh  requirements.txt
LICENSE        [01;34mresearch[0m/


In [3]:

from src.movie_predictor.constants import *

In [4]:
import os
from box.exceptions import BoxValueError
import yaml
from src.movie_predictor import logger
import json
import joblib
from ensure import ensure_annotations
from box import ConfigBox
from pathlib import Path
from typing import Any

def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """reads yaml file and returns
    Args:
        path_to_yaml (str): input is path
    Raises:
        ValueError: if yaml file is empty
        e: empty file
    Returns:
        ConfigBox: ConfigBox type
    """
    try:
        with open(path_to_yaml) as yaml_file:
            config_yaml = yaml.safe_load(yaml_file)
            logger.info(f"yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(config_yaml)
    except BoxValueError:
        raise ValueError("yaml file is empty")
    except Exception as e:
        raise e

@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """create list of directories
    Args:
        path_to_directories (list): list of path of directories
        ignore_log (bool, optional): ignore if multiple dirs is to be created. Defaults to False.
    """
    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if verbose:
            logger.info(f"created directory at: {path}")

@ensure_annotations
def save_json(path: Path, data: dict):
    """save json data
    Args:
        path (Path): path to json file
        data (dict): data to be saved in json file
    """
    with open(path, "w") as f:
        json.dump(data, f, indent=4)

    logger.info(f"json file saved at: {path}")

@ensure_annotations
def load_json(path: Path) -> ConfigBox:
    """load json files data
    Args:
        path (Path): path to json file
    Returns:
        ConfigBox: data as class attributes instead of dict
    """
    with open(path) as f:
        config_yaml = json.load(f)

    logger.info(f"json file loaded succesfully from: {path}")
    return ConfigBox(config_yaml)

@ensure_annotations
def save_bin(data: Any, path: Path):
    """save binary file
    Args:
        data (Any): data to be saved as binary
        path (Path): path to binary file
    """
    joblib.dump(value=data, filename=path)
    logger.info(f"binary file saved at: {path}")

@ensure_annotations
def load_bin(path: Path) -> Any:
    """load binary data
    Args:
        path (Path): path to binary file
    Returns:
        Any: object stored in the file
    """
    data = joblib.load(path)
    logger.info(f"binary file loaded from: {path}")
    return data

@ensure_annotations
def get_size(path: Path) -> str:
    """get size in KB
    Args:
        path (Path): path of the file
    Returns:
        str: size in KB
    """
    size_in_kb = round(os.path.getsize(path)/1024)
    return f"~ {size_in_kb} KB"

In [5]:
from dataclasses import dataclass
from pathlib import Path
from collections import namedtuple



@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    zip_data_file: Path
    unzip_dir: Path
    ingested_train_dir: Path
    ingested_test_dir: Path


In [6]:
class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        self.schema = read_yaml(DATA_VALIDATION_FILE)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])
        create_directories([config.ingested_train_dir])
        create_directories([config.ingested_test_dir])
        

        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_URL = config.source_URL,
            zip_data_file = config.zip_data_file,
            unzip_dir = config.unzip_dir,
            ingested_train_dir = config.ingested_train_dir,
            ingested_test_dir = config.ingested_test_dir
        )

        return data_ingestion_config

In [7]:
yaml_read = read_yaml(CONFIG_FILE_PATH)

In [8]:
yaml_read.artifacts_root

'artifacts'

In [9]:
yaml_read.artifacts_root

'artifacts'

In [10]:
yaml_read.data_ingestion

ConfigBox({'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/roshikdahal/movie_prediction/raw/main/research/movie.tgz', 'zip_data_file': 'artifacts/data_ingestion/raw_data', 'unzip_dir': 'artifacts/data_ingestion/local_data', 'ingested_train_dir': 'artifacts/data_ingestion/train', 'ingested_test_dir': 'artifacts/data_ingestion/test'})

In [11]:
config = yaml_read.data_ingestion
DataIngestionConfig(
            root_dir = config.root_dir,
            source_URL = config.source_URL,
            zip_data_file = config.zip_data_file,
            unzip_dir = config.unzip_dir,
            ingested_train_dir = config.ingested_train_dir,
            ingested_test_dir = config.ingested_test_dir
        )

DataIngestionConfig(root_dir='artifacts/data_ingestion', source_URL='https://github.com/roshikdahal/movie_prediction/raw/main/research/movie.tgz', zip_data_file='artifacts/data_ingestion/raw_data', unzip_dir='artifacts/data_ingestion/local_data', ingested_train_dir='artifacts/data_ingestion/train', ingested_test_dir='artifacts/data_ingestion/test')

In [14]:
from collections import namedtuple


DataIngestionArtifact = namedtuple("DataIngestionArtifact",
[ "train_file_path", "test_file_path", "is_ingested", "message"])

In [15]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import tarfile
import urllib.request as request
class DataIngestion:
    
    def __init__(self,dataingestionconfig:DataIngestionConfig):
        try:
            
            self.data_ingestion_config = dataingestionconfig
        except Exception as e:
            return e

    def download_Movies_data(self,) -> str:
        """
        this function get the url create the folder and store the downloaded data 
        in the folder which will be our zip folder.
        """
        try:
            data_url  =self.data_ingestion_config.source_URL
            tgz_download_dir = self.data_ingestion_config.zip_data_file
            os.makedirs(tgz_download_dir,exist_ok=True)

            #create the folder name based on url base name
            movie_file = os.path.basename(tgz_download_dir)
            #now append the movie filename in download_dir
            total_file_path = os.path.join(tgz_download_dir,movie_file)

            start_time=time.time()
            request.urlretrieve(data_url, total_file_path)
            stop_time = time.time()
            return total_file_path

        except Exception as e:
            return e

    #since we have downloaded data and store it in total_file_path which is of extension tgz we need to unzip it and store in raw data
    def extract_zip_file(self,total_file_path:str):
        try:
            raw_folder = self.data_ingestion_config.unzip_dir
            #create the folder is exists is True also
            os.makedirs(raw_folder,exist_ok=True)   

            with tarfile.open(total_file_path) as movies_file_object:
                movies_file_object.extractall(path=raw_folder)
        except Exception as e:
            return e

    
    def train_test_split(self)-> DataIngestionArtifact:
            """
            spliting data into train test and appending it on data ingestion artifact_entity
            """
            try:
                raw_data = self.data_ingestion_config.unzip_dir
                # #pick the first folder and get the data from first file 
                #main_folder= os.listdir(raw_data)[0]
                # #now merge folder to get propoer file path
                # movies_file_path = os.path.join(raw_data,main_folder)
                # logging.info("Reading the movies csv file [{raw_data}]")
                #read the csv file
                file_name = os.listdir(raw_data)[0]
                movies_file_path = os.path.join(raw_data,file_name)
                movies_df =  pd.read_csv(movies_file_path)
                
                #remaining to perform data split using stratified sampling

                #logging.info(f"Reading csv file: [{movies_file_path}]")
                #since world_revenue is our dependent variable we seprate the array element into bins and perform statistical analysis
                movies_df['world_revenue'] = movies_df.world_revenue.str.replace('$','',regex=True)
                movies_df['world_revenue'] = movies_df.world_revenue.str.replace(',','',regex=True)
                movies_df['world_revenue'] =  pd.to_numeric(movies_df['world_revenue'], errors='coerce')
                movies_df["revenue_cat"] = pd.cut(
                    movies_df["world_revenue"],
                    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],  #category for our world revenue 0.0 and 1.5 one group 1.5 to 3.0 another
                    labels=[1,2,3,4,5]  #names
                )
                

                # logging.info(f"Splitting data into train and test")
                strat_train_set = None
                strat_test_set = None
                #we are using StratifiedShuffleSplit to create  split with the size of 80:20 and n_splits is 
                # number of times the data needs to be sampled for test_size and data's are taken randomly at 42 
                split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

                #our group is movies_df["income_cat"] so split function will split based on this category
                for train_index,test_index in split.split(movies_df, movies_df["revenue_cat"]):
                    #since only for split we use movies_df["income_cat"] we are droping from train and test split
                    strat_train_set = movies_df.loc[train_index].drop(["revenue_cat"],axis=1)
                    strat_test_set = movies_df.loc[test_index].drop(["revenue_cat"],axis=1)

                #for saving the strat_train_set and strat_test_set we have directory and our filename
       
                train_file_path = os.path.join(self.data_ingestion_config.ingested_train_dir,
                                            file_name)

                test_file_path = os.path.join(self.data_ingestion_config.ingested_test_dir,
                                                        file_name)
                
    
                if strat_train_set is not None:
                    os.makedirs(self.data_ingestion_config.ingested_train_dir,exist_ok=True)
                    #logging.info(f"Exporting training datset to file: [{train_file_path}]")
                    strat_train_set.to_csv(train_file_path,index=False)

                if strat_test_set is not None:
                    os.makedirs(self.data_ingestion_config.ingested_test_dir, exist_ok= True)
                    #logging.info(f"Exporting test dataset to file: [{test_file_path}]")
                    strat_test_set.to_csv(test_file_path,index=False)
                

                data_ingestion_artifact = DataIngestionArtifact(train_file_path=train_file_path,
                test_file_path=test_file_path,
                is_ingested=True,
                message=f"Data ingestion completed successfully."
                )







            
                # logging.info(f"Data Ingestion artifact:[{data_ingestion_artifact}]")
                return data_ingestion_artifact

            except Exception as e:
                return e            
      



In [None]:

config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(dataingestionconfig = data_ingestion_config)
tgz_file = data_ingestion.download_Movies_data()
data_ingestion.extract_zip_file(total_file_path = tgz_file)



In [None]:
data_ingestion.train_test_split()

DataIngestionArtifact(train_file_path='artifacts/data_ingestion/train/main.csv', test_file_path='artifacts/data_ingestion/test/main.csv', is_ingested=True, message='Data ingestion completed successfully.')

In [None]:
raw_data = data_ingestion_config.unzip_dir
file_name = os.listdir(raw_data)[0]

In [None]:
movies_file_path = os.path.join(raw_data,file_name)
movies_df =  pd.read_csv(movies_file_path)

In [None]:
movies_df

Unnamed: 0.1,Unnamed: 0,originalTitle,domestic_revenue,world_revenue,distributor,opening_revenue,opening_theaters,budget,MPAA,genres_x,...,ordering,nconst,category,job,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,Super 30,"$2,269,878","$24,701,637",Reliance Big Pictures,"$871,256",317,,,"Biography,Drama",...,9,nm0618898,producer,producer,\N,Sajid Nadiadwala,1966,\N,"producer,writer,director","tt7518786,tt2372222,tt8366590,tt7721946"
1,1,Ad Astra,"$50,188,370","$127,461,872",Twentieth Century Fox,"$19,001,398",3460,"$90,000,000",PG-13,"Adventure,Drama,Mystery,Sci-Fi,Thriller",...,9,nm1250070,producer,producer,\N,Jeremy Kleiner,\N,\N,"producer,miscellaneous","tt2024544,tt1020072,tt4975722,tt7125860"
2,2,The Art of Self-Defense,"$2,410,914","$2,414,269",Bleecker Street Media,"$114,374",7,,R,"Action,Comedy,Crime,Drama,Mystery,Thriller",...,9,nm3442546,producer,producer,\N,Stephanie Whonsetler,\N,\N,"production_manager,miscellaneous,producer","tt6269368,tt10962368,tt7339248,tt4595186"
3,3,Welcome to Marwen,"$10,763,520","$13,061,491",Universal Pictures,"$2,354,205",1911,,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",...,9,nm0823330,producer,producer,\N,Steve Starkey,\N,\N,"producer,assistant_director,editorial_department","tt0109830,tt0118884,tt1907668,tt0162222"
4,4,Welcome to Marwen,"$10,763,520","$13,061,491",Universal Pictures,"$2,354,205",1911,,PG-13,"Biography,Comedy,Drama,Fantasy,Romance",...,9,nm0823330,producer,producer,\N,Steve Starkey,\N,\N,"producer,assistant_director,editorial_department","tt0109830,tt0118884,tt1907668,tt0162222"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
931,931,The House,"$25,584,504","$34,184,504",Warner Bros.,"$8,724,795",3134,"$40,000,000",R,"Comedy,Crime",...,9,nm0909629,writer,written by,\N,Enda Walsh,1967,\N,"writer,director,actor","tt11703050,tt0986233,tt0236157,tt21211282"
932,932,Gifted,"$24,801,212","$43,046,590",Fox Searchlight,"$446,380",56,"$7,000,000",PG-13,Drama,...,9,nm5537416,producer,producer,\N,Edwin Sherman,\N,\N,producer,"tt3758280,tt2393799,tt3727750"
933,933,Kidnap,"$30,971,040","$34,814,102",Aviron Pictures,"$10,016,323",2378,"$21,000,000",R,"Action,Crime,Thriller",...,9,nm5762850,cinematographer,\N,\N,Souvik Basu,\N,\N,"cinematographer,camera_department","tt10626906,tt23781422,tt10300662,tt5534436"
934,934,Kidnap,"$30,971,040","$34,814,102",Aviron Pictures,"$10,016,323",2378,"$21,000,000",R,"Action,Crime,Thriller",...,9,nm11940155,editor,\N,\N,Renjith Surendran,\N,\N,"editor,editorial_department","tt13192140,tt21094962,tt15554854"


In [None]:
movies_df['world_revenue'] = movies_df.world_revenue.str.replace('$','',regex=True)
movies_df['world_revenue'] = movies_df.world_revenue.str.replace(',','',regex=True)
movies_df['world_revenue'] =  pd.to_numeric(movies_df['world_revenue'], errors='coerce')

In [None]:
movies_df["revenue_cat"] = pd.cut(
                    movies_df["world_revenue"],
                    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],  #category for our world revenue 0.0 and 1.5 one group 1.5 to 3.0 another
                    labels=[1,2,3,4,5]  #names
                )

In [None]:
strat_train_set = None
strat_test_set = None
#we are using StratifiedShuffleSplit to create  split with the size of 80:20 and n_splits is 
# number of times the data needs to be sampled for test_size and data's are taken randomly at 42 
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

#our group is movies_df["income_cat"] so split function will split based on this category
for train_index,test_index in split.split(movies_df, movies_df["revenue_cat"]):
    #since only for split we use movies_df["income_cat"] we are droping from train and test split
    strat_train_set = movies_df.loc[train_index].drop(["revenue_cat"],axis=1)
    strat_test_set = movies_df.loc[test_index].drop(["revenue_cat"],axis=1)
train_file_path = os.path.join(data_ingestion_config.ingested_train_dir,
                                            file_name)

test_file_path = os.path.join(data_ingestion_config.ingested_test_dir,
                                        file_name)




            
if strat_train_set is not None:
    os.makedirs(data_ingestion_config.ingested_train_dir,exist_ok=True)
    #logging.info(f"Exporting training datset to file: [{train_file_path}]")
    strat_train_set.to_csv(train_file_path,index=False)

if strat_test_set is not None:
    os.makedirs(data_ingestion_config.ingested_test_dir, exist_ok= True)
    #logging.info(f"Exporting test dataset to file: [{test_file_path}]")
    strat_test_set.to_csv(test_file_path,index=False)


data_ingestion_artifact = DataIngestionArtifact(train_file_path=train_file_path,
        test_file_path=test_file_path,
        is_ingested=True,
        message=f"Data ingestion completed successfully."
        )  



In [None]:
data_ingestion_artifact

DataIngestionArtifact(train_file_path='artifacts/data_ingestion/train/main.csv', test_file_path='artifacts/data_ingestion/test/main.csv', is_ingested=True, message='Data ingestion completed successfully.')

In [None]:
data_ingestion_config.ingested_train_dir

'artifacts/data_ingestion/train'

In [None]:
train_file_path = os.path.join(data_ingestion_config.ingested_train_dir,
                                            file_name)

test_file_path = os.path.join(data_ingestion_config.ingested_test_dir,
                                        file_name)

In [None]:
train_file_path

'artifacts/data_ingestion/train/main.csv'

In [None]:
DataIngestionArtifact(train_file_path=train_file_path,
                test_file_path=test_file_path,
                is_ingested=True,
                message=f"Data ingestion completed successfully."
                )

DataIngestionArtifact(train_file_path='artifacts/data_ingestion/train/main.csv', test_file_path='artifacts/data_ingestion/test/main.csv', is_ingested=True, message='Data ingestion completed successfully.')

In [17]:


DataIngestionArtifact.train_file_path

__main__.DataIngestionArtifact