In [105]:
import os
import sys
from IPython.display import Markdown, display, Image
import numpy as np
import pandas as pd
import random
import math
import dvc.api
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from helper import Helper
from app_logger import App_Logger
import plots
from sklearn import preprocessing


In [2]:
import warnings
# Suppress every warning category for the rest of the session to keep cell
# output uncluttered.
# NOTE(review): this blanket filter also hides pandas' SettingWithCopyWarning
# and deprecation notices -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Shared instance of the project-local Helper class (imported from the `helper`
# module above). Later cells call helper.read_csv(), and
# PreprocessRossmanData.process() calls helper.scaler() through this global, so
# this cell must run before either of them.
helper = Helper()

In [6]:
# Read the three raw CSV files in the order train, store, test.
# Unpacking drains the map eagerly, so all three reads happen right here.
train_df, store_df, test_df = map(
    helper.read_csv,
    ("../data/train.csv", "../data/store.csv", "../data/test.csv"),
)

file read as csv
file read as csv
file read as csv


In [87]:
class CleanAndTransformTrainData:
    """Clean the training frame and derive calendar features from it.

    ``get_cleaned`` runs the full pipeline. The individual steps stay public so
    they can be used on their own. ``drop_closed_stores`` and ``sort_by_date``
    return new frames; every other step adds or overwrites columns on the frame
    it receives and returns that same frame.
    """

    def __init__(self):
        pass

    def drop_closed_stores(self, df):
        """Return a copy of ``df`` holding only the rows where ``Open == 1``.

        Raises:
            KeyError: if ``df`` has no ``Open`` column.
        """
        if "Open" not in df.columns:
            raise KeyError("drop_closed_stores requires an 'Open' column")
        # Explicit copy: the later steps assign new columns to this frame, and
        # assigning into a filtered selection triggers SettingWithCopyWarning.
        return df.query("Open == 1").copy()

    def convert_to_datatime(self, df):
        """Parse ``df['Date']`` to datetimes in place and return ``df``.

        Parsing errors propagate to the caller instead of being swallowed.
        """
        # Use the frame that was passed in, not a notebook-level global.
        df['Date'] = pd.to_datetime(df['Date'])
        return df

    def sort_by_date(self, df):
        """Return a new frame sorted by ``Date``, most recent first."""
        return df.sort_values(by=["Date"], ascending=False)

    def to_str(self, df):
        """Cast ``StateHoliday`` to ``str`` in place and return ``df``."""
        df['StateHoliday'] = df['StateHoliday'].astype(str)
        return df

    def transform_date(self, df):
        """Extract Day, Month and Year from ``Date`` and bucket the day.

        Adds the columns ``Year``, ``Month``, ``Day`` and ``DayInMonth`` (see
        ``to_month_category``) in place and returns ``df``.
        """
        df['Date'] = pd.to_datetime(df['Date'])
        date_parts = df['Date'].dt
        df['Year'] = date_parts.year
        df['Month'] = date_parts.month
        df['Day'] = date_parts.day
        df['DayInMonth'] = df['Day'].apply(self.to_month_category)
        return df

    def add_weekday_col(self, df):
        """Add a ``Weekends`` flag in place and return ``df``.

        The flag is 1 where ``DayOfWeek`` is greater than 5 and 0 otherwise.
        """
        df["Weekends"] = (df["DayOfWeek"] > 5).astype(int)
        return df

    def to_month_category(self, value):
        """Map a day-of-month number to a coarse position in the month.

        Returns:
            ``"BegMonth"`` for 1 <= value < 10, ``"MidMonth"`` for
            10 <= value < 20, ``"EndMonth"`` for every other number, and
            ``None`` when ``value`` cannot be compared with a number.
        """
        try:
            if 1 <= value < 10:
                return "BegMonth"
            if 10 <= value < 20:
                return "MidMonth"
            return "EndMonth"
        except TypeError:
            # Non-numeric input (e.g. a string): keep the best-effort None.
            return None

    def get_cleaned(self, df):
        """Run the whole cleaning pipeline and return the resulting new frame.

        The caller's ``df`` is not modified, because the first step works on a
        filtered copy.
        """
        df = self.drop_closed_stores(df)
        df = self.to_str(df)
        df = self.convert_to_datatime(df)
        df = self.transform_date(df)
        df = self.add_weekday_col(df)
        return df
     
   
    

In [88]:
class cleanStoreDf:
    """Fill the missing values of the store frame."""

    # Columns whose gaps are filled with that column's own maximum.
    _FILL_WITH_MAX = ('CompetitionDistance', 'Promo2SinceWeek', 'Promo2SinceYear')
    # Columns whose gaps are filled with that column's most frequent value.
    _FILL_WITH_MODE = (
        'PromoInterval',
        'CompetitionOpenSinceYear',
        'CompetitionOpenSinceMonth',
    )

    def __init__(self):
        pass

    def handle_missing_value(self, df):
        """Fill NaNs column by column, in place, and return ``df``.

        ``CompetitionDistance``, ``Promo2SinceWeek`` and ``Promo2SinceYear``
        are filled with the column's own maximum. ``PromoInterval``,
        ``CompetitionOpenSinceYear`` and ``CompetitionOpenSinceMonth`` are
        filled with the column's most frequent value. A column without any
        observed value is left as it is.
        """
        for column in self._FILL_WITH_MAX:
            # Each column uses its OWN maximum (Promo2SinceYear used to be
            # filled with the maximum of Promo2SinceWeek).
            df[column] = df[column].fillna(df[column].max())

        for column in self._FILL_WITH_MODE:
            most_frequent = df[column].mode()
            # mode() is empty when the column holds nothing but NaN.
            if not most_frequent.empty:
                df[column] = df[column].fillna(most_frequent.iloc[0])

        return df

    def get_cleaned(self, df):
        """Return ``df`` with its missing values filled (same object)."""
        return self.handle_missing_value(df)
             

In [145]:
class PreprocessRossmanData:
    """Label-encode, merge and scale the cleaned train and store frames.

    Every ``LabelEncoder`` fitted by the ``encode_*`` methods is kept in
    ``self.encoders`` under its column name, so the same mapping can later be
    applied to other data with ``transform`` or undone with
    ``inverse_transform``.
    """

    def __init__(self):
        # column name -> fitted LabelEncoder
        self.encoders = {}

    def _label_encode(self, df, columns):
        """Fit one LabelEncoder per column, overwrite the column, keep the encoder."""
        for column in columns:
            encoder = preprocessing.LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            self.encoders[column] = encoder
        return df

    def encode_train_data(self, df):
        """Label-encode ``StateHoliday`` and ``DayInMonth`` in place; return ``df``."""
        return self._label_encode(df, ('StateHoliday', 'DayInMonth'))

    def encode_store_data(self, df):
        """Label-encode ``StoreType``, ``Assortment`` and ``PromoInterval`` in place; return ``df``."""
        return self._label_encode(df, ('StoreType', 'Assortment', 'PromoInterval'))

    def merge_encoded(self, train_enc, store_enc):
        """Join the two frames on ``Store`` using pandas' default inner join."""
        return pd.merge(train_enc, store_enc, on="Store")

    def process(self, train_df, store_df):
        """Encode both frames, drop ``Date``, merge on ``Store`` and scale.

        Both input frames are modified by the encoding step. Scaling is
        delegated to the notebook-level ``helper`` object, which must exist
        before this method is called.

        Returns:
            Whatever ``helper.scaler`` returns for the merged frame.
        """
        enc_train = self.encode_train_data(train_df)
        enc_store = self.encode_store_data(store_df)
        enc_train = enc_train.drop(columns=["Date"])
        merged = self.merge_encoded(enc_train, enc_store)
        # Every merged column is handed to the scaler, identifiers included.
        # NOTE(review): helper.scaler is project code not visible here --
        # presumably mode="standard" means standard scaling; confirm in Helper.
        scaled = helper.scaler(merged, merged.columns.to_list(), mode="standard")

        return scaled
    
        
        
        
        
        
    

In [146]:

# Run both cleaning pipelines on the raw frames.
# - train: get_cleaned() starts from df.query("Open == 1"), so clean_train_df
#   is a new frame.
# - store: get_cleaned() fills the NaNs on the frame it receives and returns
#   that frame, so clean_store_df is the same object as store_df.
clean_train_df = CleanAndTransformTrainData().get_cleaned(train_df)
clean_store_df = cleanStoreDf().get_cleaned(store_df)


In [149]:
# Label-encode the categorical columns, drop Date, join train and store on
# "Store", then pass the merged frame to helper.scaler(mode="standard").
# Note: the encoding step assigns into clean_train_df / clean_store_df
# directly, so both frames are modified by this call.
processed_train_data = PreprocessRossmanData().process(clean_train_df, clean_store_df)