In [105]:
import os
import sys
from IPython.display import Markdown, display, Image
import numpy as np
import pandas as pd
import random
import math
import dvc.api
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from helper import Helper
from app_logger import App_Logger
import plots
from sklearn import preprocessing


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
helper = Helper()

In [6]:
train_df = helper.read_csv("../data/train.csv")
store_df = helper.read_csv("../data/store.csv")
test_df = helper.read_csv("../data/test.csv")

file read as csv
file read as csv
file read as csv


In [87]:
class CleanAndTransformTrainData:
    """
        this is a dataframe used to clean train dataframe
    """
    def __init__(self):
        pass
    
    def drop_closed_stores(self, df):
        
        try:
            cleaned = df.query("Open == 1")
            return cleaned
        except:
            pass
        
    def convert_to_datatime(self, df):  
        try:
            df['Date'] = pd.to_datetime(train_df['Date'])
            return df
        except:
            pass
    
    def sort_by_date(self, df):
        return df.sort_values(by=["Date"], ascending=False)
            
    
    def to_str(self, df):
        df['StateHoliday'] = df['StateHoliday'].astype(str)
        return df
    
    
    """Extracts Day Month and Year from Date"""
    def transform_date(self, df):
        
        df['Date'] = pd.to_datetime(df['Date'])
        df['Year'] = pd.DatetimeIndex(df['Date']).year
        df['Month'] = pd.DatetimeIndex(df['Date']).month
        df['Day'] = pd.DatetimeIndex(df['Date']).day
        df['DayInMonth'] = df['Day'].apply(lambda x: self.to_month_category(x))
        return df
    
    """add called weekends """
    def add_weekday_col(self, df):
      
        df["Weekends"] = df["DayOfWeek"].apply(lambda x: 1 if x > 5 else 0)
        return df
    
    def to_month_category(self, value):
        try:
            
            if (value >= 1 and int(value) < 10):
                return "BegMonth"

            elif (value >= 10 and value < 20):
                return "MidMonth"
            else:
                return "EndMonth"
        except:
            pass
        
    def get_cleaned(self, df):
        df = self.drop_closed_stores(df)
        df = self.to_str(df)
        df = self.convert_to_datatime(df)
        df = self.transform_date(df)
        df = self.add_weekday_col(df)
#         df = self.to_month_category(df)
        return df
     
   
    

In [88]:
class cleanStoreDf:
    """ This is a class to clean store df"""
    
    def __init__(self):
        pass
    
    def handle_missing_value(self, df):
        """We handled CompetitionDistance by replacing it with median"""
        
        
        df['CompetitionDistance'] = df['CompetitionDistance'].fillna(df['CompetitionDistance'].max())
        df['Promo2SinceWeek'] = df['Promo2SinceWeek'].fillna(df['Promo2SinceWeek'].max())
        df['Promo2SinceYear'] = df['Promo2SinceYear'].fillna(df['Promo2SinceWeek'].max())
        df['PromoInterval'] = df['PromoInterval'].fillna(df['PromoInterval'].mode()[0])
        df['CompetitionOpenSinceYear'] = df['CompetitionOpenSinceYear'].fillna(df['CompetitionOpenSinceYear'].mode()[0])
        df['CompetitionOpenSinceMonth'] = df['CompetitionOpenSinceMonth'].fillna(df['CompetitionOpenSinceMonth'].mode()[0])
        
        return df
    def get_cleaned(self, df):
        return self.handle_missing_value(df)
             

In [136]:
class PreprocessRossmanData:
    
    def __init__(self):
        pass
    
    def encode_train_data(self, df):
        
        StateHolidayEncoder = preprocessing.LabelEncoder()
        DayInMonthEncoder = preprocessing.LabelEncoder()
        
        df['StateHoliday'] = StateHolidayEncoder.fit_transform(df['StateHoliday'])
        df['DayInMonth'] = DayInMonthEncoder.fit_transform(df['DayInMonth'])
        return df
        
    def encode_store_data(self, df):
        StoreTypeEncoder = preprocessing.LabelEncoder()
        AssortmentEncoder = preprocessing.LabelEncoder()
        PromoIntervalEncoder = preprocessing.LabelEncoder()

        
#         PromoInterval
        df['StoreType'] = StoreTypeEncoder.fit_transform(df['StoreType'])
        df['Assortment'] = AssortmentEncoder.fit_transform(df['Assortment'])
        df['PromoInterval'] = PromoIntervalEncoder.fit_transform(df['PromoInterval'])

        return df
    
    def merge_encoded(self, train_enc, store_enc):
        return pd.merge(train_enc, store_enc, on="Store")
    
    def process(self, train_df, store_df):
        
        enc_train = self.encode_train_data(train_df)
        enc_store = self.encode_store_data(store_df)
        enc_train = enc_train.drop(columns=["Date"], axis=1)
        merged = self.merge_encoded(enc_train, enc_store)
        sclaed helper.scaler(merged, merged.columns.to_list(), mode="standard")
        
        return self.merge_encoded(enc_train, enc_store)
    
        
        
        
        
        
    

In [137]:

clean_train_df = CleanAndTransformTrainData().get_cleaned(train_df)
clean_store_df = cleanStoreDf().get_cleaned(store_df)


In [138]:
res= PreprocessRossmanData().process(clean_train_df, clean_store_df)

In [119]:
def encode_labels(df):
    date_encoder = preprocessing.LabelEncoder()
    device_encoder = preprocessing.LabelEncoder()
    browser_encoder = preprocessing.LabelEncoder()
    experiment_encoder = preprocessing.LabelEncoder()
    aware_encoder = preprocessing.LabelEncoder()
    
    df['date'] = date_encoder.fit_transform(df['date'])
    df['device_make'] = device_encoder.fit_transform(df['device_make'])
    df['browser'] = browser_encoder.fit_transform(df['browser'])
    df['experiment'] = experiment_encoder.fit_transform(cleaned_df['experiment'])
    df['browser'] = aware_encoder.fit_transform(df['browser'])
    df['aware'] = aware_encoder.fit_transform(df['aware'])


    
    return df

In [139]:
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844392 entries, 0 to 844391
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Store                      844392 non-null  int64  
 1   DayOfWeek                  844392 non-null  int64  
 2   Sales                      844392 non-null  int64  
 3   Customers                  844392 non-null  int64  
 4   Open                       844392 non-null  int64  
 5   Promo                      844392 non-null  int64  
 6   StateHoliday               844392 non-null  int64  
 7   SchoolHoliday              844392 non-null  int64  
 8   Year                       844392 non-null  int64  
 9   Month                      844392 non-null  int64  
 10  Day                        844392 non-null  int64  
 11  DayInMonth                 844392 non-null  int64  
 12  Weekends                   844392 non-null  int64  
 13  StoreType                  84

In [123]:
clean_train_df.drop(columns=["Date"], axis=1)

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,DayInMonth,Weekends
0,1,5,5263,555,1,1,0,1,2015,7,31,1,0
1,2,5,6064,625,1,1,0,1,2015,7,31,1,0
2,3,5,8314,821,1,1,0,1,2015,7,31,1,0
3,4,5,13995,1498,1,1,0,1,2015,7,31,1,0
4,5,5,4822,559,1,1,0,1,2015,7,31,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016776,682,2,3375,566,1,0,1,1,2013,1,1,0,0
1016827,733,2,10765,2377,1,0,1,1,2013,1,1,0,0
1016863,769,2,5035,1248,1,0,1,1,2013,1,1,0,0
1017042,948,2,4491,1039,1,0,1,1,2013,1,1,0,0


In [142]:
helper.scaler(res, res.columns.to_list(), mode="standard")

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,...,Weekends,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,-1.732571,0.858414,-0.545231,-0.517732,0.0,1.113726,-0.029796,2.041038,1.502796,0.347258,...,-0.460344,0.582814,-0.942988,-0.509313,0.435913,-0.387864,-0.997372,-0.757527,-0.997371,0.044677
1,-1.732571,0.278263,-0.623512,-0.540163,0.0,1.113726,-0.029796,2.041038,1.502796,0.347258,...,-0.460344,0.582814,-0.942988,-0.509313,0.435913,-0.387864,-0.997372,-0.757527,-0.997371,0.044677
2,-1.732571,-0.301888,-0.700182,-0.597488,0.0,1.113726,-0.029796,2.041038,1.502796,0.347258,...,-0.460344,0.582814,-0.942988,-0.509313,0.435913,-0.387864,-0.997372,-0.757527,-0.997371,0.044677
3,-1.732571,-0.882040,-0.626411,-0.505271,0.0,1.113726,-0.029796,2.041038,1.502796,0.347258,...,-0.460344,0.582814,-0.942988,-0.509313,0.435913,-0.387864,-0.997372,-0.757527,-0.997371,0.044677
4,-1.732571,-1.462191,-0.274954,-0.375668,0.0,1.113726,-0.029796,2.041038,1.502796,0.347258,...,-0.460344,0.582814,-0.942988,-0.509313,0.435913,-0.387864,-0.997372,-0.757527,-0.997371,0.044677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844387,-0.828090,-1.462191,0.752360,0.596349,0.0,1.113726,-0.029796,-0.489947,-1.070346,-1.457834,...,-0.460344,-0.884146,-0.942988,-0.529125,-0.644490,-0.200162,-0.997372,-0.757527,-0.997371,0.044677
844388,-0.828090,1.438566,-1.355421,-1.053588,0.0,-0.897887,-0.029796,-0.489947,-1.070346,-1.457834,...,2.172288,-0.884146,-0.942988,-0.529125,-0.644490,-0.200162,-0.997372,-0.757527,-0.997371,0.044677
844389,-0.828090,0.858414,-0.887025,-0.505271,0.0,-0.897887,-0.029796,2.041038,-1.070346,-1.457834,...,-0.460344,-0.884146,-0.942988,-0.529125,-0.644490,-0.200162,-0.997372,-0.757527,-0.997371,0.044677
844390,-0.828090,0.278263,-0.765255,-0.251051,0.0,-0.897887,-0.029796,2.041038,-1.070346,-1.457834,...,-0.460344,-0.884146,-0.942988,-0.529125,-0.644490,-0.200162,-0.997372,-0.757527,-0.997371,0.044677


In [141]:
clean_train_df.columns.to_list()

['Store',
 'DayOfWeek',
 'Date',
 'Sales',
 'Customers',
 'Open',
 'Promo',
 'StateHoliday',
 'SchoolHoliday',
 'Year',
 'Month',
 'Day',
 'DayInMonth',
 'Weekends']

In [144]:
res[['Store']]

TypeError: bad operand type for unary ~: 'str'