In [14]:
import pandas as pd
import numpy as np
import product_sub.settings as stg
from os.path import join
from pandas.tseries.offsets import MonthEnd

In [15]:
df_data = pd.read_csv('../data/raw/data.csv', sep=";")

In [16]:
class MarketingsCustomersCleaner:
    """
    Performs technical cleaning on job types

    Attributes
    ----------

    Methods
    -------
    """

    def __init__(self, filename):
        """Class initialization."""
        self.filename = filename
        pass

    @property
    def data(self):
        """Technical cleaning of customers informations from flat files

        Returns
        -------
        DataFrame
            DataFrame technical cleaned (without accents, Yes/No)

        Raises
        ------
        FileExistsError
            The file must be a csv file for now
        """
        if self.filename.endswith(".csv"):
            df = pd.read_csv(join(stg.RAW_DATA_DIR, self.filename), sep=";")
        else:
            raise FileExistsError("Extension must be csv.")

        df_bank = self._get_data_cleaned(df)
        return df_bank

    def _clean_accents(
        self, df, columns_to_clean_accents=[stg.COL_RAW_JOB, stg.COL_RAW_STATUS]
    ):
        df_without_accent = df.copy()
        for col in columns_to_clean_accents:
            df_without_accent[col] = (
                df[col]
                .str.normalize("NFKD")
                .str.encode("ascii", errors="ignore")
                .str.decode("utf-8")
            )
        return df_without_accent

    def _yes_no_converter(
        self,
        df,
        cols_to_binary_convert=[
            stg.COL_RAW_SUBSCRIPTION,
            stg.COL_RAW_HAS_HOUSING_LOAN,
            stg.COL_RAW_HAS_PERSO_LOAN,
            stg.COL_RAW_HAS_DEFAULT,
        ],
    ):
        df_yes_no = df.copy()
        for col in cols_to_binary_convert:
            df_yes_no[col] = df[col].replace(stg.BOOLEAN_ENCODING)
        return df_yes_no

    def _get_data_cleaned(self, df):
        df_with_no_accents = self._clean_accents(df)
        df_with_no_yes_no = self._yes_no_converter(df_with_no_accents)
        return df_with_no_yes_no



In [17]:
class SocioDataset:
    """
    Performs technical cleaning on job types

    Attributes
    ----------

    Methods
    -------
    """

    def __init__(self, filename):
        """Class initialization."""
        self.filename = filename
        pass

    @property
    def data(self):
        """Technical cleaning of customers informations from flat files

        Returns
        -------
        DataFrame
            DataFrame technical cleaned (without accents, Yes/No)

        Raises
        ------
        FileExistsError
            The file must be a csv file for now
        """
        if self.filename.endswith(".csv"):
            df = pd.read_csv(join(stg.RAW_DATA_DIR, self.filename), sep=";")
        else:
            raise FileExistsError("Extension must be csv.")

        return df
    
    
    
    


In [18]:
class MergeSocioBank():
    
    def __init__(self): 
        pass
    
    def _create_year_month_col(self, dataset_marketing, dataset_socio):
        df_market_year_month = self._add_year_month(dataset_marketing)
        df_socio_year_month = self._add_year_month(dataset_socio)
        return df_market_year_month, df_socio_year_month
                                               
    def _add_year_month(self, df):
        df_with_year_month = df.assign(
            **{
                stg.COL_YEAR_MONTH: lambda df: pd.to_datetime(df[stg.COL_RAW_DATE]).dt.to_period('M')
            })
        return df_with_year_month
    
    def merge(self, df_marketing, df_socio):
        df_marketing_with_year_month , df_socio_with_year_month = self._create_year_month_col(df_marketing, df_socio)
        df_merged = pd.merge(df_marketing_with_year_month,df_socio_with_year_month,how='left', on='YEAR/MONTH')
        cols_to_drop = [f'{stg.COL_RAW_DATE}_y', 'YEAR/MONTH']
        mapping_for_date = {f'{stg.COL_RAW_DATE}_x': stg.COL_RAW_DATE}
        df_merged_dropped = df_merged.drop(columns=cols_to_drop ).rename(columns=mapping_for_date)
        return df_merged_dropped
        



In [19]:
dataset_socio = SocioDataset("socio_eco.csv").data
dataset_socio

Unnamed: 0,DATE,EMPLOYMENT_VARIATION_RATE,IDX_CONSUMER_PRICE,IDX_CONSUMER_CONFIDENCE
0,2008-05-31,1.1,93.994,-36.4
1,2008-06-30,1.4,94.465,-41.8
2,2008-07-31,1.4,93.918,-42.7
3,2008-08-31,1.4,93.444,-36.1
4,2008-09-30,,,
5,2008-10-31,-0.1,93.798,-40.4
6,2008-11-30,-0.1,93.2,-42.0
7,2008-12-31,-0.2,92.756,-45.9
8,2009-01-31,,,
9,2009-02-28,,,


In [20]:
dataset_marketing = MarketingsCustomersCleaner("data.csv").data 
dataset_socio = SocioDataset("socio_eco.csv").data
df_merged = MergeSocioBank().merge(dataset_marketing , dataset_socio)

In [21]:
df_merged.IDX_CONSUMER_PRICE.mean()

93.57648274267355

In [22]:
dataset_socio['DATE'] = pd.to_datetime(dataset_socio['DATE'])+ pd.DateOffset(1)
dataset_socio['quarter'] = dataset_socio['DATE'].dt.to_period('Q')

In [23]:
dataset_socio

Unnamed: 0,DATE,EMPLOYMENT_VARIATION_RATE,IDX_CONSUMER_PRICE,IDX_CONSUMER_CONFIDENCE,quarter
0,2008-06-01,1.1,93.994,-36.4,2008Q2
1,2008-07-01,1.4,94.465,-41.8,2008Q3
2,2008-08-01,1.4,93.918,-42.7,2008Q3
3,2008-09-01,1.4,93.444,-36.1,2008Q3
4,2008-10-01,,,,2008Q4
5,2008-11-01,-0.1,93.798,-40.4,2008Q4
6,2008-12-01,-0.1,93.2,-42.0,2008Q4
7,2009-01-01,-0.2,92.756,-45.9,2009Q1
8,2009-02-01,,,,2009Q1
9,2009-03-01,,,,2009Q1


In [24]:
dataset_socio.groupby('quarter').mean()

Unnamed: 0_level_0,EMPLOYMENT_VARIATION_RATE,IDX_CONSUMER_PRICE,IDX_CONSUMER_CONFIDENCE
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008Q2,1.1,93.994,-36.4
2008Q3,1.4,93.942333,-40.2
2008Q4,-0.1,93.499,-41.2
2009Q1,-0.2,92.756,-45.9
2009Q2,-1.8,92.937,-47.766667
2009Q3,-2.9,92.544333,-35.266667
2009Q4,-3.4,92.486333,-28.933333
2010Q1,-3.0,92.713,-33.0
2010Q2,-1.8,93.664667,-36.466667
2010Q3,-1.7,94.099,-39.466667


In [25]:
empl_with_quarter = dataset_socio.groupby('quarter')['EMPLOYMENT_VARIATION_RATE'].mean()
dataset_socio['EMPLOYMENT_VARIATION_RATE'] = dataset_socio.apply
(lambda x : empl_with_quarter[x['quarter']] if np.isnan(x['EMPLOYMENT_VARIATION_RATE']) else x['EMPLOYMENT_VARIATION_RATE'], axis=1)

SyntaxError: invalid syntax (<ipython-input-25-61834a2810fa>, line 3)

In [None]:
#dataset_socio['IDX_CONSUMER_PRICE'].median()

In [None]:
res['2008Q2']