In [None]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore")
%matplotlib inline

# Preprocessing

### Creating Modelling Datasets

In [None]:
def combine_p_cats(df):
    """Collapse the three product-category columns into shared 0/1 indicator columns.

    For every distinct value ``v`` found in ``p_cat_1``, ``p_cat_2`` or
    ``p_cat_3`` the result carries an integer column ``p_cat_<v>`` that is 1 on
    rows where *any* of the three source columns equals ``v`` and 0 otherwise.
    The value 0 (the placeholder the caller imputes for a missing category)
    and NaN produce no indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``p_cat_1``, ``p_cat_2`` and ``p_cat_3``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns of ``df`` plus the indicator
        columns, with columns in sorted order and any remaining NaN replaced
        by 0.

    Raises
    ------
    KeyError
        If one of the three source columns is missing from ``df``.
    """
    source_cols = ['p_cat_1', 'p_cat_2', 'p_cat_3']

    # Build every indicator from the untouched source columns *before* anything
    # is dropped or added. Indicator names such as 'p_cat_2' are identical to
    # source column names, so creating them while the sources are still in the
    # frame would let the raw values shadow the indicator.
    flags = {}
    for source in source_cols:
        values = df[source]
        for value in pd.unique(values.dropna()):
            if value == 0:
                # 0 stands for "no category in this slot".
                continue
            name = 'p_cat_' + str(value)
            hit = values == value
            # OR across the three slots: a category counts wherever it appears.
            flags[name] = (flags[name] | hit) if name in flags else hit

    indicators = pd.DataFrame(flags, index=df.index, columns=list(flags)).astype(int)

    # 'p_cat' was a scratch column of the earlier implementation and was never
    # part of the output, so it is dropped here as well if present.
    base = df.drop(columns=source_cols + ['p_cat'], errors='ignore')

    combined = pd.concat([base, indicators], axis=1)
    # Sorted column layout is kept on purpose: the previous implementation
    # produced it (via combine_first) and downstream CSV consumers may rely on it.
    combined = combined[base.columns.union(indicators.columns)]
    return combined.fillna(0)

def _rank_encode(frame, features, as_text=False):
    """Replace each listed column's values by their rank among the column's sorted unique values."""
    for feature in features:
        levels = np.sort(pd.unique(frame[feature]))
        codes = {level: (str(rank) if as_text else rank)
                 for rank, level in enumerate(levels)}
        frame[feature] = frame[feature].map(codes)
    return frame


def _one_hot_encode(frame, features, label_by_rank=False):
    """Replace each listed column that exists in ``frame`` by one 0/1 indicator column per distinct value."""
    available = set(frame.columns)
    for feature in features:
        if feature not in available:
            continue
        levels = np.sort(pd.unique(frame[feature]))
        # Column suffix is either the value itself or its rank in sorted order.
        suffixes = range(len(levels)) if label_by_rank else levels
        names = {level: feature + '_' + str(suffix)
                 for level, suffix in zip(levels, suffixes)}
        # dtype is pinned so the indicators are 0/1 integers independent of the
        # pandas version's get_dummies default.
        one_hot = pd.get_dummies(frame[feature].map(names), dtype=int)
        frame = frame.drop(feature, axis=1).join(one_hot)
    return frame


def write_model_dataset(chosen, combine_cats = False,
                        input_path = "inputs/BlackFriday.csv",
                        output_dir = "inputs"):
    """Build one of three modelling datasets from the raw CSV, write it to disk and return it.

    Parameters
    ----------
    chosen : int
        0 -> rank-encode gender, age, years_in_city and city
             (written as ``minimal_preprocess.csv``).
        1 -> one-hot occ, city and the product categories, rank-encode gender,
             age and years_in_city (``some_one_hot.csv``).
        2 -> one-hot age, occ, city, years_in_city and the product categories,
             rank-encode gender as strings (``all_one_hot.csv``).
        Any other value skips the encoding and writes no file.
    combine_cats : bool, default False
        Only used when ``chosen`` is 1 or 2. When truthy, the three product
        category columns are merged by ``combine_p_cats`` instead of being
        one-hot encoded separately, and the output file name gets the prefix
        ``p_combined_``.
    input_path : str, default "inputs/BlackFriday.csv"
        CSV file to read.
    output_dir : str, default "inputs"
        Directory the resulting CSV is written to.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame (the same data that was written to disk).
    """
    raw_df = pd.read_csv(input_path)
    modeldf = raw_df.rename(columns = {
        'User_ID' : 'u_id',
        'Product_ID' : 'p_id',
        'Gender' : 'gender',
        'Age' : 'age',
        'Occupation' : 'occ',
        'City_Category' : 'city',
        'Stay_In_Current_City_Years' : 'years_in_city',
        'Marital_Status' : 'married',
        'Product_Category_1' : 'p_cat_1',
        'Product_Category_2' : 'p_cat_2',
        'Product_Category_3' : 'p_cat_3',
        'Purchase' : 'pur',
    })

    # Normalize user and product ids: shift u_id down by 1,000,000 and strip
    # the letters from p_id so it can be parsed as a number.
    modeldf['u_id'] = modeldf['u_id'] - 1000000
    modeldf['p_id'] = pd.to_numeric(
        modeldf['p_id'].replace('[A-Za-z]', '', regex = True), errors = 'coerce')

    # Missing secondary/tertiary categories become 0. The result is assigned
    # back instead of using fillna(inplace=True) on a column selection, which
    # does not reliably modify the parent frame.
    # (To impute the column mean instead, replace the zeros here.)
    for col in ('p_cat_2', 'p_cat_3'):
        modeldf[col] = modeldf[col].fillna(0).astype(int)

    product_cats = ['p_cat_1', 'p_cat_2', 'p_cat_3']
    file_name = ""

    if chosen == 0:
        modeldf = _rank_encode(modeldf, ['gender', 'age', 'years_in_city', 'city'])
        file_name = "minimal_preprocess.csv"

    elif chosen == 1:
        # Indicator columns are named after the value itself, e.g. city_A.
        one_hot_features = ['occ', 'city'] + ([] if combine_cats else product_cats)
        modeldf = _one_hot_encode(modeldf, one_hot_features)
        if combine_cats:
            modeldf = combine_p_cats(modeldf)
        modeldf = _rank_encode(modeldf, ['gender', 'age', 'years_in_city'])
        file_name = "some_one_hot.csv"

    elif chosen == 2:
        # NOTE(review): here indicator columns are named after the value's rank
        # (city_0, city_1, ...) rather than the value, unlike chosen == 1, and
        # gender is encoded as the strings '0'/'1'. Kept as-is so existing
        # output files keep their column names; confirm this is intended.
        cat_features = (['age', 'occ', 'city', 'years_in_city']
                        + ([] if combine_cats else product_cats))
        modeldf = _one_hot_encode(modeldf, cat_features, label_by_rank = True)
        if combine_cats:
            modeldf = combine_p_cats(modeldf)
        modeldf = _rank_encode(modeldf, ['gender'], as_text = True)
        file_name = "all_one_hot.csv"

    if file_name:
        if chosen != 0 and combine_cats:
            file_name = "p_combined_" + file_name
        modeldf.to_csv(os.path.join(output_dir, file_name), encoding = 'utf-8', index = False)

    return modeldf

In [None]:
# Dataset variants accepted as the first argument:
#   0 -> minimal_preprocess
#   1 -> some_one_hot
#   2 -> all_one_hot
# Passing True as the second argument combines the product categories.

write_model_dataset(chosen=2, combine_cats=True).head()