In [1]:
import multiprocessing as mp
import pandas as pd
import numpy as np
from tqdm import tqdm
from funk_svd import SVD

import math

In [2]:
train = pd.read_csv('train_ver2.csv')
test = pd.read_csv('test_ver2.csv')

  train = pd.read_csv('train_ver2.csv')
  test = pd.read_csv('test_ver2.csv')


In [3]:
train.head()

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,2015-01-28,1375586,N,ES,H,35,2015-01-12,0.0,6,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1,2015-01-28,1050611,N,ES,V,23,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2,2015-01-28,1050612,N,ES,V,23,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
3,2015-01-28,1050613,N,ES,H,22,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
4,2015-01-28,1050614,N,ES,V,23,2012-08-10,0.0,35,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


In [4]:
def transform_age(age):
    if isinstance(age, str):
        try: 
            return int(age)
        except:
            return 18
    return age

def transform_income(income):
    if isinstance(income, str):
        try:
            return float(income)
        except:
            return 1000
    return float(income)

def transform_value(df):
    df['age'] = df['age'].apply(lambda x: transform_age(x))
    df['renta'] = df['renta'].apply(lambda x: transform_income(x)) 
    return df

test_df = transform_value(test)
train_df = transform_value(train)

In [5]:
def split_and_train(df, test_df):
    
    df_1 = df[(df['age'] >= 18) & (df['age'] <= 30) & (df['renta'] <= 250000)]
    df_test_1 = test_df[(test_df['age'] <= 30) & (test_df['renta'] <= 250000)]
    
    df_2 = df[(df['age'] > 30) & (df['renta'] <= 250000)]
    df_test_2 = test_df[(test_df['age'] > 30) & (test_df['renta'] <= 250000)]
    
    df_3 = df[(df['age'] >= 18) & (df['age'] <= 30) & (df['renta'] > 250000)]
    df_test_3 = test_df[(test_df['age'] <= 30) & (test_df['renta'] > 250000)]
    
    df_4 = df[(df['age'] > 30) & (df['renta'] > 250000)]
    df_test_4 = test_df[(test_df['age'] > 30) & (test_df['renta'] > 250000)]
    
    return [[df_1, df_test_1], [df_2, df_test_2], [df_3, df_test_3], [df_4, df_test_4]]

In [6]:
mapping_product = {}
for idx, product in enumerate(train.columns[-24:]):
    mapping_product[idx] = product
    
product_columns = train.columns[-24:]
selected_columns = product_columns.tolist()
selected_columns.append('ncodpers')

In [7]:
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.matrix_factorization import NMF
from sklearn.metrics import mean_squared_error, mean_absolute_error
# from sklearn.model_selection import train_test_split
reader = Reader(rating_scale=(1, 5))

In [16]:
def construct_dataset(train, test):
    u_id_test, i_id_test = [], []
    print(f'Creating test set...')
    for id in tqdm(test['ncodpers'].unique()):
        u_id_test.extend([id]*len(product_columns))
        i_id_test.extend([i for i in range(len(product_columns))])

    test_df = pd.DataFrame(zip(u_id_test, i_id_test), columns=['u_id', 'i_id'])
    
    print(f'Creating train set...')
    train = train[selected_columns]

    u_id, i_id, rating = [], [], []
    train_group_uid = train.groupby('ncodpers').mean()
    for id in tqdm(train['ncodpers'].unique()):
        user_pass_value = train_group_uid.loc[id]

        rating_value = user_pass_value[product_columns].values*5

        indices = np.where(rating_value > 0)[0]
        values = rating_value[indices]

        u_id.extend([id] * len(indices))
        i_id.extend(indices.tolist())
        rating.extend(values.tolist())

    train_df = pd.DataFrame(zip(u_id, i_id, rating), columns=['u_id', 'i_id', 'rating'])
    # train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=42)
    return train_df, test_df

def fit_eval(train_df, test_df, algo):
    if algo == 'SVD':
        model = SVD(lr=0.001, reg=0.005, n_epochs=100, n_factors=100,
                  early_stopping=True, shuffle=False, min_rating=1, max_rating=10)
        print(f'******Training on dataset with {algo}******')
        model.fit(X=train_df)
        #####Evaluation
        prediction = model.predict(train_df)
        print(f'******Evaluation on trainset with {algo}******')
        print(f"""RMSE: {math.sqrt(mean_squared_error(train_df['rating'].tolist(), prediction))}""")
        print(f"""MAE: {mean_absolute_error(train_df['rating'].tolist(), prediction)}""")
        #####Inference
        prediction = model.predict(test_df)
        
    elif algo == 'NMF':
        trainset = Dataset.load_from_df(train_df[['u_id', 'i_id', 'rating']],
                                       reader=reader).build_full_trainset()
        model = NMF(n_factors=100, n_epochs=100)
        model = model.fit(trainset)
        prediction = []
        for uid, iid in zip(train_df['u_id'].tolist(), train_df['i_id'].tolist()):
            prediction.append(model.predict(uid=uid, iid=iid).est)
        print(f'******Evaluation on dataset with {algo}******')
        print(f"""RMSE: {math.sqrt(mean_squared_error(train_df['rating'].tolist(), prediction))}""")
        print(f"""MAE: {mean_absolute_error(train_df['rating'].tolist(), prediction)}""")
        
        prediction = []
        print(f'NMF inference time will take quite slow...')
        for i in tqdm(range(len(test_df))):
            sample = test_df.loc[i]
            prediction.append(model.predict(uid=sample['u_id'], iid=sample['i_id']).est)
    
    return prediction

In [17]:
def get_answer_file(train, test, all_in_one=False, algo='SVD'):
    status = 'all_in_one' if all_in_one else 'post_clustering' 
    file_name = f'{algo}_{all_in_one}.csv'
    with open(file_name, 'w') as f:
        f.write('ncodpers,added_products')
        f.write('\n')
        if not all_in_one:
            list_df = split_and_train(train, test)
            for num_stages, cluster in enumerate(list_df):
                print(f'Running cluster {num_stages}/{len(list_df)} in the dataset...')
                train, test = cluster[0], cluster[1]
                print(f'Reconstruct dataset for training & Evaluation...')
                train_df, test_df = construct_dataset(train, test)
                print(f'Training & Evaluation & Inference stage....')
                prediction = fit_eval(train_df, test_df, algo)
                
                for i, id in tqdm(enumerate(test_df['u_id'].unique().tolist())):
                    u_id_rating = prediction[24*i: 24*(i+1)]
                    top_indices = np.argsort(u_id_rating)[-2:].tolist()
                    product = f'{mapping_product[top_indices[0]]}'
                    f.write(f'{id}, {product}')
                    f.write('\n')
        else:
            print(f'Reconstruct dataset for training & evaluation...')
            train_df, test_df = construct_dataset(train, test)
            print(f'Training & Evaluation & Inference stage....')
            prediction = fit_eval(train_df, test_df, algo)
            
            for i, id in tqdm(enumerate(test_df['u_id'].unique().tolist())):
                u_id_rating = prediction[24*i: 24*(i+1)]
                top_indices = np.argsort(u_id_rating)[-2:].tolist()
                product = f'{mapping_product[top_indices[0]]}'
                f.write(f'{id}, {product}')
                f.write('\n')
                
get_answer_file(train_df , test_df, all_in_one=True, algo='SVD')

Reconstruct dataset for training & evaluation...
Creating test set...


100%|██████████| 929615/929615 [00:01<00:00, 616162.19it/s]


Creating train set...


100%|██████████| 956645/956645 [02:32<00:00, 6282.45it/s]


Training & Evaluation & Inference stage....
******Training on dataset with SVD******
Preprocessing data...

Epoch 1/100  | took 0.2 sec
Epoch 2/100  | took 0.2 sec
Epoch 3/100  | took 0.2 sec
Epoch 4/100  | took 0.2 sec
Epoch 5/100  | took 0.2 sec
Epoch 6/100  | took 0.2 sec
Epoch 7/100  | took 0.2 sec
Epoch 8/100  | took 0.2 sec
Epoch 9/100  | took 0.2 sec
Epoch 10/100 | took 0.2 sec
Epoch 11/100 | took 0.2 sec
Epoch 12/100 | took 0.2 sec
Epoch 13/100 | took 0.2 sec
Epoch 14/100 | took 0.2 sec
Epoch 15/100 | took 0.2 sec
Epoch 16/100 | took 0.2 sec
Epoch 17/100 | took 0.2 sec
Epoch 18/100 | took 0.2 sec
Epoch 19/100 | took 0.2 sec
Epoch 20/100 | took 0.2 sec
Epoch 21/100 | took 0.2 sec
Epoch 22/100 | took 0.2 sec
Epoch 23/100 | took 0.2 sec
Epoch 24/100 | took 0.2 sec
Epoch 25/100 | took 0.2 sec
Epoch 26/100 | took 0.2 sec
Epoch 27/100 | took 0.2 sec
Epoch 28/100 | took 0.2 sec
Epoch 29/100 | took 0.2 sec
Epoch 30/100 | took 0.2 sec
Epoch 31/100 | took 0.2 sec
Epoch 32/100 | took 0.2 

929615it [00:05, 161368.84it/s]
