In [11]:
from datetime import datetime
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense # for hidden layers

Using TensorFlow backend.


In [78]:


"""Load data"""
data = pd.read_excel("zurich_insurance.xlsx")
data.head() # first 5 lines
data.tail() # show only the last 5 lines

"""Preprocessing"""
# Obtaining a Complete Dataset (Dropping Missing Values)
data = data.dropna()
data = data.reset_index(drop = True)
# data.describe()

# Change column to obtain the years the customer has been with the company
current_time = datetime.now()
customer_since = data['Customer since']
i = 0
time_since = [None] * len(data.index)
for idx in data.index:
    diff_time = current_time.timestamp() - customer_since[idx].timestamp() # in seconds
    time_since[i] = round(diff_time/60/60/24/30/12) # in years
    i+=1

data['Customer since'] = time_since
data = data.rename(columns = {'Customer since': 'Years customer'})

# Split data into predictors and outcome
X = data.iloc[:, 3:20] # from age on
y = data.iloc[:, 20] # variable to be predicted

labelencoder = LabelEncoder() 
canton = labelencoder.fit_transform(data['Canton'])
onehotencoder = OneHotEncoder(categorical_features = [0])
canton = onehotencoder.fit_transform(canton.reshape(-1,1)).toarray()
canton = canton[:, 1:] # avoid falling into dummy variable trap (e.g. if its not male is female)

encoded = pd.DataFrame(canton)
X = pd.concat([X, encoded], axis=1)

labelencoder = LabelEncoder()
X['Gender'] = labelencoder.fit_transform(X['Gender'])

# encode  output class values as integers
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size = 0.2, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Part 2 - Now let's make the ANN!

# define baseline model



#from sklearn.externals import joblib
## save the model to disk
#filename = 'finalized_model.pkl'
#joblib.dump(estimator, filename)
#
## some time later...
# 
## load the model from disk
#loaded_model = joblib.load(filename)
#result = loaded_model.score(X_test, y_test)
#print(result)




In [79]:
X

Unnamed: 0,Age,Gender,Years customer,No. line of business,AKT,General,Life,AM,AS,KA,...,16,17,18,19,20,21,22,23,24,25
0,55.0,0,13,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,35.0,0,7,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38.0,0,4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38.0,1,22,2.0,2.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,65.0,1,41,4.0,4.0,3.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,74.0,1,59,3.0,3.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,36.0,0,8,2.0,2.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,58.0,1,38,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,78.0,1,30,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,43.0,1,21,3.0,4.0,2.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [6]:
def baseline_model():
    # Initialising the ANN
    classifier = Sequential()
    
    # Adding the input layer and the first hidden layer
    classifier.add(Dense(units = 24, kernel_initializer = 'uniform', activation = 'relu', input_dim = 43))
    
    # Adding the second hidden layer
    classifier.add(Dense(units = 24, kernel_initializer = 'uniform', activation = 'relu'))
    
    # Adding the output layer
    classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'softmax'))
    
    # Compiling the ANN
    classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return classifier


estimator = KerasClassifier(build_fn=baseline_model, epochs=1, batch_size=10, verbose=0)

kfold = KFold(n_splits=2, shuffle=True, random_state=0)


In [21]:
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 69.97% (0.49%)


In [7]:
p = baseline_model()
p.fit(X, dummy_y)


Epoch 1/1


<keras.callbacks.History at 0x7f17bc42d748>

In [14]:
lp = p.predict_classes(X[:43])

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size = 0.2, random_state = 0)


array([[ 0.4596366 ,  0.83236433,  0.82848122, ..., -0.13543224,
        -0.14699557, -0.88427083],
       [ 1.16040615, -1.20139698, -0.34658596, ..., -0.13543224,
        -0.14699557, -0.88427083],
       [-0.50392154,  0.83236433, -0.89495065, ..., -0.13543224,
        -0.14699557,  1.13087526],
       ...,
       [ 0.4596366 ,  0.83236433,  1.06349466, ..., -0.13543224,
        -0.14699557, -0.88427083],
       [ 0.10925182,  0.83236433, -1.12996409, ..., -0.13543224,
        -0.14699557, -0.88427083],
       [-0.46012344,  0.83236433, -0.65993721, ..., -0.13543224,
        -0.14699557, -0.88427083]])

In [20]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [44]:
X_train

Unnamed: 0,Age,Gender,Years customer,No. line of business,AKT,General,Life,AM,AS,KA,...,16,17,18,19,20,21,22,23,24,25
18098,63.0,1,29,2.0,2.0,2.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24117,79.0,0,14,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9131,41.0,1,7,3.0,4.0,3.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15097,89.0,1,26,1.0,2.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10017,39.0,1,13,10.0,5.0,4.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1329,52.0,1,21,4.0,5.0,4.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19530,32.0,0,34,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21327,61.0,1,26,3.0,3.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11144,43.0,0,23,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12147,32.0,1,6,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [252]:
layout = go.Layout(
    title='Recomended policies for client',
    xaxis=dict(
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='',
        titlefont=dict(
            size=16,
            color='rgba(100, 100, 100, 1)',
        ),
        tickfont=dict(
            size=14,
            color='rgba(100, 100, 100, 1)',
        )
    ),
    legend=dict(
        x=0,
        y=1.2,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='stack',
    bargap=0.15,
    bargroupgap=0.1
)

In [255]:
"""Load data"""
data = pd.read_excel("zurich_insurance.xlsx")
data.head() # first 5 lines
data.tail() # show only the last 5 lines

"""Preprocessing"""
# Obtaining a Complete Dataset (Dropping Missing Values)
data = data.dropna()
data = data.reset_index(drop = True)

data[8:17]

Unnamed: 0,Zip code,Canton,Birth date,Age,Gender,Customer since,No. line of business,AKT,General,Life,...,AS,KA,MF,PL,PN,RS,SH,SL,REST,Customer Lifetime Value
0,8046,ZH,1962-04-15,55.0,Female,2005-09-02,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,C
1,9630,SG,1982-02-25,35.0,Female,2011-12-28,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C
2,6436,SZ,1979-08-07,38.0,Female,2015-01-02,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,C
3,8702,ZH,1979-03-03,38.0,Male,1997-04-03,2.0,2.0,2.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C
4,5616,AG,1952-02-19,65.0,Male,1977-09-02,4.0,4.0,3.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,A
5,8607,ZH,1942-12-07,74.0,Male,1960-01-02,3.0,3.0,3.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A
6,6204,LU,1981-02-02,36.0,Female,2010-02-27,2.0,2.0,2.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A
7,9200,SG,1959-04-26,58.0,Male,1981-03-14,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,B
8,8047,ZH,1939-06-19,78.0,Male,1988-11-02,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C
9,1052,VD,1973-10-13,43.0,Male,1998-03-02,3.0,4.0,2.0,2.0,...,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,A


In [256]:

import scipy.sparse as sp
from scipy.sparse.linalg import svds
import pandas as pd
import numpy as np

"""Load data"""
data = pd.read_excel("zurich_insurance.xlsx")
data.head() # first 5 lines
data.tail() # show only the last 5 lines

"""Preprocessing"""
# Obtaining a Complete Dataset (Dropping Missing Values)
data = data.dropna()
data = data.reset_index(drop = True)

pre = 8
service_matrix = X.as_matrix()[:,pre:17]

def recommend(adds, service_matrix=service_matrix, k=3):
    if adds:
        service_matrix = np.insert(service_matrix, service_matrix.shape[0], adds, axis=0)
    print(service_matrix.shape)
    #get SVD components from train matrix. Choose k.
    u, s, vt = svds(service_matrix, k = k)
    s_diag_matrix=np.diag(s)
    X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
    print('User-based CF MSE: ' + str(rmse(X_pred, service_matrix)))
    return X_pred, service_matrix

# def 



Method .as_matrix will be removed in a future version. Use .values instead.



In [243]:
service_matrix = np.insert(service_matrix, -1, new_client, axis=0)
service_matrix.shape
new_mat[-1]

array([1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.])

In [250]:

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go


new_client = [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
new_client = [1, 0.5, 1, 0.5, 0.5, 1, 0.5, 0.5, 0.5, 0.5, 0.5]


X_pred, new_mat = recommend(new_client)


ins = X.columns[pre:18]
idx = range(len(ins))

client = -1
recomended = X_pred[client,:]
base = new_mat[client,:]


recomended *= base>0
recomended *= base<1
base[base==.5] = 0
print(" Most recommended package %s " % ins[np.argmax(recomended)])

trace1 = go.Bar(
    x=ins,
    y=base,
    name='Actual products',
    marker=dict(
        color='rgba(204,204,204,1)'
    )
)

trace2 = go.Bar(
    x=ins,
    y=recomended,
    name='Recomended products',
    marker=dict(
        color='rgba(222,45,38,0.8)'
    )
)

data = [trace1, trace2]


fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='style-bar')
init_notebook_mode(connected=True)




(24291, 11)
User-based CF MSE: 0.32365856492644896
 Most recommended package SH 


In [149]:
service_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [58]:
ins

Index(['No. line of business',                  'AKT',              'General',
                       'Life',                   'AM',                   'AS',
                         'KA',                   'MF',                   'PL',
                         'PN',                   'RS',                   'SH',
                         'SL',                 'REST',                      0],
      dtype='object')

In [201]:
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel
from polara.datasets.movielens import get_movielens_data
# get data and convert it into appropriate format
ml_data = get_movielens_data(get_genres=False)
data_model = RecommenderData(ml_data, 'userid', 'movieid', 'rating')
# build PureSVD model and evaluate it
svd = SVDModel(data_model)
svd.build()
svd.evaluate()

Preparing data...
18 unique movieid's within 26 testset interactions were filtered. Reason: not in the training data.
1 unique movieid's within 1 holdout interactions were filtered. Reason: not in the training data.
1 of 1208 userid's were filtered out from holdout. Reason: not enough items.
1 userid's were filtered out from testset. Reason: inconsistent with holdout.
Done.
There are 807458 events in the training and 3621 events in the holdout.
PureSVD training time: 0.12940835900008096s


Hits(true_positive=1123, false_positive=10947, true_negative=None, false_negative=2498)

In [204]:
ml_data

Unnamed: 0,userid,movieid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4
