In [2]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from ast import literal_eval

In [4]:
# import dataset
bb_ratings = pd.read_csv("data/azure/BB_user_ratings.csv")

### User-Item Rating Triplet

In [6]:
data_triplet = bb_ratings[["ACCOUNT_NUM.hash","package","ratings"]]
data_triplet

Unnamed: 0,ACCOUNT_NUM.hash,package,ratings
0,000eee57a6c7a02c8aca8b410ea2e287,WEB STARTER,5.8125
1,000f325a72b9d24742237070939b57d1,WEB STARTER,2.6875
2,00103d1ae201c2ddb33b965f44f280b1,FTTH_WEB FAMILY PLUS,8.0000
3,0011c1b02e2403c74c75ae8b5582e018,ANY JOY,1.7500
4,0011c1b02e2403c74c75ae8b5582e018,WEB LITE,1.7500
...,...,...,...
20398,ffe2bae6a0c7c4b20dec84f505cec1b3,STUDENT PACKAGE 1,1.9375
20399,ffec2210afd448ef3cf76f20d6dc1814,ANY JOY,2.1875
20400,ffee4f0542d8f581b56b6b3dc7e2c178,STUDENT PACKAGE 1,0.5625
20401,fff45bfbbb99fe43c7ef3f2c0a7bb805,FTTH_WEB FAMILY XTRA,5.2500


## Interaction Matrix

In [7]:
# create the user item matrix using the ratings dataset - Hint: try using pivot function 
interactions_metrix = data_triplet.pivot_table(index="ACCOUNT_NUM.hash", columns="package", values="ratings")
interactions_metrix

package,ANY BEAT,ANY BLAZE,ANY FLIX,ANY JOY,ANY SPIKE,ANY STORM,ANY TIDE,BROADBAND EXPERIENCE,FTTH_ANY BLAZE,FTTH_ANY DELIGHT,...,WEB CHAMP,WEB FAMILY ACTIVE,WEB FAMILY PLUS,WEB FAMILY XTRA,WEB LITE,WEB MASTER,WEB PAL,WEB PRO,WEB STARTER,XCITE
ACCOUNT_NUM.hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000eee57a6c7a02c8aca8b410ea2e287,,,,,,,,,,,...,,,,,,,,,5.8125,
000f325a72b9d24742237070939b57d1,,,,,,,,,,,...,,,,,,,,,2.6875,
00103d1ae201c2ddb33b965f44f280b1,,,,,,,,,,,...,,,,,,,,,,
0011c1b02e2403c74c75ae8b5582e018,,,,1.7500,,,,,,,...,,,,,1.75,,,,,
0012377a9a647a0119ba84158127ae8f,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffe2bae6a0c7c4b20dec84f505cec1b3,,,,,,,,,,,...,,,,,,,,,,
ffec2210afd448ef3cf76f20d6dc1814,,,,2.1875,,,,,,,...,,,,,,,,,,
ffee4f0542d8f581b56b6b3dc7e2c178,,,,,,,,,,,...,,,,,,,,,,
fff45bfbbb99fe43c7ef3f2c0a7bb805,,,,,,,,,,,...,,,,,,,,,,


In [8]:
#interactions_metrix.drop(["4G"],axis=1, inplace=True)

In [9]:
#interactions_metrix.reset_index(inplace=True)
interactions_metrix = interactions_metrix.fillna(0)

## Finding similar users

In [10]:
users = data_triplet["ACCOUNT_NUM.hash"].unique()
active_user = "0011c1b02e2403c74c75ae8b5582e018"

In [11]:
# using cosine_similarity from sklearn
def get_similar_users(user_id, interactions_matrix):
    # compute similarity of each user to the provided user
    similarity = []
    for user in users:
        sim = cosine_similarity([interactions_metrix.loc[user_id]], [interactions_metrix.loc[user]])
        similarity.append((user, sim))
    # sort by similarity
    similarity.sort(key=lambda x: x[1], reverse=True)
    # create list of just the user ids
    most_similar_users = [tup[0] for tup in similarity]
    # create list of similarity score
    similarity_score = [tup[1] for tup in similarity]
    # remove the user's own id
    most_similar_users.remove(user_id)
    # remove the user's own similarity score
    similarity_score.remove(similarity_score[0])  
    return most_similar_users, similarity_score


# Business Rule -> Single  Play canot be recommended to a Double Play / Triple Play user
def play_rule(already_interacted):
    for s in already_interacted.copy():
        if "DOUBLE_PLAY" in s or "TRIPLE_PLAY" in s:
            already_interacted.add("SINGLE_PLAY")
        if "TRIPLE_PLAY" in s:
            already_interacted.add("DOUBLE_PLAY_BV")
            already_interacted.add("DOUBLE_PLAY_PV")
        # ADD Upgrade/ Downgrade Rule
    return already_interacted


def recommendations(user_id, num_of_packages, user_item_interactions):
    # find the most similar users to the user_id for which we want to recommend packages
    most_similar_users = get_similar_users(user_id, user_item_interactions)[0]
    # find out those packages which this user has already interacted with
    packages = set(list(interactions_metrix.columns[np.where(interactions_metrix.loc[user_id]>0)]))
    # create an empty list to store the recommended packages
    recommendations = []
    # copy those packages which are already interacted by user_id
    already_interacted = packages.copy()
    # loop through each similar user from the list of most_similar_users
    for similar_user in most_similar_users:
        # implement the below code till the length of recommended packages does not become equal to num_of_packages
        if len(recommendations) < num_of_packages:
            # store all the packages interacted by each similar user to user_id
            similar_user_packages = set(list(interactions_metrix.columns[np.where(interactions_metrix.loc[similar_user]>0)]))
            # applying business rule
            already_interacted= play_rule(already_interacted)
            # add those packages in the recommended list which are present in similar_user_packages but not present in already_interacted
            recommendations.extend(list(similar_user_packages.difference(already_interacted)))
            # now add all those packages into already_interacted which we already added in recommendations
            already_interacted = already_interacted.union(similar_user_packages)
        else:
            break
    return recommendations[:num_of_packages]

### Predictions

In [12]:
# check the implemented function above 
#[0] is here to get first returned element of the function (most_similar_users)
#[:10] is to get the first 10 (the 10 most) similar users
similar_users = get_similar_users(active_user, interactions_metrix)[:10]

In [13]:
similar_users[0][:10]

['041b03bcafbb0e6540d55b5efb8b2b2a',
 '05669f454388b4f87124c5cc01d18ab7',
 '0576948a36cd84d7638cbc73428670fd',
 '05f817d771d17d096f2e601524c8d19f',
 '060e1635890f03de99da85744f315d33',
 '06a2230ef7955dd179b977774ae49d54',
 '08a7b94f3eb43dce5e0af7e0aaa4863f',
 '08e89c6e217c90b8064b454ad512e4eb',
 '090ad6f455d84e667cfa4d2f98f27afa',
 '0b65ee412a24d44080caa4c3c497549d']

In [14]:
# this gives similarity scores (how similar a user to a given user)
similar_users[1][:10]

[array([[1.]]),
 array([[1.]]),
 array([[1.]]),
 array([[1.]]),
 array([[1.]]),
 array([[1.]]),
 array([[1.]]),
 array([[1.]]),
 array([[1.]]),
 array([[1.]])]

In [15]:
data_triplet[(data_triplet['ACCOUNT_NUM.hash'] == active_user)]

Unnamed: 0,ACCOUNT_NUM.hash,package,ratings
3,0011c1b02e2403c74c75ae8b5582e018,ANY JOY,1.75
4,0011c1b02e2403c74c75ae8b5582e018,WEB LITE,1.75


In [16]:
# check recommendations
recommendations(active_user,1, interactions_metrix)

['ANY BEAT']

In [18]:
# Apply for all users in DF
final_df = data_triplet.groupby("ACCOUNT_NUM.hash").agg({"package":list})
final_df.reset_index(inplace=True)

# only getting results for first 30, due to performance issues. Need to run for whole dataset on cloud
final_df = final_df.iloc[:30]

In [None]:
final_df["recommendation"] = final_df["ACCOUNT_NUM.hash"].apply(lambda x:recommendations(x,1,interactions_metrix))

In [None]:
# save scored dataset
final_df.to_csv("data/cross-selling-scored/cosine-sim/bb_scored_sample.csv")