In [111]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from ast import literal_eval
from surprise import *
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from collections import defaultdict
from __future__ import division

In [78]:
user_profile = pd.read_csv("data/user_profile/prepared/User_Profile_Null_Handled.csv")

In [79]:
user_profile["Sub_Update_Status"].value_counts()

NO_INFO                30250
Promotion Upgrade       7369
Promotion Downgrade     4975
Name: Sub_Update_Status, dtype: int64

In [80]:
data=user_profile.iloc[:,[1,3,11,14,18,23,25,34,57,61,62]]

In [81]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42594 entries, 0 to 42593
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ACCOUNT_NUM.hash       42594 non-null  object 
 1   Voice_INCOMING_Scaled  42594 non-null  float64
 2   Voice_OUTGOING_Scaled  42594 non-null  float64
 3   IDD_INCOMING_Scaled    42594 non-null  float64
 4   IDD_OUTGOING_Scaled    42594 non-null  float64
 5   PEO_TV_Scaled          42594 non-null  float64
 6   BB_Scaled              42594 non-null  float64
 7   Sub_Type               42594 non-null  object 
 8   Product_Type           42594 non-null  object 
 9   Sub_Update_Status      42594 non-null  object 
 10  Sub_Update             42594 non-null  object 
dtypes: float64(6), object(5)
memory usage: 3.6+ MB


In [82]:
data_dim=data.iloc[:,1:7]

In [83]:
pc=PCA(n_components=6) 
pc.fit(data_dim)

PCA(n_components=6)

In [84]:
#How mucb variance, captured together
pc.explained_variance_ratio_.cumsum()

array([0.53436097, 0.85728507, 0.94419659, 0.97438368, 0.99002141,
       1.        ])

In [85]:
### Run PCA on the data and reduce the dimensions in pca_num_components dimensions
pca = PCA(n_components=1)
pca.fit(data_dim)
reduced_data = pca.fit_transform(data_dim)
results_df = pd.DataFrame(reduced_data,columns=['usage'])

In [86]:
# applying min-max-scaler to reduced features
scaler = MinMaxScaler()
results_df[['usage']] = scaler.fit_transform(results_df[['usage']])

In [87]:
data=pd.concat([data,results_df],axis=1)

In [88]:
data_triplet = data[["ACCOUNT_NUM.hash","Sub_Type","usage"]]
data_triplet

Unnamed: 0,ACCOUNT_NUM.hash,Sub_Type,usage
0,9efd917f18bb5c966953b4227ddbaf43,TRIPLE_PLAY,0.304039
1,2240d11b0d9ee8f6d9e8972c6190aa42,SINGLE_PLAY,0.006179
2,cff00495fd556a417e2838e27465d749,DOUBLE_PLAY_BV,0.309183
3,d00dacd8fc98c1726d6a4e9f28f8e579,TRIPLE_PLAY,0.154406
4,a3ee54787617510a616dfcb52c28941b,TRIPLE_PLAY,0.158567
...,...,...,...
42589,7052930957f02ad914dbd24fdcd04fed,DOUBLE_PLAY_BV,0.398034
42590,c217fb1b44642e79e56af9111e9c00b2,TRIPLE_PLAY,0.799978
42591,5e94646aa83bb9d7fab98620405517cb,DOUBLE_PLAY_BV,0.011400
42592,6801a82b67b7fb942242b7aca5ba022c,DOUBLE_PLAY_BV,0.525596


## Interaction Matrix

In [89]:
# create the user item matrix using the ratings dataset - Hint: try using pivot function 
interactions_metrix = data_triplet.pivot_table(index="ACCOUNT_NUM.hash", columns="Sub_Type", values="usage")
interactions_metrix

Sub_Type,4G,DOUBLE_PLAY_BV,DOUBLE_PLAY_PV,SINGLE_PLAY,TRIPLE_PLAY
ACCOUNT_NUM.hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0003102d10a11c8f2f0f241f00a36b2d,,,,,0.377181
0003df7598926f578f53ecd9e424f3c9,,0.274631,,,
00092b1fd47eb05d000a3553ae072056,,,,,0.364527
00094326efae1d73371d118204e12e46,,,0.110893,,
000ae59b7eb91e778f22e3f07495f4f9,,0.538300,,,
...,...,...,...,...,...
fff9148b195a844454cf5b07c2fccbcb,,,0.140922,,
fffaeeddaabbd2868c183d8884b2a98d,,,,,0.432613
fffd0b8541e9f62496092d2ea8609135,,,,,0.675962
ffff7fc0c266ac5b34f91de1ba4f3039,,,,,0.299851


In [90]:
#interactions_metrix.reset_index(inplace=True)
interactions_metrix = interactions_metrix.fillna(0)

In [91]:
X = interactions_metrix.values.T

In [92]:
X.shape

(5, 42594)

In [93]:
reader = Reader(rating_scale=(0, 1))
data_model = Dataset.load_from_df(data_triplet, reader)

In [94]:
trainset, testset = train_test_split(data_model, test_size=.20)

In [95]:
def get_Iu(uid):
    """Return the number of items rated by given user
    
    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user.
    """
    
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:  # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """Return the number of users that have rated given item
    
    Args:
        iid: The raw id of the item.
    Returns:
        The number of users that have rated the item.
    """
    
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:  # item was not part of the trainset
        return 0

def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est, true_r))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:5]
    return top_n

## SVD Model

In [96]:
# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results
svd_validate = cross_validate(algo, data_model, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2406  0.2390  0.2405  0.2393  0.2391  0.2397  0.0007  
MAE (testset)     0.1996  0.1978  0.2002  0.1988  0.1968  0.1986  0.0012  
Fit time          0.98    1.00    1.00    0.99    0.99    0.99    0.01    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


In [97]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

In [98]:
top_n = get_top_n(predictions, n=10)
#print(top_n)
users_est = defaultdict(list)
users_true=defaultdict(list)
rec_for_user=defaultdict(list)
for uid, user_ratings in top_n.items():
    users_est[uid].append([est for (_, est,_) in user_ratings])
    users_true[uid].append([true_r for (_,_,true_r) in user_ratings])
    rec_for_user[uid].append([iid for (iid,_,_) in user_ratings])

In [99]:
# Let's build a pandas dataframe with all the predictions
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])    
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [100]:
# 10 Best predictions
df.sort_values(by='err')[:10]

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
4945,605cdee6e5ede97c93b137929f844ea8,TRIPLE_PLAY,0.436963,0.436889,{'was_impossible': False},0,12703,7.4e-05
1028,7544f2de49b1aeb814a06797d36182e4,TRIPLE_PLAY,0.437042,0.436889,{'was_impossible': False},0,12703,0.000153
3049,7bea3d218fa20dd0d14f15c2fe086124,TRIPLE_PLAY,0.437093,0.436889,{'was_impossible': False},0,12703,0.000204
598,6c579a7a733181f5c8a6db5e74e3c56f,TRIPLE_PLAY,0.436669,0.436889,{'was_impossible': False},0,12703,0.00022
2820,9d965b0f559940a90517767ed18f6815,TRIPLE_PLAY,0.436535,0.436889,{'was_impossible': False},0,12703,0.000354
82,cd4b6e67f393498c64247c63e006fa20,TRIPLE_PLAY,0.437245,0.436889,{'was_impossible': False},0,12703,0.000356
4839,907e7819d60d3b17dfa631ca73a1d223,DOUBLE_PLAY_BV,0.25889,0.258345,{'was_impossible': False},0,12637,0.000544
5587,a7d127d959994ae0871903d4c615d5e0,TRIPLE_PLAY,0.437441,0.436889,{'was_impossible': False},0,12703,0.000551
4825,c8f73fd6999092981cb37e1e39e9ef2e,TRIPLE_PLAY,0.436267,0.436889,{'was_impossible': False},0,12703,0.000623
5516,be40a6f53c6bc7f4740ee332b254eb8c,TRIPLE_PLAY,0.437574,0.436889,{'was_impossible': False},0,12703,0.000685


In [101]:
rmse = accuracy.rmse(predictions)
print("RMSE -->",rmse)
print("Accuracy -->",1-rmse)

RMSE: 0.2400
RMSE --> 0.23999348266701814
Accuracy --> 0.7600065173329819


In [102]:
data_triplet = data_triplet.merge(df[["uid","iid","err"]], left_on="ACCOUNT_NUM.hash", right_on ="uid", how="left")
data_triplet.dropna(subset=["uid"],inplace=True)
data_triplet.drop("uid", axis=1, inplace = True)
data_triplet.rename(columns={"iid":"SVD_recommendation","Sub_Type":"Actual_Subscription","err":"SVD_error"}, inplace = True)

## SlopeOne

In [104]:
# We'll use the SlopeOne algorithm.
algo = SlopeOne()

# Run 5-fold cross-validation and print results
so_validate =cross_validate(algo, data_model, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2598  0.2579  0.2606  0.2615  0.2584  0.2597  0.0013  
MAE (testset)     0.2199  0.2192  0.2211  0.2221  0.2194  0.2203  0.0011  
Fit time          0.12    0.11    0.12    0.11    0.11    0.12    0.00    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


In [105]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  algo.fit(trainset)


In [106]:
# Let's build a pandas dataframe with all the predictions
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])    
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [107]:
df.sort_values(by='err')[:10]

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
7313,baaf959f699bd905ed50c42e1da11092,DOUBLE_PLAY_BV,0.322392,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12637,9e-06
7678,15fb49a322620763d54f2cf938385b66,TRIPLE_PLAY,0.3223,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.000101
4373,5895122c6c5eb9808fdc8644fd623af5,TRIPLE_PLAY,0.322577,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.000176
836,45c5d101b6d907ae7d0ea1550e88a898,TRIPLE_PLAY,0.322183,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.000218
6082,c098ffe802e109446d25f806d6789f25,TRIPLE_PLAY,0.322643,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.000242
7827,71c5df8e36e6ca5e5b3a93ee3d1ccd0b,DOUBLE_PLAY_BV,0.322069,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12637,0.000332
8045,0332a28bd2556940b57581ef19d906bb,TRIPLE_PLAY,0.322758,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.000357
3482,dd67f6c2ee0d3e9b7b5dfe6d972ae95f,TRIPLE_PLAY,0.321935,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.000466
3151,a9d38ab56e34f14708ab8f470b0ca31f,TRIPLE_PLAY,0.32287,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.000469
4300,3057ae1f71c5c9399dcc9d63af734490,TRIPLE_PLAY,0.321901,0.322401,"{'was_impossible': True, 'reason': 'User and/o...",0,12703,0.0005


In [108]:
rmse = accuracy.rmse(predictions)
print("RMSE -->",rmse)
print("Accuracy -->",1-rmse)

RMSE: 0.2600
RMSE --> 0.2600306200842499
Accuracy --> 0.7399693799157501


In [109]:
data_triplet = data_triplet.merge(df[["uid","iid","err"]], left_on="ACCOUNT_NUM.hash", right_on ="uid", how="left")
data_triplet.dropna(subset=["uid"],inplace=True)
data_triplet.drop("uid", axis=1, inplace = True)
data_triplet.rename(columns={"iid":"SlopeOne_recommendation","err":"SlopeOne_error"}, inplace = True)

## Matrix factorization (NMF)

In [112]:
# We'll use the SlopeOne algorithm.
algo = NMF()

# Run 5-fold cross-validation and print results
nmf_validate =cross_validate(algo, data_model, measures=['RMSE', 'MAE'], cv=5, verbose=True)

ZeroDivisionError: float division

In [None]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

In [None]:
# Let's build a pandas dataframe with all the predictions
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])    
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [None]:
df.sort_values(by='err')[:10]

In [None]:
rmse = accuracy.rmse(predictions)
print("RMSE -->",rmse)
print("Accuracy -->",1-rmse)

In [None]:
data_triplet = data_triplet.merge(df[["uid","iid","err"]], left_on="ACCOUNT_NUM.hash", right_on ="uid", how="left")
data_triplet.dropna(subset=["uid"],inplace=True)
data_triplet.drop("uid", axis=1, inplace = True)
data_triplet.rename(columns={"iid":"NMF_recommendation","err":"NMF_error"}, inplace = True)

In [114]:
data_triplet

Unnamed: 0,ACCOUNT_NUM.hash,Actual_Subscription,usage,SVD_recommendation,SVD_error,SlopeOne_recommendation,SlopeOne_error
0,9efd917f18bb5c966953b4227ddbaf43,TRIPLE_PLAY,0.304039,TRIPLE_PLAY,0.132850,TRIPLE_PLAY,0.018362
1,2240d11b0d9ee8f6d9e8972c6190aa42,SINGLE_PLAY,0.006179,SINGLE_PLAY,0.157817,SINGLE_PLAY,0.316222
2,4ec311c23978ae3c219aeaff96c5c4ff,DOUBLE_PLAY_PV,0.124137,DOUBLE_PLAY_PV,0.245129,DOUBLE_PLAY_PV,0.198264
3,be817326aeeab3a40aae1e5d04ef7b9c,DOUBLE_PLAY_BV,0.346747,DOUBLE_PLAY_BV,0.088402,DOUBLE_PLAY_BV,0.024346
4,d46578e724259a3706db0c910a934581,DOUBLE_PLAY_BV,0.148047,DOUBLE_PLAY_BV,0.110299,DOUBLE_PLAY_BV,0.174354
...,...,...,...,...,...,...,...
8514,00478c7a0ea164b3e5bc89d86e5fa5bb,DOUBLE_PLAY_BV,0.250729,DOUBLE_PLAY_BV,0.007617,DOUBLE_PLAY_BV,0.071672
8515,9a040b37d7c557e0f4f7fea33382af0b,DOUBLE_PLAY_BV,0.336425,DOUBLE_PLAY_BV,0.078079,DOUBLE_PLAY_BV,0.014024
8516,bf06f38320e11843132d13c87566d5d4,TRIPLE_PLAY,0.267606,TRIPLE_PLAY,0.169283,TRIPLE_PLAY,0.054795
8517,fefbb242be644eefcc2c5f2053d017f2,DOUBLE_PLAY_PV,0.111174,DOUBLE_PLAY_PV,0.258092,DOUBLE_PLAY_PV,0.211227


In [113]:
data_triplet.to_csv("data/up-selling-scored/CF_scored_and_eval.csv")