In [57]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [77]:
user_profile = pd.read_csv("data/User_Profile_Null_Handled.csv")
user_packages = pd.read_csv("data/PEO_TV_Usage_CF.csv")
product_profile = pd.read_csv("data/Product_Profile_Finalised_CSV.csv")
account_no_map = pd.read_csv("data/account_no_map_all.csv")

In [78]:
try:
    user_profile.drop(["Unnamed: 0"],axis=1,inplace=True)
    user_packages.drop(["Unnamed: 0"],axis=1,inplace=True)
    account_no_map.drop(["Unnamed: 0"],axis=1,inplace=True)
    product_profile.drop(["Unnamed: 0"],axis=1,inplace=True)
except:
    pass

In [79]:
user_profile = user_profile[(user_profile["Peo_TV_Package"] != "NO_INFO")]
user_profile.reset_index(drop=True, inplace=True)

In [80]:
rat_df = user_packages.merge(account_no_map, how="left", on="event_source.hash")
rat_df.rename(columns={"Peo_TV_Package":"package","rating":"ratings"}, inplace=True)
rat_df = rat_df.merge(user_profile, how="left", on="ACCOUNT_NUM.hash")
rat_df.dropna(subset=["ACCOUNT_NUM.hash","package"], inplace=True)

## User - package ratings

PCA Ratings Calculation

In [84]:
data_dim=rat_df.iloc[:,[28,77,80]]
data_dim.fillna(0, inplace=True)

### Run PCA on the data and reduce the dimensions in pca_num_components dimensions
pca = PCA(n_components=1)
pca.fit(data_dim)
reduced_data = pca.fit_transform(data_dim)
results_df = pd.DataFrame(reduced_data,columns=['pca_ratings'])

scaler = MinMaxScaler()
results_df['pca_ratings'] = scaler.fit_transform(results_df['pca_ratings'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Service Usage based Ratings

In [85]:
rating_df=rat_df.iloc[:,[5,1,2]]
rating_df=pd.concat([rating_df,results_df],axis=1)
rating_df.rename(columns={'usage':'ratings'},inplace=True)
rating_df.dropna(subset=["ACCOUNT_NUM.hash","package","ratings"], inplace = True)
rating_df.drop_duplicates(subset=["ACCOUNT_NUM.hash","package","ratings"], inplace = True)

In [86]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179895 entries, 0 to 208857
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ACCOUNT_NUM.hash  179894 non-null  object 
 1   ratings           179894 non-null  float64
 2   package           179894 non-null  object 
 3   pca_ratings       159415 non-null  float64
dtypes: float64(2), object(2)
memory usage: 6.9+ MB


In [87]:
rating_df.to_csv("data/up-selling/peotv/peoTV_user_ratings.csv")

## User content

In [40]:
user_df=user_profile.iloc[:,[0,2,4,6,7,9,10,12,13,15,16,17,19,20,22,28,29,30,32,33,41,44,48,53,57,58,61,69,71,72,74,76,78,80,82,84]]

### Handeling Null Values

In [41]:
# Only for derived cols
# cat_cols = ['Tamil_Customer', 'Having_Insurance', "Peo_Extra_Channels_Purchased_Categorical"]
cat_cols = user_df.select_dtypes(include='object').columns
user_df[cat_cols] = user_df[cat_cols].fillna("NO_INFO")

#voice_udf[scaled_cols] = voice_udf[scaled_cols].fillna(0)

In [42]:
zero_cols = ["BB_Scaled","Peo_Extra_Channels_Purchased_Scaled","IDD_INCOMING_Scaled","IDD_OUTGOING_Scaled"]
user_df[zero_cols] = user_df[zero_cols].fillna(0)

In [43]:
mean_cols = ["OFFNET_INCOMING_Scaled","OFFNET_OUTGOING_Scaled","ONNET_INCOMING_Scaled","ONNET_OUTGOING_Scaled"]
for col in mean_cols:
    user_df[col] = user_df[col].fillna(user_df[col].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df[col] = user_df[col].fillna(user_df[col].mean())


In [44]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1142 entries, 0 to 1141
Data columns (total 36 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ACCOUNT_NUM.hash                     1142 non-null   object 
 1   OFFNET_INCOMING_Scaled               1142 non-null   float64
 2   OFFNET_INCOMING_Outlier              1142 non-null   object 
 3   OFFNET_OUTGOING_Scaled               1142 non-null   float64
 4   OFFNET_OUTGOING_Outlier              1142 non-null   object 
 5   ONNET_INCOMING_Scaled                1142 non-null   float64
 6   ONNET_INCOMING_Outlier               1142 non-null   object 
 7   ONNET_OUTGOING_Scaled                1142 non-null   float64
 8   ONNET_OUTGOING_Outlier               1142 non-null   object 
 9   IDD_INCOMING_Scaled                  1142 non-null   float64
 10  IDD_INCOMING_Outlier                 1142 non-null   object 
 11  IDD_incoming_country          

In [45]:
user_df.to_csv("data/azure/peoTV_user_content.csv")

## Product (package) content

In [47]:
product_df=product_profile.iloc[:,[0,1,2,3,5,6,14,15,23,24,25,26,27,28,29,30,31,32,33,34,35,37,38,39]]
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Product_ID                  263 non-null    object 
 1   Base_Type                   241 non-null    object 
 2   Pricing_Type                134 non-null    object 
 3   Package_Type                134 non-null    object 
 4   Title                       134 non-null    object 
 5   Description                 47 non-null     object 
 6   Price                       133 non-null    object 
 7   Monthly_Rental              141 non-null    float64
 8   PEOTV_No_of_Channels        8 non-null      float64
 9   PEOTV_Foreign_Channels      8 non-null      float64
 10  PEOTV_Local_Channels        8 non-null      float64
 11  PEOTV_Movie_Channels        8 non-null      float64
 12  PEOTV_Kids_Channels         8 non-null      float64
 13  PEOTV_Religious_Channels    8 non-n

In [48]:
temp_package_df=rating_df["package"]

In [49]:
peo_tv_products_df = product_df.merge(temp_package_df.drop_duplicates(), left_on=['Product_ID'], right_on=['package'] ,
                   how='left', indicator=True)

In [50]:
product_df = peo_tv_products_df[(peo_tv_products_df["_merge"]=="both")]

In [51]:
product_df.drop(["package","_merge"],axis=1,inplace=True)
product_df.rename(columns={'Product_ID':'package'},inplace=True)
product_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [52]:
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   package                     19 non-null     object 
 1   Base_Type                   19 non-null     object 
 2   Pricing_Type                19 non-null     object 
 3   Package_Type                19 non-null     object 
 4   Title                       19 non-null     object 
 5   Description                 19 non-null     object 
 6   Price                       19 non-null     object 
 7   Monthly_Rental              8 non-null      float64
 8   PEOTV_No_of_Channels        8 non-null      float64
 9   PEOTV_Foreign_Channels      8 non-null      float64
 10  PEOTV_Local_Channels        8 non-null      float64
 11  PEOTV_Movie_Channels        8 non-null      float64
 12  PEOTV_Kids_Channels         8 non-null      float64
 13  PEOTV_Religious_Channels    8 non-nul

In [53]:
cols =["PEOTV_Foreign_Channels","PEOTV_Local_Channels","PEOTV_Movie_Channels","PEOTV_Kids_Channels","PEOTV_Religious_Channels","PEOTV_Tamil_Channels","PEOTV_Educational_Channels","PEOTV_Music_Channels","PEOTV_Sports_Channels","PEOTV_News_Channels"]

In [54]:
product_df[cols] = product_df[cols].fillna(0)

In [55]:
product_df[["PEOTV_No_of_Channels","Monthly_Rental"]] = product_df[["PEOTV_No_of_Channels","Monthly_Rental"]].fillna(0)

In [56]:
scaler = MinMaxScaler()
for col in cols:
    product_df[col] = scaler.fit_transform(product_df[col].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_df[col] = scaler.fit_transform(product_df[col].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_df[col] = scaler.fit_transform(product_df[col].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_df[col] = scaler.fit_transform(product_df[c

In [60]:
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   package                     19 non-null     object 
 1   Base_Type                   19 non-null     object 
 2   Pricing_Type                19 non-null     object 
 3   Package_Type                19 non-null     object 
 4   Title                       19 non-null     object 
 5   Description                 19 non-null     object 
 6   Price                       19 non-null     object 
 7   Monthly_Rental              19 non-null     float64
 8   PEOTV_No_of_Channels        19 non-null     float64
 9   PEOTV_Foreign_Channels      19 non-null     float64
 10  PEOTV_Local_Channels        19 non-null     float64
 11  PEOTV_Movie_Channels        19 non-null     float64
 12  PEOTV_Kids_Channels         19 non-null     float64
 13  PEOTV_Religious_Channels    19 non-nu

In [58]:
product_df.to_csv("data/azure/peoTV_package_content.csv")