In [296]:
import pandas as pd
import numpy as np
from datetime import datetime
import time
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [3]:
df = pd.read_csv('./Data/VideoStarted.csv')

In [4]:
def make_timestamp(row):
    date_as_string = str(row['Date'])
    hour = str(row['Minute_Of_Day'] // (60))
    minute = str(row['Minute_Of_Day'] % 60)
    second = str(row['Second'])
    
    if len(hour) == 1:
        hour = '0' + hour
        
    if len(minute) == 1:
        minute = '0' + minute
    
    if len(second) == 1:
        second = '0' + second
    
    date_string = date_as_string[:4] + '-' + date_as_string[4:6] + '-' + date_as_string[6:] + ' ' + hour + ':' + minute + ':' + second
    datetime_object = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S')
    return datetime_object

In [5]:
df['TimeStamp'] = df.apply(make_timestamp, axis=1)

In [6]:
def change_timestamp_to_epoch(row):
    pattern = '%Y-%m-%d %H:%M:%S'
    timestamp = str(row['TimeStamp'])
    epoch = int(time.mktime(time.strptime(timestamp, pattern)))
    return epoch

df['epoch'] = df.apply(change_timestamp_to_epoch,axis=1)

In [12]:
scaler = MinMaxScaler()
df['epoch_scaled'] = scaler.fit_transform(df[['epoch']])
df.head()

  return self.partial_fit(X, y)


Unnamed: 0,UserId,Date,Minute_Of_Day,Second,State,Genre,Category,ProgramType,Country,Device,OS,VideoId,TimeStamp,epoch,epoch_scaled
0,0280dfdd112732a3ac12b12dc770b7af,20170106,0,8,35,Romance,vod,Movies,1,1,2,36a27b379622f342ec87f9aafadb8f94,2017-01-06 00:00:08,1483641008,0.0
1,435d41ae019cb8db785483793859c9a8,20170106,0,28,35,Anime,vod,TV Shows,1,1,1,a4d9b88c7ed63d723c70b358a857719c,2017-01-06 00:00:28,1483641028,1e-06
2,1faf0ce0b98e02e1568702f516f01a78,20170106,0,36,35,Drama,vod,TV Shows,1,1,1,806660cb47633263a24bbc53238a9a53,2017-01-06 00:00:36,1483641036,2e-06
3,3ec691b9d2b5d53ef965fe59b1900b30,20170106,0,48,35,Drama,vod,TV Shows,1,2,2,385114825a85d6878e7a0978f9ba5546,2017-01-06 00:00:48,1483641048,3e-06
4,ff230d487a7139b65f33b54a4cbd2d9e,20170106,0,56,35,Comedy,vod,Movies,1,1,1,368a480ec0ae105aee8320dd93483e39,2017-01-06 00:00:56,1483641056,4e-06


In [13]:
df.to_csv('./Data/VideoStartedEpoch.csv')

In [14]:
df['epoch'].max()

1497292196

In [15]:
df['epoch_scaled'].max()

1.0

# Function to get data before epoch

In [176]:
def get_data_before_epoch(df, epoch):
    return df[df['epoch'] < epoch]

def get_data_after_epoch(df, epoch):
    return df[df['epoch'] >= epoch]

In [177]:
get_data_before_epoch(df, df['epoch'].quantile(.2)).shape

(251514, 16)

In [268]:
df = pd.read_csv('./Data/VideoStartedEpoch.csv')
df = df.sort_values('epoch')

EPOCH_SELECTED = df['epoch'].quantile(.7)

df_test = get_data_after_epoch(df, EPOCH_SELECTED)
df = get_data_before_epoch(df, EPOCH_SELECTED)
print(df.shape, df_test.shape)

(1257567, 16) (538958, 16)


# Function to get frequency value

In [269]:
def get_frequency_value(df):
    df_grouped_by_user_id = df.groupby('UserId').count()
    df_to_return = df_grouped_by_user_id[['Date']]
    df_to_return.columns=['frequency_value']
    return df_to_return

In [270]:
df_freq_value = get_frequency_value(df)

# Function to get frequency score

In [271]:
def get_frequency_score(row):
    frequency_value = row['frequency_value']
    frequency_score = 0
    
    if frequency_value > 22:
        frequency_score = 5
    elif frequency_value > 10 and frequency_value <= 22:
        frequency_score = 4
    elif frequency_value > 4 and frequency_value <= 10:
        frequency_score = 3
    elif frequency_value > 2 and frequency_value <= 4:
        frequency_score = 2
    elif frequency_value == 1 or frequency_value == 2:
        frequency_score = 1
        
    row['frequency_score'] = frequency_score
    return row

In [272]:
df_freq_score = df_freq_value.apply(get_frequency_score, axis=1)

In [273]:
df_freq_score.head()

Unnamed: 0_level_0,frequency_value,frequency_score
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1
0000146e97c32d369268e5ba5f4b907c,1,1
00004010d92c2f13b824237bc323d58b,2,1
0000e560394fce891dcf80218793d181,3,2
00015c69cccd32bcd323227b7140d8a8,10,3
00016f6ad820aa24940343f837799519,23,5


In [274]:
df_freq_score.to_csv('./RecencyFrequency.csv')

# Function to get recency value

In [275]:
def get_recency_value(df):
    df = df[['UserId', 'VideoId', 'epoch']]
    sorted_df = df.sort_values(['epoch'])
    df_grouped_by_user_id = sorted_df.groupby(['UserId']).tail(1)
    df_grouped_by_user_id = df_grouped_by_user_id.set_index('UserId')
    df_grouped_by_user_id = df_grouped_by_user_id[['epoch']]
    df_grouped_by_user_id.columns = ['recency_value']
    return df_grouped_by_user_id

In [276]:
df_recency_value = get_recency_value(df)

In [277]:
df_without_recency_score = pd.concat([df_freq_score, df_recency_value], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [278]:
df_without_recency_score.to_csv('./RecencyFrequency.csv')

In [279]:
dict(df_without_recency_score['recency_value'].describe())

{'count': 132879.0,
 'mean': 1491101960.6835995,
 'std': 3589362.4131665574,
 'min': 1483641008.0,
 '25%': 1488171622.0,
 '50%': 1492183956.0,
 '75%': 1494154318.0,
 'max': 1495441032.0}

# Function to get recency score

In [280]:
def get_recency_score(df):
    scaler = MinMaxScaler((1, 5))
    df['recency_score'] = scaler.fit_transform(df[['recency_value']])
#     df['recency_score'] = np.round(df['recency_score'])
    return df

In [281]:
df_recency_score = get_recency_score(df_recency_value)

  return self.partial_fit(X, y)


In [282]:
df_recency_frequency = pd.concat([df_freq_score, df_recency_score], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [351]:
df_recency_frequency.describe()

Unnamed: 0,frequency_value,frequency_score,recency_value,recency_score
count,132879.0,132879.0,132879.0,132879.0
mean,9.464001,2.318726,1491102000.0,3.529131
std,21.999365,1.361162,3589362.0,1.216731
min,1.0,1.0,1483641000.0,1.0
25%,2.0,1.0,1488172000.0,2.535798
50%,3.0,2.0,1492184000.0,3.895909
75%,9.0,3.0,1494154000.0,4.563827
max,1123.0,5.0,1495441000.0,5.0


In [284]:
df_recency_frequency.to_csv('./RecencyFrequency.csv')

In [285]:
df_segments = df_recency_frequency.copy()

# Segments
|Segment|Recency Score Range|Frequency Score Range|
|---|---|---|
|Champions|4-5|4-5|
|Loyal Customers|3-4|4-5|
|Potential Loyalists|4-5|2-3|
|Promising|3-4|1-2|
|Can't Lose Them|1-2|4-5|
|At Risk|1-2|3-4|
|Hibernating|1-2|1-2|
|New Customers|4-5|1-2|

In [286]:
def get_segments(row):
    recency_score = row['recency_score']
    frequency_score = row['frequency_score']
    
    segment = np.NaN
    
    if recency_score > 4 and recency_score <=5 and frequency_score >= 4 and frequency_score <=5:
        segment = "Champions"
    elif recency_score > 3 and recency_score <=4 and frequency_score >= 4 and frequency_score <=5:
        segment = "Loyal Customers"
    elif recency_score > 4 and recency_score <=5 and frequency_score >= 2 and frequency_score <=3:
        segment = "Potential Loyalists"
    elif recency_score > 3 and recency_score <=4 and frequency_score >= 1 and frequency_score <=2:
        segment = "Promising"
    elif recency_score > 1 and recency_score <=2 and frequency_score >= 4 and frequency_score <=5:
        segment = "Can't Lose Them"
    elif recency_score > 1 and recency_score <=2 and frequency_score >= 3 and frequency_score <=4:
        segment = "At Risk"
    elif recency_score > 1 and recency_score <=2 and frequency_score >= 1 and frequency_score <=2:
        segment = "Hibernating"
    elif recency_score > 4 and recency_score <=5 and frequency_score >= 1 and frequency_score <=2:
        segment = "New Customers"
        
    row['segment'] = segment
    
    return row

In [287]:
df_segments = df_recency_frequency.apply(get_segments, axis=1)

In [288]:
df_segments.groupby('segment').count()

Unnamed: 0_level_0,frequency_value,frequency_score,recency_value,recency_score
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
At Risk,4578,4578,4578,4578
Can't Lose Them,2593,2593,2593,2593
Champions,18892,18892,18892,18892
Hibernating,16760,16760,16760,16760
Loyal Customers,4458,4458,4458,4458
New Customers,20775,20775,20775,20775
Potential Loyalists,22843,22843,22843,22843
Promising,18358,18358,18358,18358


In [392]:
MIN_VALUE = 4.5

df_users_predicted_will_watch = df_recency_frequency[
    (df_recency_frequency['recency_score'] >= 3.89) & 
    (df_recency_frequency['frequency_score'] >= 4)
]

In [393]:
df_users_predicted_will_watch.index

users_predicted_will_not_watch = list(set(df_recency_frequency.index).difference(set(df_users_predicted_will_watch.index)))


In [400]:
# def get_predicted_and_test_dfs(users_predicted_will_watch, user_predicted_will_not_watch,df_test):
    
#     minimum_epoch = df_test['epoch'].min()
#     maximum_epoch = minimum_epoch + (1000 * 60 * 60 * 24 * 2)
#     df = df_test.sort_values('epoch')
#     df = df[df['epoch'] < maximum_epoch]
    
#     users_actual = df['UserId'].unique()
    
#     user_predicted_and_test_dict = dict()
    
#     for user_id in users_predicted_will_watch:
#         dict_for_user = dict(pred = 1, test=0)
        
#         if user_id in users_actual:
#             dict_for_user['test'] = 1
            
#         user_predicted_and_test_dict[user_id] = dict_for_user
    
#     users_who_watched_but_we_didnt_predict = list(set(users_actual).difference(set(users_predicted_will_watch)))
    
# #     print(users_who_watched_but_we_didnt_predict)
    
#     for user_id in users_who_watched_but_we_didnt_predict:
#         user_predicted_and_test_dict[user_id] = dict(pred = 0, test=1)
        
# #     user_predicted_will_not_watch = list(set(user_predicted_will_not_watch).difference(set(users_actual)))
        
# #     for user_id in user_predicted_will_not_watch:
# #         user_predicted_and_test_dict[user_id] = dict(pred = 0, test=0)
    
#     return user_predicted_and_test_dict

In [403]:
def get_predicted_and_test_dfs(users_predicted_will_watch, user_predicted_will_not_watch,df_test):
    
    minimum_epoch = df_test['epoch'].min()
    maximum_epoch = minimum_epoch + (1000 * 60 * 60 * 24 * 2)
    df = df_test.sort_values('epoch')
    df = df[df['epoch'] < maximum_epoch]
    
    df_all = pd.read_csv('./Data/VideoStartedEpoch.csv')
    all_users = df_all['UserId'].unique()
    
    users_actual = df['UserId'].unique()
    
    true_positives = set(users_actual).intersection(set(users_predicted_will_watch))
    false_positives = set(users_predicted_will_watch).difference(true_positives)
    false_negatives = set(users_actual).difference(set(users_predicted_will_watch))
    true_negative = (set(all_users).difference(set(users_actual))).intersection(set(user_predicted_will_not_watch))
    
    true_positives = len(list(true_positives))
    false_positives = len(list(false_positives))
    false_negatives = len(list(false_negatives))
    true_negative = len(list(true_negative))
    
    precision = (true_positives)/ (true_positives + false_positives)
    recall = (true_positives) / (true_positives + false_negatives)
    f1 = 2 * precision * recall / (precision + recall)
    
    return precision, recall, f1

In [405]:
get_predicted_and_test_dfs(df_users_predicted_will_watch.index, users_predicted_will_not_watch,df_test)

(0.41435255823415723, 0.17732456904057112, 0.24836143035697952)

In [396]:
df_result = pd.DataFrame(user_predicted_and_test_dict).T
df_result.head()

Unnamed: 0,pred,test
00016f6ad820aa24940343f837799519,1,1
000b95c95aa093c5740ec34db21bf720,1,0
000d3745dc9d10720d23c0b3fd952990,1,1
000ea68b356465cba3fd2bce92adb19b,1,1
00120b2a358302b2605d48eb895d9f0e,1,0


In [397]:
df_result[
    (df_result['test'] == 1) &
    (df_result['pred'] == 1)
].shape

(8147, 2)

In [398]:
print(classification_report(df_result['test'], df_result['pred']))

              precision    recall  f1-score   support

           0       0.74      0.90      0.81    119149
           1       0.41      0.18      0.25     45944

   micro avg       0.70      0.70      0.70    165093
   macro avg       0.58      0.54      0.53    165093
weighted avg       0.65      0.70      0.66    165093



In [399]:
print(confusion_matrix(df_result['test'], df_result['pred']))

[[107634  11515]
 [ 37797   8147]]
