# Importing future models 
- To avoid incompitablity problems we import the four future modules below
- importing all the other required packages

In [2]:
# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function
# from __future__ import unicode_literals

import math as mt
import time  
import numpy as np
import pandas as pd

%matplotlib inline
import seaborn as sns
import matplotlib, copy
from matplotlib import pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sparsesvd import sparsesvd        #used for matrix factorization
from scipy.sparse import csc_matrix    #used for sparse matrix

## To find out the time requried for running a model  created a function 

In [3]:
def run_time(method):
    
    '''defined a function run_time
    for finding the time taken to 
    execute the function'''
    
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed


In [4]:
@run_time
def load_dataset(filepath):
    
    '''defined a function load_dataset
    for loading the csv file & converting utc 
    to time stamp'''
    
    dataset = pd.read_csv(filepath, encoding = 'UTF-8' )
    dataset['utc'] = pd.to_datetime(dataset['utc'], unit='s')
    return dataset

df = load_dataset('./data/reddit_data.csv')


'load_dataset'  16965.20 ms


## we look into the data that we have & what are the features we have

In [5]:
df.head()

Unnamed: 0,username,subreddit,utc
0,kabanossi,photoshopbattles,2016-12-26 10:24:59
1,kabanossi,GetMotivated,2016-12-26 10:23:14
2,kabanossi,vmware,2016-12-26 10:21:16
3,kabanossi,carporn,2016-12-26 10:20:18
4,kabanossi,DIY,2016-12-26 10:17:59


In [6]:
unique_user_entries = df.username.nunique()
unique_subreddits = df.subreddit.nunique()

print('Number of unique Users: '+ str(unique_user_entries))
print('Number of unique subreddit: '+str(unique_subreddits))


Number of unique Users: 22610
Number of unique subreddit: 34967


## top user frequency & subreddit frquency
- creating the list top 10 most visited subreddits for baseline recommendation


In [7]:
df.describe(include=['O']).T

Unnamed: 0,count,unique,top,freq
username,14000000,22610,Benphyre,1000
subreddit,14000000,34967,AskReddit,1030290


In [8]:
subreddit_grp = df.groupby(['subreddit'])['username'].nunique()
data = pd.DataFrame([(subreddit,count) for subreddit,count in subreddit_grp.iteritems()],
                    columns=["subreddit","no_of_users"])


In [9]:
top_subreddits = data.sort_values(by='no_of_users',ascending=False)
top_subreddits.reset_index(drop=True, inplace=True)

top_subreddits.drop('no_of_users', axis=1, inplace=True)

In [10]:
top_subreddits.head()

Unnamed: 0,subreddit
0,AskReddit
1,pics
2,funny
3,todayilearned
4,worldnews


## Built a baseline model
- recommending top subreddits irrespective of the users interests

In [11]:
def top_recommendations(user_name, recommendations):    
    recommendations['user_name'] = user_name
    return recommendations.head(10)

In [12]:
top_recommendations('kabanossi', top_subreddits)

Unnamed: 0,subreddit,user_name
0,AskReddit,kabanossi
1,pics,kabanossi
2,funny,kabanossi
3,todayilearned,kabanossi
4,worldnews,kabanossi
5,gaming,kabanossi
6,videos,kabanossi
7,news,kabanossi
8,gifs,kabanossi
9,movies,kabanossi


## building a model with user specfic recommendation
- converting the data into sparse matrix using tfidfvectorizer
- here the data is scaled and values are normalized


In [13]:
df.subreddit = ' ' + df.subreddit
total_subreddits = df.groupby('username')['subreddit'].sum()

tf_transformer = TfidfVectorizer(use_idf=False).fit(total_subreddits)
tf_vector = tf_transformer.transform(total_subreddits)

In [14]:
df_1 = pd.DataFrame(tf_vector.toarray(), index = total_subreddits.index, columns=tf_transformer.get_feature_names())

In [15]:
df_1.head()

Unnamed: 0_level_0,007,065_082_071,0ad,0x10c,0x3642,1000degreeknife,1000thworldproblems,100daycomicchallenge,100daysofketo,100kin,...,zumba,zune,zurich,zwave,zweiteliga,zwift,zxspectrum,zyramains,zyzz,zzt
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--ANUSTART-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--Sko--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--UNKN0WN--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--harley--quinn--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-A-p-r-i-l-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
matrix = csc_matrix(df_1)

In [17]:
mat_df = pd.DataFrame(matrix.toarray())

In [18]:
mat_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34958,34959,34960,34961,34962,34963,34964,34965,34966,34967
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
def SVD(mat_df, K):
    U, s, Vt = sparsesvd(mat_df, K)

    dimension = (len(s), len(s))
    S = np.zeros(dimension, dtype=np.float32)
    for val in range(0, len(s)):
        S[val,val] = mt.sqrt(s[val])

    U = csc_matrix(np.transpose(U), dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)
    
    return U, S, Vt

In [20]:
k=150
U, S, Vt = SVD(matrix, k) 

In [21]:
#Compute estimated rating for the test user
def EstimatedRecommendation(U, S, Vt, user_test):
    Term = S*Vt 

    estimatedRecommends = np.zeros(shape=(22610, 34967), dtype=np.float16)
    for user in user_Test:
        prod = U[user, :]*Term
        
        estimatedRecommends[user, :] = prod.todense()
        recommmend = (-estimatedRecommends[user, :]).argsort()[:150]
    return recommend

In [22]:
# user_Test = [np.where(users == 'kabanossi')[0][0]]
user_Test = df_1.iloc[100]
print ('username of the user is:', user_Test)

('username of the user is:', 007                     0.0
065_082_071             0.0
0ad                     0.0
0x10c                   0.0
0x3642                  0.0
1000degreeknife         0.0
1000thworldproblems     0.0
100daycomicchallenge    0.0
100daysofketo           0.0
100kin                  0.0
100pushups              0.0
100sexiest              0.0
100yearsago             0.0
101wicca                0.0
10202016                0.0
1022                    0.0
1098thworldproblems     0.0
10cloverfieldlane       0.0
10mm                    0.0
10sound                 0.0
112263hulu              0.0
1123581321345589        0.0
11bx1371                0.0
11foot8                 0.0
1200isjerky             0.0
1200isplenty            0.0
1200isplentyketo        0.0
123moviesto             0.0
125r                    0.0
12am                    0.0
                       ... 
zoo                     0.0
zooeydeschanel          0.0
zookeeperbattle         0.0
zoology            

In [23]:
recommended = EstimatedRecommendation(U, S, Vt, user_Test)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices