## Imports

In [7]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
from validate_sample_submission import validate_submission

## Load data

In [9]:
df_ratings = pd.read_csv('BookRatings.csv')
df_users = pd.read_csv('BooksUsers.csv')
df_books = pd.read_csv('BooksMetaInfo.csv')
df_test = pd.read_csv('test_users.csv')

In [10]:
test_users = df_test.values.ravel().tolist()

In [11]:
# Check amount of users in training data
user_id_ratings = df_ratings['User-ID'].unique().tolist()
print(len(user_id_ratings))

# Check users in test data
test_user_id_ratings = df_test['User-ID'].unique().tolist()
print(len(test_user_id_ratings))

# Check the test users NOT in training
unique_test_users = [a for a in test_user_id_ratings if a in user_id_ratings]
print(len(unique_test_users))

5719
589
489


## User data

In [5]:
def make_sparse_ratings(df_ratings):
   
    # create ratings matrix
    R = sp.sparse.csr_matrix(df_ratings.set_index(['User-ID', 'ISBN']).unstack().replace(np.nan, 0))
    return R

In [6]:
R = make_sparse_ratings(df_ratings)

<5719x47768 sparse matrix of type '<class 'numpy.float64'>'
	with 109209 stored elements in Compressed Sparse Row format>

In [17]:
training_users = df_ratings['User-ID'].unique().tolist()
df_users = df_users[df_users['User-ID'].isin(training_users)]

In [21]:
def process_user_data(df_users):
    
    # index
    df_users = df_users.set_index('User-ID')
    
    # vectorize location data
    vectorizer = TfidfVectorizer()
    user_profiles = vectorizer.fit_transform(df_users.Location)
    
    # join age information
    #user_profiles = user_profilessp.sparse.csr_matrix(df_users.Age)
    
    return user_profiles

In [22]:
user_profiles = process_user_data(df_users)

In [23]:
user_profiles

<5719x2791 sparse matrix of type '<class 'numpy.float64'>'
	with 19124 stored elements in Compressed Sparse Row format>

In [26]:
cosine_similarity(user_profiles).shape

(5719, 5719)

In [75]:
sp.sparse.csr_matrix(df_users.Age).reshape(-1,1)

<61568x1 sparse matrix of type '<class 'numpy.float64'>'
	with 61484 stored elements in COOrdinate format>

In [76]:
sp.sparse.hstack(user_profiles, sp.sparse.csr_matrix(df_users.Age).reshape(-1,1))

ValueError: blocks must be 2-D

In [62]:
df_users.Location

User-ID
2               stockton, california, usa
8                timmins, ontario, canada
9              germantown, tennessee, usa
10             albacete, wisconsin, spain
12            fort bragg, california, usa
                       ...               
278844           st. paul, minnesota, usa
278849        georgetown, ontario, canada
278851                 dallas, texas, usa
278852    brisbane, queensland, australia
278854              portland, oregon, usa
Name: Location, Length: 61568, dtype: object

['03770',
 '04005',
 '05680',
 '10012',
 '1110',
 '12',
 '15526',
 '16605',
 '2301',
 '23564',
 '24',
 '24229',
 '25474',
 '27243',
 '28355',
 '29170',
 '30',
 '302',
 '304',
 '305',
 '308',
 '321',
 '322',
 '330',
 '33157',
 '35037',
 '36',
 '378',
 '380',
 '40882',
 '41017',
 '4120',
 '41747',
 '4248',
 '45144',
 '4729',
 '47559',
 '47589',
 '4dn',
 '51381',
 '52062',
 '52066',
 '52080',
 '53',
 '53129',
 '60385',
 '61',
 '6130',
 '614',
 '6158',
 '626',
 '64',
 '65439',
 '69361',
 '71000',
 '717',
 '72189',
 '75',
 '76149',
 '7793',
 '78126',
 '8125',
 '8172',
 '8572',
 '85742',
 '8802',
 '89075',
 '915',
 '92346',
 'aaaaaaa',
 'aachen',
 'aaf',
 'aalborg',
 'aan',
 'aarau',
 'aargau',
 'aarhus',
 'ab',
 'abbot',
 'abbotsford',
 'abbottstown',
 'abc',
 'abelokipi',
 'aberdare',
 'aberdeen',
 'aberdeenshire',
 'aberdour',
 'aberfoyle',
 'abertillery',
 'aberystwyth',
 'abilene',
 'abiline',
 'abingdon',
 'abington',
 'abiquiu',
 'abita',
 'aboyne',
 'abq',
 'abruzzo',
 'abu',
 'acapu

In [53]:
df_users = process_user_data(df_users)
df_users

Unnamed: 0_level_0,Age,city,state,country
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,18.0,stockton,california,usa
8,,timmins,ontario,canada
9,,germantown,tennessee,usa
10,26.0,albacete,wisconsin,spain
12,,fort bragg,california,usa
...,...,...,...,...
278844,28.0,st. paul,minnesota,usa
278849,23.0,georgetown,ontario,canada
278851,33.0,dallas,texas,usa
278852,32.0,brisbane,queensland,australia


In [35]:
splits = df_users.Location.str.split(',')
df_users['city'] = [row[0] for row in splits]

In [17]:
df_users = df_users.set_index('User-ID')

KeyError: "None of ['User-ID'] are in the columns"

In [33]:
df_users.Location.str.split(',')

User-ID
2               [stockton,  california,  usa]
8                [timmins,  ontario,  canada]
9              [germantown,  tennessee,  usa]
10             [albacete,  wisconsin,  spain]
12            [fort bragg,  california,  usa]
                         ...                 
278844           [st. paul,  minnesota,  usa]
278849        [georgetown,  ontario,  canada]
278851                 [dallas,  texas,  usa]
278852    [brisbane,  queensland,  australia]
278854              [portland,  oregon,  usa]
Name: Location, Length: 61568, dtype: object

In [15]:
df_users[['city', 'state', 'country']] = df_users.Location.str.split(',').values

ValueError: Must have equal len keys and value when setting with an iterable