In [1]:
# import
from __future__ import print_function

import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.logging.set_verbosity(tf.logging.ERROR)

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Notebook-wide pandas display settings: cap the number of rows shown per
# frame and render floats with three decimals.
pd.set_option('display.max_rows', 15)
pd.set_option('display.float_format', '{:.3f}'.format)

In [3]:
def mask(df, key, function):
    """Filter `df` down to the rows selected by `function(df[key])`.

    `function` is called with the column `df[key]`; whatever it returns is
    used to index `df` (e.g. a boolean Series marking the rows to keep).
    """
    selector = function(df[key])
    return df[selector]

In [4]:
def flatten_cols(df):
    """Collapse multi-level column labels of `df` into single strings, in place.

    Each tuple label is joined with spaces and stripped, so ('rating', 'mean')
    becomes 'rating mean' and ('user_id', '') becomes 'user_id'.

    Labels that are not tuples (i.e. columns that are already flat) are left
    unchanged. Joining a plain string would iterate over its characters
    ('ab' -> 'a b'), which made a second call on the same frame mangle the
    column names.

    Returns:
        The same `df` object, to allow chaining.
    """
    df.columns = [
        ' '.join(col).strip() if isinstance(col, tuple) else col
        for col in df.columns.values
    ]
    return df

In [5]:
# Attach the two helpers to DataFrame so they can be called as
# df.mask(...) and df.flatten_cols().
# Note: this replaces pandas' built-in DataFrame.mask with the helper above.
setattr(pd.DataFrame, 'mask', mask)
setattr(pd.DataFrame, 'flatten_cols', flatten_cols)

In [6]:
# Import Altair and activate its Colab renderer.
# NOTE: despite the printed messages, nothing is installed in this cell;
# Altair must already be available in the environment.
print('Installing Altair...')
import altair as alt
# max_rows=None is passed to the 'default' data transformer.
# NOTE(review): presumably this lifts Altair's row limit — confirm against the Altair docs.
alt.data_transformers.enable('default', max_rows=None)
alt.renderers.enable('colab')
print('Done installing Altair.')

Installing Altair...
Done installing Altair.


In [8]:
# Flag for the optional user-ratings feature; disabled in this notebook.
# NOTE(review): presumably this gates collecting ratings through a Google
# Sheet — confirm against the cells that read USER_RATINGS.
USER_RATINGS = False
# Spreadsheet client install and authentication imports, left disabled:
# !pip install --upgrade -q gspread
# !pip install google.colab
# from google.colab import auth
# import gspread
# from oauth2client.client import GoogleCredentials

In [9]:
# Download the MovieLens 100k archive and unpack it into the working directory.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
# Open the archive in a context manager so the file handle is closed once the
# extraction and the summary read are done (it was previously left open).
with zipfile.ZipFile('movielens.zip', 'r') as zip_ref:
    zip_ref.extractall()
    print("Done, Dataset contains")
    # u.info is read straight from the archive; the result is printed as bytes.
    print(zip_ref.read('ml-100k/u.info'))

Downloading movielens data...
Done, Dataset contains
b'943 users\n1682 items\n100000 ratings\n'


In [10]:
# Load the users table: a pipe-separated file whose column names are supplied
# explicitly via `names`.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=users_cols,
    encoding='latin-1',
)

In [11]:
# Preview the first rows of the users table to check the parsed columns.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [12]:
# Load the ratings table: a tab-separated file whose column names are supplied
# explicitly via `names`.
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    delimiter='\t',
    names=ratings_cols,
    encoding='latin-1',
)

In [13]:
# Preview the first rows of the ratings table to check the parsed columns.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [14]:
# Every movie row ends with one binary indicator column per genre, listed
# here in file order.
genre_cols = [
    "genre_unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
# Full column layout of the movie file: metadata columns first, then the
# genre indicators.
movie_cols = [
    'movie_id',
    'title',
    'release_date',
    "video_release_date",
    "imdb_url",
    *genre_cols,
]

In [15]:
# Load the pipe-separated movie file using the column layout in movie_cols,
# then preview the first rows.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=movie_cols,
    encoding='latin-1',
)
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
# The raw ids start at 1; shift them to start at 0 and store them as strings.
users['user_id'] = (users['user_id'] - 1).astype(str)

In [17]:
# Shift movie ids to start at 0 and store them as strings.
movies["movie_id"] = (movies["movie_id"] - 1).astype(str)
# The year is whatever follows the last '-' in the release date string.
movies['year'] = [str(released).split('-')[-1] for released in movies['release_date']]

In [18]:
# Inspect the column dtypes and non-null counts of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   user_id         100000 non-null  int64
 1   movie_id        100000 non-null  int64
 2   rating          100000 non-null  int64
 3   unix_timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [19]:
# Shift both id columns to start at 0 and store them as strings; ratings
# become floats.
ratings['movie_id'] = (ratings['movie_id'] - 1).astype(str)
ratings['user_id'] = (ratings['user_id'] - 1).astype(str)
ratings['rating'] = ratings['rating'].astype(float)

In [20]:
# For each genre, count the movies flagged with it by summing its indicator
# column, and keep the result as a plain dict.
genre_occurences = movies.loc[:, genre_cols].sum(axis=0).to_dict()

In [21]:
# Show the per-genre column sums of the genre indicator columns.
movies[genre_cols].sum()

genre_unknown      2
Action           251
Adventure        135
Animation         42
Children         122
                ... 
Romance          247
Sci-Fi           101
Thriller         251
War               71
Western           27
Length: 19, dtype: int64

In [22]:
# Check the type returned by the column-wise sum.
type(movies[genre_cols].sum())

pandas.core.series.Series

In [23]:
# Show the genre -> movie count mapping.
genre_occurences

{'genre_unknown': 2,
 'Action': 251,
 'Adventure': 135,
 'Animation': 42,
 'Children': 122,
 'Comedy': 505,
 'Crime': 109,
 'Documentary': 50,
 'Drama': 725,
 'Fantasy': 22,
 'Film-Noir': 24,
 'Horror': 92,
 'Musical': 56,
 'Mystery': 61,
 'Romance': 247,
 'Sci-Fi': 101,
 'Thriller': 251,
 'War': 71,
 'Western': 27}

In [24]:
# Since some movies can belong to more than one genre, we create different
# 'genre' columns as follows:
# - all_genres: all the active genres of the movie.
# - genre: randomly sampled from the active genres.

In [28]:
def mark_genres(movies, genres, rng=None):
    """Add 'genre' and 'all_genres' columns to `movies`, in place.

    For each row, a genre is "active" when its column holds the value 1.

    - 'all_genres' is the active genre names joined with '-'.
    - 'genre' is one active genre picked at random.
    - Rows with no active genre get 'Other' in both columns.

    Args:
        movies: DataFrame containing one column per name in `genres`.
        genres: column names of the genre indicators; their order fixes the
            order used in 'all_genres'.
        rng: optional object exposing `choice(sequence)`, e.g. a
            `np.random.RandomState` or `np.random.Generator`, used for the
            random pick so the result can be made reproducible. Defaults to
            NumPy's global random state, which is what was used before this
            parameter existed.

    Returns:
        None. `movies` is modified in place.
    """
    chooser = np.random if rng is None else rng

    random_genre = []
    all_genres = []
    # Walk the rows once and derive both columns from the same list of active
    # genres instead of recomputing it in two separate passes.
    for flags in zip(*[movies[genre] for genre in genres]):
        active = [genre for genre, flag in zip(genres, flags) if flag == 1]
        if active:
            random_genre.append(chooser.choice(active))
            all_genres.append('-'.join(active))
        else:
            random_genre.append('Other')
            all_genres.append('Other')

    movies['genre'] = random_genre
    movies['all_genres'] = all_genres
    
# Add the 'genre' and 'all_genres' columns to movies (modified in place).
mark_genres(movies, genre_cols)

In [29]:
# Build a single table: ratings joined with movie metadata on movie_id, then
# with user attributes on user_id.
movielens = pd.merge(
    pd.merge(ratings, movies, on='movie_id'),
    users,
    on='user_id',
)

In [30]:
# Utility to split the data into training and test sets.
def split_dataframe(df, holdout_fraction=0.1, random_state=None):
    """Randomly split `df` into a training part and a held-out test part.

    Args:
        df: the DataFrame to split.
        holdout_fraction: fraction of the rows to place in the test set.
        random_state: optional seed or random state forwarded to pandas'
            `sample`, so the split can be reproduced. The default (None)
            keeps the previous unseeded behavior.

    Returns:
        A `(train, test)` tuple. `train` keeps the original row order and
        holds every row that was not sampled into `test`.
    """
    row_count = len(df)
    # Sample row *positions* rather than index labels: with duplicate index
    # labels, excluding by label would also drop unsampled rows that share a
    # label with a sampled one.
    held_out = (
        pd.Series(np.arange(row_count))
        .sample(frac=holdout_fraction, replace=False, random_state=random_state)
        .to_numpy()
    )
    in_test = np.zeros(row_count, dtype=bool)
    in_test[held_out] = True
    test = df.iloc[held_out]
    train = df[~in_test]
    return train, test

In [32]:
# Preview the users table again after the user_id shift.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,0,24,M,technician,85711
1,1,53,F,other,94043
2,2,23,M,writer,32067
3,3,24,M,technician,43537
4,4,33,F,other,15213


In [None]:
# Scratch value.
a = 10

In [None]:
# Draw one random integer from 1 to 5 (the upper bound 6 is exclusive).
# `np.random.randragne` does not exist and raised AttributeError; numpy's
# counterpart of the standard library's `random.randrange` is `randint`.
np.random.randint(1, 6)

In [None]:
# Experiment: ' '.join applied to a plain string joins its characters, so
# each entry below comes out with its characters separated by spaces.
# (The leading `del dd` was removed: on a fresh kernel `dd` is not defined
# yet and the statement raised NameError; the assignment below rebinds the
# name anyway.)
dd = [' '.join(col).strip() for col in ['an dd','bh hh','cf     www','ds','e']]
dd

In [None]:
# Joining a plain string iterates over its characters, so this gives 'a n'.
' '.join('an')

In [None]:
# Joining a list of strings keeps each string whole, so this gives 'an bm'.
' '.join(['an', 'bm'])