In [4]:
from google.cloud import storage
import os
from io import BytesIO
import pandas as pd


In [5]:
# Create a Cloud Storage client and fetch the bucket that holds the input files.
# storage.Client() is called with no arguments, so nothing about credentials or
# project is configured in this cell.
bucket_name = "yh160" # name of the bucket the CSV files are read from
storage_client = storage.Client()

# Look up the bucket by name; `bucket` is what the blob cells below refer to.
bucket = storage_client.get_bucket(bucket_name)

In [6]:
# Build a handle for the ratings CSV inside the bucket; the trailing bare
# expression displays the blob's repr as the cell output.
blob = storage.blob.Blob("ratings_small.csv",bucket)
blob

<Blob: yh160, ratings_small.csv, None>

In [7]:
# Download the blob into memory and parse the CSV bytes into a pandas DataFrame.
# download_as_bytes() is used because download_as_string() is only a deprecated
# alias for it in google-cloud-storage; both return the raw bytes of the object.
content = blob.download_as_bytes()
train = pd.read_csv(BytesIO(content))

In [8]:
# Display the ratings frame (pandas elides the middle rows in the rendered output).
train

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [9]:
# import everything we need for the recommender system
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [10]:
# Obtain the Spark session used by every Spark DataFrame in this notebook;
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName('Recommendation_system')
    .getOrCreate()
)

In [11]:
# Convert the pandas ratings frame into a Spark DataFrame so it can be fed to
# Spark ML, then print a preview (show() prints the top 20 rows).
df = spark.createDataFrame(train)
df.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [12]:
# Split the ratings 80/20 into training and test sets.
# A fixed seed makes the split, and therefore the reported RMSE, repeatable
# across kernel restarts; without it every run evaluates a different split.
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

In [13]:
# Configure the ALS collaborative-filtering estimator on the ratings columns.
# coldStartStrategy="drop" removes rows whose prediction is NaN (users or
# movies unseen during training) so they do not turn the RMSE into NaN.
# The explicit seed pins the random factor initialisation so repeated fits on
# the same training data give the same model.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [14]:
# Fit the ALS estimator on the training split; `model` is the fitted model
# used for predictions and recommendations below.
model = als.fit(training)

In [15]:
# Score the held-out test split with the fitted model; the evaluator below
# reads the result's "prediction" column.
predictions = model.transform(test)

In [16]:
# RMSE evaluator comparing the true "rating" column with the model's "prediction".
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

In [17]:
# Compute the root-mean-square error on the test predictions and report it.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 1.0696373811755613


In [18]:
# Top-10 recommendations from the fitted model, in both directions.
userRecs = model.recommendForAllUsers(10)   # 10 movies for every user
movieRecs = model.recommendForAllItems(10)  # 10 users for every movie

# The same two queries restricted to three distinct users and three distinct
# movies taken from df; the id column names come from the estimator itself.
users = df.select(als.getUserCol()).distinct().limit(3)
movies = df.select(als.getItemCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

SyntaxError: invalid syntax (<ipython-input-180-4841044b6cfd>, line 3)

In [115]:
# Check what kind of object the recommendation call returned (a Spark DataFrame).
type(userRecs)

pyspark.sql.dataframe.DataFrame

In [91]:
# Create a Cloud Storage client and fetch the bucket that holds the input files.
# NOTE(review): this repeats the client/bucket setup from the first bucket cell
# with the same bucket name; the existing `bucket` could be reused instead.
bucket_name = "yh160" # name of the bucket the CSV files are read from
storage_client = storage.Client()

# Look up the bucket by name; `bucket` is what the blob cell below refers to.
bucket = storage_client.get_bucket(bucket_name)

In [92]:
# Build a handle for the movie-metadata CSV inside the bucket (rebinding `blob`);
# the trailing bare expression displays the blob's repr as the cell output.
blob = storage.blob.Blob("movies_metadata_small.csv",bucket)
blob

<Blob: yh160, movies_metadata_small.csv, None>

In [93]:
# Download the metadata blob into memory and parse the CSV bytes with pandas.
# download_as_bytes() is used because download_as_string() is only a deprecated
# alias for it in google-cloud-storage; both return the raw bytes of the object.
# NOTE(review): this rebinds `train`, which earlier held the ratings frame, so
# re-running the Spark createDataFrame cell after this one would pick up the
# metadata instead. The name is kept so the cells that follow stay unchanged.
content = blob.download_as_bytes()
train = pd.read_csv(BytesIO(content))

In [94]:
# Build `pop` from the loaded metadata frame, restricted to the 'id' and
# 'popularity' columns.
pop = pd.DataFrame(train, columns = ['id','popularity'])

In [95]:
# Drop rows with a missing id or popularity. dropna() returns a new frame, so
# the result is bound back to `pop`; a bare `pop.dropna()` only displays a
# cleaned copy and leaves the missing rows in `pop` for the cells that follow.
pop = pop.dropna()
pop

Unnamed: 0,id,popularity
0,8844,17.015539
1,949,17.924927
2,710,14.686036
3,1408,7.284477
4,524,10.137389
...,...,...
2787,3104,2.302582
2788,64197,0.528657
2789,98604,0.803588
2790,5589,0.375001


In [175]:
# The ten rows of `pop` with the highest popularity, most popular first.
pop_10 = (
    pop
    .sort_values('popularity', ascending=False)
    .head(10)
)

In [182]:
# Full popularity ranking (descending), written to popularity.csv without the
# pandas index column.
pop_exp = pop.sort_values('popularity', ascending=False)
pop_exp.to_csv('popularity.csv', index=False)

In [104]:
# Ids of the ten most popular rows as a plain Python list, in ranking order.
pop_list = pop_10['id'].tolist()

In [116]:
# Display the ids of the ten most popular rows.
pop_list

[680, 155, 78, 550, 278, 13, 22, 11, 424, 238]

In [137]:
# Pull every row of the per-user recommendations from Spark into a local
# Python list so the rows can be indexed directly below.
user=userRecs.collect()

In [186]:
# Wrap the collected recommendation rows in a pandas DataFrame and write them
# to als.csv without the pandas index column.
userrec=pd.DataFrame(user)
userrec.to_csv('als.csv',index = False)

In [152]:
# Peek at the first recommendation of the first collected row: field 1 of a row
# is the recommendation list, whose entries are Row(movieId, rating).
user[0][1][0]

Row(movieId=7099, rating=8.616948127746582)

In [154]:
# For every collected row, extract the recommended movie ids into a plain list,
# keeping the order Spark returned them in. Field 1 of each row is the list of
# Row(movieId, rating) recommendations.
# A comprehension over a slice replaces ten hand-indexed lookups: it yields the
# same ten ids per user, and a user with fewer than ten recommendations gives a
# shorter list instead of raising IndexError.
data = [
    [rec['movieId'] for rec in row[1][:10]]
    for row in user
]
    

In [161]:
# Count the users whose recommended ids are exactly `pop_list`. List equality
# requires the same ids in the same order, so this is a strict match.
# NOTE(review): confirm that `movieId` in the ratings file and `id` in the
# metadata file share one id space; if they do not, this can never match.
k = sum(1 for recs in data if recs == pop_list)
print(k)
    

0
