In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder\
    .master("local[4]")\
    .appName("Tavsiye Sistemi")\
    .getOrCreate()

In [0]:
sc = spark.sparkContext

In [0]:
df_movie = spark.read.format('delta') \
    .options(header = 'True', inferschema = 'True')\
    .load("/user/hive/warehouse/movie", header = True)

In [0]:
df_movie.printSchema()

root
 |-- movieId: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [0]:
df_movie.show(10)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+
only showing top 10 rows



In [0]:
# File location and type
file_location = "/FileStore/tables/rating.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

df_rating = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

In [0]:
df_rating.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [0]:
df_rating.show(10)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
|     1|    112|   3.5|2004-09-10 03:09:00|
|     1|    151|     4|2004-09-10 03:08:54|
|     1|    223|     4|2005-04-02 23:46:13|
|     1|    253|     4|2005-04-02 23:35:40|
|     1|    260|     4|2005-04-02 23:33:46|
+------+-------+------+-------------------+
only showing top 10 rows



In [0]:
import pandas as pd

pandas_movie = df_movie.toPandas()
pandas_rating = df_rating.toPandas()
data = pd.concat([pandas_movie, pandas_rating], axis = 1)

In [0]:
data.head(10)

Unnamed: 0,movieId,title,genres,userId,movieId.1,rating,timestamp
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,2,3.5,2005-04-02 23:53:47
1,2.0,Jumanji (1995),Adventure|Children|Fantasy,1,29,3.5,2005-04-02 23:31:16
2,3.0,Grumpier Old Men (1995),Comedy|Romance,1,32,3.5,2005-04-02 23:33:39
3,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance,1,47,3.5,2005-04-02 23:32:07
4,5.0,Father of the Bride Part II (1995),Comedy,1,50,3.5,2005-04-02 23:29:40
5,6.0,Heat (1995),Action|Crime|Thriller,1,112,3.5,2004-09-10 03:09:00
6,7.0,Sabrina (1995),Comedy|Romance,1,151,4.0,2004-09-10 03:08:54
7,8.0,Tom and Huck (1995),Adventure|Children,1,223,4.0,2005-04-02 23:46:13
8,9.0,Sudden Death (1995),Action,1,253,4.0,2005-04-02 23:35:40
9,10.0,GoldenEye (1995),Action|Adventure|Thriller,1,260,4.0,2005-04-02 23:33:46


In [0]:
data = data.iloc[:40000000,:]

In [0]:

data = data.drop(['movieId', 'genres', 'timestamp'], axis = 1)
data.head(10)

Unnamed: 0,title,userId,rating
0,Toy Story (1995),1,3.5
1,Jumanji (1995),1,3.5
2,Grumpier Old Men (1995),1,3.5
3,Waiting to Exhale (1995),1,3.5
4,Father of the Bride Part II (1995),1,3.5
5,Heat (1995),1,3.5
6,Sabrina (1995),1,4.0
7,Tom and Huck (1995),1,4.0
8,Sudden Death (1995),1,4.0
9,GoldenEye (1995),1,4.0


In [0]:
pivot_table = data.pivot_table(index = ["userId"], columns = ["title"], values = 'rating')
pivot_table.head(10)

title,"""11'09\""\""01 - September 11 (2002)""","""Diebuster \""\""Top wo Narae 2\""\"" (2004)""","""\""\""Great Performances\""\"" Cats (1998)""",#chicagoGirl: The Social Network Takes on a Dictator (2013),$ (Dollars) (1971),$5 a Day (2008),$9.99 (2008),$ellebrity (Sellebrity) (2012),'71 (2014),'Hellboy': The Seeds of Creation (2004),...,¡Qué hacer! (1970),¡Three Amigos! (1986),À l'aventure (2008),À nos amours (1983),À nous la liberté (Freedom for Us) (1931),À propos de Nice (1930),Árido Movie (2005),Åsa-Nisse - Wälkom to Knohult (2011),Üvegtigris (2001),貞子3D (2012)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
101,,,,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,,...,,,,,,,,,,
103,,,,,,,,,,,...,,,,,,,,,,
104,,,3.0,,,,,,,,...,,,,5.0,,,,,,
105,,,,,,,,,,,...,,,,,,,,,,
106,,,,,,,,,,,...,,,,,,,,,,
107,,,,,,,,,,,...,,,,,,,,,,


In [0]:
movie_watched = pivot_table["Bad Boys (1995)"]
similarity_with_other_movies = pivot_table.corrwith(movie_watched)
similarity_with_other_movies = similarity_with_other_movies.sort_values(ascending = False)
similarity_with_other_movies.head(10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Out[39]: title
"11'09\"\"01 - September 11 (2002)"                           NaN
"Diebuster \"\"Top wo Narae 2\"\" (2004)"                     NaN
"\"\"Great Performances\"\" Cats (1998)"                      NaN
#chicagoGirl: The Social Network Takes on a Dictator (2013)   NaN
$ (Dollars) (1971)                                            NaN
$5 a Day (2008)                                               NaN
$9.99 (2008)                                                  NaN
$ellebrity (Sellebrity) (2012)                                NaN
'71 (2014)                                                    NaN
'Hellboy': The Seeds of Creation (2004)                       NaN
dtype: float64