In [1]:
!pip install pyspark[sql] tqdm fastparquet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark[sql]
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 43 kB/s 
Collecting fastparquet
  Downloading fastparquet-2022.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 48.0 MB/s 
Collecting cramjam>=2.3
  Downloading cramjam-2.6.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 53.3 MB/s 
[?25hCollecting pandas>=1.5.0
  Downloading pandas-1.5.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[K     |████████████████████████████████| 12.2 MB 39.6 MB/s 
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 62.6 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pysp

In [2]:
# Root folder of the extracted parquet data; every read and write below is built from it.
# NOTE(review): no `drive.mount(...)` call is visible in this notebook — presumably Google
# Drive is mounted through the Colab UI before running; confirm.
DRIVE_PATH = "/content/drive/MyDrive/data/extracted_parquet"

In [3]:
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F

from sklearn.metrics.pairwise import cosine_similarity

# Start (or reuse) a local Spark session named "mocommender" on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("mocommender")
    .getOrCreate()
)

In [None]:
# Load every parquet part found in the metadata folder.
# NOTE(review): `metadata` is not referenced by any other cell visible in this notebook.
metadata_glob = DRIVE_PATH + '/metadata/*.parquet'
metadata = spark.read.parquet(metadata_glob)

## MOVIE-GENRE MATRIX

In [4]:
# Input tables for the movie x genre matrix:
#   movie_genre_df -> read below for its `id` and `genres` columns
#   genre_df       -> read below for its `id` column
# NOTE(review): both paths are pinned to one specific part file; they will break if the
# datasets are ever re-written under new part names.
movie_genre_path = DRIVE_PATH + '/movie_genre/part-00000-caa98a46-cc9b-43ff-b8b7-f3761771e8d7-c000.snappy.parquet'
genre_path = DRIVE_PATH + '/genre/part-00000-efc1790a-d5e2-4f82-9698-7cba71443f15-c000.snappy.parquet'

movie_genre_df = spark.read.parquet(movie_genre_path)
genre_df = spark.read.parquet(genre_path)

In [5]:
"""
2     [1, 3, 5]
3     [1, 2]
4     [8]


  |  1     2     3     5     8   
__|______________________________. . .
2 |  1     0     1     1     0   
--|------------------------------. . .
3 |  1     1     0     0     0   
--|------------------------------. . .
4 |  0     0     0     0     1   
  .
  .
  .

"""

# Row labels of the target matrix: one entry per movie id.
df1 = movie_genre_df.select(F.col('id').alias('movie_id'))

# Column labels of the target matrix: one entry per genre id.
df2 = genre_df.select(F.col('id').alias('genre_id'))

# Every (movie_id, genre_id) combination, each starting with weight 0.
M_df1_df2 = (
    df1
    .crossJoin(df2)
    .withColumn("w", F.lit(0))
)

M_df1_df2.show(4)

+--------+--------+---+
|movie_id|genre_id|  w|
+--------+--------+---+
|       2|      12|  0|
|       2|      14|  0|
|       2|      16|  0|
|       2|      18|  0|
+--------+--------+---+
only showing top 4 rows



In [6]:
# Unnest each movie's `genres` collection into one (movie_id, genre_id) row per genre.
exploded_movie_genre_df = movie_genre_df.select(
    F.col('id').alias('movie_id'),
    F.explode(F.col('genres')).alias('genre_id'),
)

In [7]:
# True only for pairs present on both sides of the join, i.e. genres the movie really has.
genre_is_assigned = M_df1_df2.genre_id == exploded_movie_genre_df.genre_id

# Full outer join of the all-pairs grid with the actual (movie, genre) pairs, then
# overwrite the placeholder weight: 1 for an assigned genre, 0 otherwise.
M_movie_genre = (
    M_df1_df2
    .join(exploded_movie_genre_df, on=['movie_id', 'genre_id'], how="full")
    .orderBy('movie_id', 'genre_id')
    .withColumn('w', F.when(genre_is_assigned, 1).otherwise(0))
)

In [8]:
# Pivot the long (movie_id, genre_id, w) table into the wide movie x genre matrix:
# one row per movie, one column per genre id, missing cells filled with 0.
M = (
    M_movie_genre
    .groupBy("movie_id")
    .pivot("genre_id")
    .sum("w")
    .na.fill(0)
    .orderBy('movie_id')
)

In [9]:
# Movie ids in the row order of M (M is sorted by movie_id), pulled to the driver as a list.
movie_ids = [row['movie_id'] for row in M.select('movie_id').collect()]

# Genre columns only; the id column is dropped so that it is not used as a feature.
M_5000 = M.drop('movie_id')

In [10]:
# Collect the movie x genre matrix from Spark into a pandas DataFrame on the driver.
# Rows are assumed to come back in the same order as `movie_ids`, since both are derived
# from M, which is sorted by movie_id — TODO confirm.
dff = M_5000.toPandas()
# Disabled: optional on-disk copy of the matrix.
#dff.to_parquet(DRIVE_PATH + '/matrices/dff', engine="fastparquet")

In [12]:
# Pairwise cosine similarity between the genre vectors of all movies (square matrix).
cosine_matrix = cosine_similarity(dff)

# Label the rows with the movie ids; columns keep their default 0..N-1 labels for now.
cosine_matrix_df = pd.DataFrame(data=cosine_matrix, index=movie_ids)

In [None]:
# Preview the similarity matrix (pandas elides the middle rows and columns of a frame this size).
cosine_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28991,28992,28993,28994,28995,28996,28997,28998,28999,29000
2,1.000000,0.500000,0.500000,0.408248,0.000000,0.0,0.408248,0.707107,0.5,0.816497,...,0.00000,0.0,0.408248,0.353553,0.000000,0.000000,0.0,0.5,0.000000,0.000000
3,0.500000,1.000000,0.500000,0.000000,0.000000,0.0,0.816497,0.707107,0.5,0.408248,...,0.00000,0.0,0.000000,0.353553,0.000000,0.707107,0.0,0.5,0.000000,0.000000
5,0.500000,0.500000,1.000000,0.408248,0.000000,0.0,0.408248,0.000000,0.0,0.408248,...,0.00000,0.0,0.408248,0.707107,0.000000,0.707107,0.0,0.0,0.000000,0.000000
6,0.408248,0.000000,0.408248,1.000000,0.333333,0.0,0.000000,0.000000,0.0,0.333333,...,0.57735,0.0,1.000000,0.577350,0.000000,0.000000,0.0,0.0,0.333333,0.000000
11,0.000000,0.000000,0.000000,0.333333,1.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.57735,0.0,0.333333,0.288675,0.408248,0.000000,0.0,0.0,0.666667,0.408248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461634,0.000000,0.707107,0.707107,0.000000,0.000000,0.0,0.577350,0.000000,0.0,0.000000,...,0.00000,0.0,0.000000,0.500000,0.000000,1.000000,0.0,0.0,0.000000,0.000000
461805,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.00000,1.0,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.000000,0.000000
462788,0.500000,0.500000,0.000000,0.000000,0.000000,0.0,0.408248,0.707107,0.5,0.408248,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.000000,0.000000
463800,0.000000,0.000000,0.000000,0.333333,0.666667,0.0,0.000000,0.000000,0.0,0.000000,...,0.57735,0.0,0.333333,0.288675,0.408248,0.000000,0.0,0.0,1.000000,0.408248


In [13]:
# Name the row axis so the ids are labelled as `movie_id`.
cosine_matrix_df.index = cosine_matrix_df.index.rename('movie_id')

In [14]:
# Turn the row labels (movie ids) into strings.
cosine_matrix_df.index = cosine_matrix_df.index.astype(str)

In [15]:
# The matrix is square (movie x movie), so reuse the row labels as the column labels.
cosine_matrix_df.columns = cosine_matrix_df.index

In [None]:
# Drop the intermediate objects that are no longer needed; `cosine_matrix_df` and
# `movie_ids` are kept for the chunking cells below.
del (
    dff,
    cosine_matrix,
    M,
    M_movie_genre,
    exploded_movie_genre_df,
    M_df1_df2,
    df2,
    df1,
    genre_df,
    movie_genre_df,
)

In [57]:
# Chunk size: number of movie ids per chunk in the chunking loop below
# (chunk i covers positions i*a .. (i+1)*a - 1 of `movie_ids`).
# 21 chunks x 1381 = 29001, which matches the 29001 columns (0..29000) shown for the
# similarity matrix above.
a = 1381

In [110]:
# `chunks` stays empty here: the line that would store the similarity rows per chunk is
# disabled below, so the export loop in the next cell currently has nothing to write.
chunks = {}
# One single-row DataFrame per chunk: columns are the chunk's movie ids and every cell
# holds the chunk number, i.e. a movie_id -> chunk lookup once the frames are concatenated.
chunks_index = []

# Walk over `movie_ids` in steps of `a` instead of assuming exactly 21 chunks, so no id is
# silently dropped (and no empty chunk is produced) when the number of movies changes.
for i, start in enumerate(range(0, len(movie_ids), a)):
  end = start + a

  # Disabled: keep the matching slice of the similarity matrix for this chunk.
  #chunks[f'chunk_{i}'] = cosine_matrix_df[start:end]
  columns = movie_ids[start:end]
  chunks_index.append(pd.DataFrame([[i] * len(columns)], columns=columns))

In [None]:
# Write each stored chunk to its own parquet file, named after the chunk key.
for chunk_name in chunks:
  chunk_df = chunks[chunk_name]
  chunk_df.to_parquet(DRIVE_PATH + f'/__OUTPUT__/movie_genre/movie_genre_{chunk_name}')
  del chunk_df

In [26]:
# Chunk -> movie-id table: one column per chunk ("chunk_0", "chunk_1", ...) whose rows
# list the movie ids assigned to that chunk.
# `chunks_index` is a list of single-row DataFrames (movie ids as column labels), which
# `pd.DataFrame(...)` cannot consume directly, so the ids are read back from each frame's
# columns. Wrapping them in a Series lets a shorter final chunk be padded with NaN instead
# of raising on unequal lengths.
df_chunks_index = pd.DataFrame({
    f'chunk_{chunk_no}': pd.Series(list(chunk_frame.columns))
    for chunk_no, chunk_frame in enumerate(chunks_index)
})
# NOTE(review): the following cell writes a different table to this same path and will
# overwrite this file — confirm which layout is the intended index.
df_chunks_index.to_parquet(DRIVE_PATH + f'/__OUTPUT__/movie_genre/index/movie_genre_index')

In [131]:
# Glue the per-chunk frames side by side into a single row that maps each movie id
# (column label, converted to a string) to its chunk number, then persist it.
d = pd.concat(chunks_index, axis=1).rename(columns=str)
d.to_parquet(DRIVE_PATH + '/__OUTPUT__/movie_genre/index/movie_genre_index')

In [None]:
# Release the similarity matrix and the chunk bookkeeping once everything is written.
del (
    cosine_matrix_df,
    chunks,
    chunks_index,
)

In [46]:
# Preview the chunk -> movie-id table built in an earlier cell.
df_chunks_index

Unnamed: 0,chunk_0,chunk_1,chunk_2,chunk_3,chunk_4,chunk_5,chunk_6,chunk_7,chunk_8,chunk_9,...,chunk_11,chunk_12,chunk_13,chunk_14,chunk_15,chunk_16,chunk_17,chunk_18,chunk_19,chunk_20
0,2,2661,9298,11376,14475,18595,24008,28627,34181,40192,...,51828,62211,75638,88451,111149,143142,192868,250833,297806,356201
1,3,2662,9299,11377,14476,18598,24010,28628,34182,40205,...,51832,62213,75656,88491,111174,143146,192911,250895,297853,356216
2,5,2665,9300,11378,14484,18602,24012,28632,34184,40206,...,51836,62255,75674,88518,111188,143240,192936,250902,297859,356294
3,6,2666,9301,11379,14489,18612,24014,28635,34187,40208,...,51848,62297,75720,88527,111190,143355,193177,250919,297961,356296
4,11,2667,9302,11380,14499,18613,24016,28638,34193,40210,...,51851,62320,75733,88534,111237,143380,193216,250989,298026,356298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376,2642,9293,11371,14463,18587,24001,28601,34145,40162,44564,...,62185,75612,88377,111043,142979,192675,250658,297736,356156,461634
1377,2649,9294,11372,14464,18588,24002,28602,34148,40165,44566,...,62186,75622,88390,111083,143049,192695,250666,297755,356161,461805
1378,2652,9295,11373,14467,18589,24003,28605,34151,40168,44571,...,62188,75623,88395,111100,143073,192712,250700,297761,356189,462788
1379,2654,9296,11374,14469,18590,24004,28609,34152,40172,44578,...,62204,75629,88418,111109,143092,192767,250734,297762,356191,463800


In [None]:
# Builds an all-zero frame with one row per element of `data` and one column per entry of
# `feature_list`, rebinding `d`.
# NOTE(review): neither `data` nor `feature_list` is defined in any cell visible above, so
# this cell raises NameError on a fresh run — looks like leftover scratch; confirm and
# either define the inputs or remove the cell.
d = pd.DataFrame(0, index=np.arange(len(data)), columns=feature_list)