In [1]:
# Import packages (install them into the mlenv environment first)
from sentence_transformers import SentenceTransformer, util
import pandas as pd

In [2]:

# Load the SentenceTransformer model used to encode every description, by checkpoint name
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)


In [4]:
# Paths to the cleaned CSV exports; both live in the same resources folder
resources_dir = '../Final Resources'
file_anime = resources_dir + '/cleaned_anime.csv'
file_la = resources_dir + '/cleaned_live_actions.csv'

In [5]:
# Load both cleaned CSVs into DataFrames.
# index_col=0 reuses the first CSV column as the index, so there aren't 2 index columns.
anime_df, la_df = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (file_anime, file_la)
)

In [6]:
# Pull the anime synopses out of the DataFrame as a plain Python list
anime_desc = anime_df.loc[:, 'synopsis'].to_list()

In [7]:
# Pull the live-action (netflix, hulu, amazon prime, disney+) descriptions out as a plain Python list
la_desc = la_df.loc[:, 'description'].to_list()

In [8]:
# Split anime row positions by synopsis type: `strings` collects the positions whose
# synopsis is a str, `floats` collects every other position (rows without a text synopsis)
strings = [pos for pos, text in enumerate(anime_desc) if type(text) is str]
floats = [pos for pos, text in enumerate(anime_desc) if type(text) is not str]


len(floats)

0

In [9]:
# Split live-action row positions by description type: `string` collects the positions
# whose description is a str, `floaters` collects every other position (rows without a
# text description).
# Results go into `string`, not the anime cell's `strings`, so the two checks stay
# independent and this cell also runs on its own.
string = [i for i, desc in enumerate(la_desc) if isinstance(desc, str)]
floaters = [i for i, desc in enumerate(la_desc) if not isinstance(desc, str)]


len(floaters)

0

In [10]:
# Number of live-action descriptions in the list
len(la_desc)

21784

In [11]:
# Number of anime synopses in the list
len(anime_desc)

5111

In [12]:
# Encode the anime synopses (the live-action descriptions are encoded in a later cell)
anime_embedding = model.encode(anime_desc) # Encoding the anime list; the result is a numpy.ndarray

In [13]:
# Check which container type model.encode returned
type(anime_embedding)

numpy.ndarray

In [14]:
# pd.Series only accepts 1-D data, so passing the embedding array directly raises
# "ValueError: Data must be 1-dimensional". Wrapping it in list() makes each element
# of the Series one row of the array (one embedding vector).
column_series = pd.Series(list(anime_embedding))

In [15]:
# A DataFrame accepts the embedding array directly: each embedding dimension becomes a column
anime_embedding_df = pd.DataFrame(anime_embedding)
anime_embedding_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.080896,0.081521,0.024639,-0.022273,-0.024698,-0.013718,0.044134,0.03028,-0.033405,0.055939,...,-0.055394,0.071931,-0.046239,-0.001261,-0.007824,0.006315,0.126777,-0.066777,-0.097394,0.070937
1,-0.073111,-0.014546,0.009811,0.016464,-0.024401,0.05763,0.045597,-0.05784,0.012522,-0.041396,...,0.02733,0.015306,-0.027152,0.042169,-0.085612,-0.011866,0.028435,-0.010502,-0.008784,0.040417
2,-0.069815,0.049462,0.002211,0.026143,0.0173,-0.049455,0.046423,0.010373,0.023637,-0.033681,...,0.055453,0.00542,0.010394,0.062388,-0.098413,0.097373,0.047905,0.014281,-0.004877,-0.025122
3,-0.116074,0.059125,-0.009722,-0.019543,-0.049341,-0.033883,0.051601,-0.032328,0.009772,-0.006919,...,-0.043402,-0.034193,-0.055788,-0.00617,-0.041296,0.014373,0.101554,0.008523,0.037134,-0.12319
4,-0.113074,0.056182,-0.065571,0.069393,-0.05986,0.009886,0.022183,-0.060608,0.021883,0.013101,...,0.013625,-0.016732,-0.080794,-0.033309,0.036567,0.0613,0.046308,0.000651,-0.012857,-0.04247


In [16]:
la_embedding = model.encode(la_desc) # Encoding the live-action (netflix, hulu, amazon prime, disney+) list

In [17]:
# Add encodings as a column append for each list

### This block raises an error, so it is left commented out: ###

#la_df['la_embeddings'] = la_embedding
#anime_df['anime_embeddings'] = anime_embedding
#anime_df.head()

Adding the embeddings for each description as a DataFrame column will not be very efficient: each embedding is a list of 384 numbers. Change of plan: run the comparisons and pull the top 5 most similar shows instead.

In [18]:
# Small practice subsets so the similarity logic can be checked quickly:
# the first 20 live-action embeddings and the first 10 anime embeddings
practice_la_embed = la_embedding[:20]
practice_anime_embed = anime_embedding[:10]

In [19]:
# Compute cosine similarities between the practice live-action and practice anime embeddings.
# Rows follow practice_la_embed (20) and columns follow practice_anime_embed (10).
cos_sim = util.cos_sim(practice_la_embed, practice_anime_embed)
cos_sim


tensor([[ 0.0485,  0.2290,  0.2079,  0.1554,  0.2385,  0.1965,  0.1879,  0.2988,
          0.2498,  0.0855],
        [ 0.1681,  0.1254,  0.1798,  0.0575,  0.1273,  0.0204,  0.2033,  0.1580,
          0.2727,  0.1083],
        [ 0.2328,  0.1589,  0.2318,  0.2531,  0.1374,  0.2239,  0.1444,  0.3077,
          0.2273,  0.3492],
        [ 0.0453,  0.0412,  0.1235,  0.1116,  0.0797,  0.1347,  0.1202,  0.1185,
          0.1120,  0.1209],
        [ 0.2692,  0.1526,  0.1197, -0.0107,  0.1000,  0.2105,  0.1939,  0.1550,
          0.4049,  0.1125],
        [ 0.0446,  0.1250,  0.1993,  0.0797,  0.0887,  0.3739,  0.3397,  0.2987,
          0.2851,  0.1897],
        [ 0.1878,  0.1236,  0.2373,  0.1676,  0.1393,  0.1592,  0.3691,  0.1891,
          0.2991,  0.2184],
        [ 0.0329,  0.0933,  0.1938,  0.1096,  0.0907,  0.1464,  0.1703,  0.1733,
          0.1481,  0.1881],
        [ 0.1814,  0.0475,  0.0274,  0.0819, -0.0396,  0.0139,  0.0846,  0.0386,
          0.0354,  0.1515],
        [ 0.0070,  

In [20]:
# Check which container type util.cos_sim returned
type(cos_sim)

torch.Tensor

In [21]:
# First row: the first practice live-action entry scored against each of the 10 practice anime
cos_sim[0]

tensor([0.0485, 0.2290, 0.2079, 0.1554, 0.2385, 0.1965, 0.1879, 0.2988, 0.2498,
        0.0855])

In [22]:
# Titles for the practice live-action rows (the next cell assigns this same value again)
practice_la_titles = la_df["title"][0:20]

In [23]:
## This is just to visualize the tensor so the for loop below can be checked more easily :)
# Label the similarity matrix: anime titles across the columns, live-action titles down the rows.
practice_anime_titles = anime_df["title"][0:10]
practice_la_titles = la_df["title"][0:20]
pract_cossim_df = pd.DataFrame(cos_sim).astype("float")
# set_axis already returns a new frame; the `inplace` keyword was removed in pandas 2.0
# (passing it raises TypeError there), so it is not passed.
pract_cossim_df = pract_cossim_df.set_axis(practice_anime_titles, axis=1)
pract_cossim_df = pract_cossim_df.set_index(practice_la_titles, drop=False)
pract_cossim_df

title,Haikyuu!! Second Season,Shigatsu wa Kimi no Uso,Made in Abyss,Fullmetal Alchemist: Brotherhood,Kizumonogatari III: Reiketsu-hen,Mob Psycho 100 II,Sen to Chihiro no Kamikakushi,Kimetsu no Yaiba,Owarimonogatari 2nd Season,Code Geass: Hangyaku no Lelouch R2
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Dick Johnson Is Dead,0.048462,0.228976,0.207877,0.155418,0.238541,0.19655,0.187894,0.298825,0.249774,0.085515
Blood & Water,0.168082,0.125389,0.179788,0.057479,0.127291,0.020444,0.20326,0.157965,0.272744,0.108262
Ganglands,0.232796,0.158869,0.231843,0.253069,0.137395,0.223872,0.144429,0.307729,0.227324,0.349184
Jailbirds New Orleans,0.045336,0.041186,0.123505,0.111594,0.079721,0.134749,0.120208,0.118505,0.112013,0.120944
Kota Factory,0.269244,0.152552,0.119683,-0.010675,0.099996,0.210451,0.193913,0.154989,0.404905,0.112476
Midnight Mass,0.044584,0.125038,0.199332,0.079701,0.0887,0.373918,0.339699,0.298685,0.285089,0.189656
My Little Pony: A New Generation,0.187775,0.123587,0.237272,0.167637,0.139279,0.15921,0.369115,0.189125,0.299094,0.218383
Sankofa,0.032912,0.093252,0.193839,0.1096,0.090696,0.146372,0.170312,0.173284,0.14809,0.188143
The Great British Baking Show,0.181354,0.047517,0.02743,0.081923,-0.039601,0.013864,0.084615,0.038641,0.035353,0.151465
The Starling,0.007047,0.087087,0.156699,0.075254,0.114919,0.047746,0.281396,0.176999,0.108321,-0.013935


In [30]:
# Recommend the most similar anime for each practice live-action title.
top_n = 2  # how many anime to recommend per live-action title

# Column j of cos_sim corresponds to the j-th anime row, so titles are looked up by POSITION.
# anime_df is indexed by uid, so a label lookup such as anime_df["title"][0] raises
# KeyError: 0 and anime_df["title"][1] returns more than one row; .iloc avoids both.
candidate_titles = anime_df["title"].iloc[:cos_sim.shape[1]].tolist()
k = min(top_n, len(candidate_titles))

recs = []
# for each row in the cos_sim tensor (one row per live-action title)
for scores in cos_sim:
    # column positions of the k highest similarities in this row, best first.
    # Ranking by position (not by a title-keyed dict) keeps anime with duplicate titles apart.
    best_cols = scores.topk(k).indices.tolist()
    recs.append([candidate_titles[col] for col in best_cols])

# Build the practice frame here, as an explicit copy, so this cell does not depend on a
# cell further down the notebook and does not write into a slice of la_df.
practice_la_df = la_df.iloc[:len(recs)].copy()
practice_la_df['recommended animes'] = recs

practice_la_df

In [25]:
# Practice slice of la_df, matching the 20 rows of cos_sim.
# NOTE(review): this cell sits below the recommendation cell that also uses practice_la_df;
# re-running it resets the frame and discards the 'recommended animes' column.
practice_la_df = la_df[0:20]

In [26]:
# Spot-check the first three similarity scores against their anime titles.
# anime_df is indexed by uid, so anime_df["title"][0] is a label lookup and raises
# KeyError: 0; .iloc fetches the title by row position, which is what cos_sim columns use.
for col in range(3):
    print(cos_sim[0][col], anime_df["title"].iloc[col])

In [27]:
# Number of rows in cos_sim (one per practice live-action embedding)
len(cos_sim)

20

In [28]:
# Number of columns in cos_sim (one per practice anime embedding)
len(cos_sim[0])

10

In [29]:
# Label-based lookup: the index is `uid`, and uid 1 appears on two rows, so this returns
# a two-row Series rather than the single title at position 1 (.iloc[1] would be positional)
anime_df["title"][1]

uid
1    Cowboy Bebop
1    Cowboy Bebop
Name: title, dtype: object

In [None]:
# Make list from outputs; join anime title to encoding list index

# Make a for-loop to identify top 5 most similar anime  


In [None]:
# Append top 5 most similar anime to each movie as new column in movie df