In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
pd.options.mode.chained_assignment = None

In [23]:
df = pd.read_table("tvs-data.tsv",usecols=['primaryTitle','genres'])
df = df[df["genres"]!="\\N"]

In [24]:
df

Unnamed: 0,primaryTitle,genres
0,The Green Archer,"Action,Adventure,Crime"
2,The German Weekly Review,"Documentary,News"
3,You Are an Artist,Talk-Show
4,Americana,"Family,Game-Show"
5,Birthday Party,Family
...,...,...
208165,Nojor,Fantasy
208167,Kalyanam Mudhal Kadhal Varai,Romance
208168,Lost in Food,Talk-Show
208169,Meie aasta Aafrikas,"Adventure,Comedy,Family"


In [25]:
df['genres'] = df['genres'].map(lambda x : x.lower().replace(','," "))

In [26]:
df = df.iloc[0:20000, :]

In [27]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['genres'])

In [28]:
count_matrix

<20000x32 sparse matrix of type '<class 'numpy.int64'>'
	with 36618 stored elements in Compressed Sparse Row format>

In [29]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [30]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.70710678],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.70710678, 0.        ,
        1.        ]])

In [31]:
cosine_sim.shape

(20000, 20000)

In [32]:
df2 = pd.DataFrame(cosine_sim)

In [33]:
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19990,19991,19992,19993,19994,19995,19996,19997,19998,19999
0,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.333333,0.00000,0.000000,...,0.000000,0.408248,0.333333,0.000000,0.000000,1.000000,0.333333,0.000000,0.00000,0.000000
1,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
2,0.000000,0.0,1.000000,0.408248,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
3,0.000000,0.0,0.408248,1.000000,0.577350,0.000000,0.00000,0.000000,0.00000,0.333333,...,0.816497,0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.408248
4,0.000000,0.0,0.000000,0.577350,1.000000,0.000000,0.00000,0.000000,0.00000,0.577350,...,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.707107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.333333,0.00000,0.000000,...,0.000000,0.408248,0.333333,0.000000,0.000000,1.000000,0.333333,0.000000,0.00000,0.000000
19996,0.333333,0.0,0.000000,0.000000,0.000000,0.000000,0.57735,0.333333,0.57735,0.000000,...,0.000000,0.000000,0.333333,0.000000,0.333333,0.333333,1.000000,0.000000,0.57735,0.000000
19997,0.000000,0.0,0.000000,0.000000,0.000000,0.707107,0.00000,0.000000,0.00000,0.577350,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.00000,0.707107
19998,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.00000,0.577350,1.00000,0.000000,...,0.000000,0.000000,0.577350,0.000000,0.577350,0.000000,0.577350,0.000000,1.00000,0.000000


In [34]:
df.set_index('primaryTitle',inplace = True)
df.head()

Unnamed: 0_level_0,genres
primaryTitle,Unnamed: 1_level_1
The Green Archer,action adventure crime
The German Weekly Review,documentary news
You Are an Artist,talk-show
Americana,family game-show
Birthday Party,family


In [35]:
indices = pd.Series(df.index)

In [36]:
def recommendations(title, cosine_sim = cosine_sim):
    
    recomended_webseries = []
    
    idx = indices[indices == title].index[0]
    
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    top_10_indexes = list(score_series.iloc[2:12].index)
    
    for i in top_10_indexes:
        recomended_webseries.append(list(df.index)[i])
        
    return recomended_webseries

In [37]:
recommendations('Aquí está Pancho Villa')

['Otpisani',
 'Kapelski kresovi',
 'The Gallant Men',
 'Tour of Duty',
 'Code Name: Foxfire',
 'Baza na Dunavu',
 'The Residents of Washington Heights',
 'Black Sash',
 'Data Tutashkhia',
 'Salas u Malom Ritu']