In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
pd.options.mode.chained_assignment = None

In [2]:
df = pd.read_table("tvs-data.tsv")
df = df[df["genres"]!="\\N"]

In [3]:
df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0032557,tvSeries,The Green Archer,The Green Archer,0,1940,\N,285,"Action,Adventure,Crime"
2,tt0035803,tvSeries,The German Weekly Review,Die Deutsche Wochenschau,0,1940,1945,12,"Documentary,News"
3,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0,1946,1955,15,Talk-Show
4,tt0039120,tvSeries,Americana,Americana,0,1947,1949,30,"Family,Game-Show"
5,tt0039121,tvSeries,Birthday Party,Birthday Party,0,1947,1949,30,Family
...,...,...,...,...,...,...,...,...,...
208165,tt9916206,tvSeries,Nojor,Nojor,0,2019,\N,20,Fantasy
208167,tt9916216,tvSeries,Kalyanam Mudhal Kadhal Varai,Kalyanam Mudhal Kadhal Varai,0,2014,2017,22,Romance
208168,tt9916218,tvSeries,Lost in Food,Lost in Food,0,2016,2017,\N,Talk-Show
208169,tt9916380,tvSeries,Meie aasta Aafrikas,Meie aasta Aafrikas,0,2019,\N,43,"Adventure,Comedy,Family"


In [4]:
df = df[['primaryTitle','genres']]

In [5]:
df['genres'] = df['genres'].map(lambda x : x.lower().split(','))

In [6]:
df.set_index('primaryTitle',inplace = True)
df.head()

Unnamed: 0_level_0,genres
primaryTitle,Unnamed: 1_level_1
The Green Archer,"[action, adventure, crime]"
The German Weekly Review,"[documentary, news]"
You Are an Artist,[talk-show]
Americana,"[family, game-show]"
Birthday Party,[family]


In [7]:
df['genre list'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        words = words + ' '.join(row[col])+ ' '
    row['genre list'] = words

In [8]:
df.drop(columns = [col for col in df.columns if col!= 'genre list'], inplace = True)

In [9]:
df.reset_index()

Unnamed: 0,primaryTitle,genre list
0,The Green Archer,action adventure crime
1,The German Weekly Review,documentary news
2,You Are an Artist,talk-show
3,Americana,family game-show
4,Birthday Party,family
...,...,...
190018,Nojor,fantasy
190019,Kalyanam Mudhal Kadhal Varai,romance
190020,Lost in Food,talk-show
190021,Meie aasta Aafrikas,adventure comedy family


In [10]:
df = df.iloc[0:20000, :]

In [11]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['genre list'])

In [12]:
count_matrix

<20000x32 sparse matrix of type '<class 'numpy.int64'>'
	with 36618 stored elements in Compressed Sparse Row format>

In [13]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [14]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.70710678],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.70710678, 0.        ,
        1.        ]])

In [15]:
cosine_sim.shape

(20000, 20000)

In [16]:
data = cosine_sim

In [17]:
df2 = pd.DataFrame(data)

In [19]:
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19990,19991,19992,19993,19994,19995,19996,19997,19998,19999
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.408248,0.333333,0.0,0.0,1.0,0.333333,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.408248,1.0,0.57735,0.0,0.0,0.0,0.0,0.333333,...,0.816497,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248
4,0.0,0.0,0.0,0.57735,1.0,0.0,0.0,0.0,0.0,0.57735,...,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107
