In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
pd.options.mode.chained_assignment = None

In [3]:
df = pd.read_table("tvs-data.tsv",usecols=['primaryTitle','genres'])
df = df[df["genres"]!="\\N"]

In [4]:
df

Unnamed: 0,primaryTitle,genres
0,The Green Archer,"Action,Adventure,Crime"
2,The German Weekly Review,"Documentary,News"
3,You Are an Artist,Talk-Show
4,Americana,"Family,Game-Show"
5,Birthday Party,Family
...,...,...
208165,Nojor,Fantasy
208167,Kalyanam Mudhal Kadhal Varai,Romance
208168,Lost in Food,Talk-Show
208169,Meie aasta Aafrikas,"Adventure,Comedy,Family"


In [5]:
df['genres'] = df['genres'].map(lambda x : x.lower().split(','))

In [6]:
df = df.iloc[0:40000, :]

In [7]:
df.set_index('primaryTitle',inplace = True)
df.head()

Unnamed: 0_level_0,genres
primaryTitle,Unnamed: 1_level_1
The Green Archer,"[action, adventure, crime]"
The German Weekly Review,"[documentary, news]"
You Are an Artist,[talk-show]
Americana,"[family, game-show]"
Birthday Party,[family]


In [8]:
df['genre list'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        words = words + ' '.join(row[col])+ ' '
    row['genre list'] = words

In [9]:
df

Unnamed: 0_level_0,genres,genre list
primaryTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
The Green Archer,"[action, adventure, crime]",action adventure crime
The German Weekly Review,"[documentary, news]",documentary news
You Are an Artist,[talk-show],talk-show
Americana,"[family, game-show]",family game-show
Birthday Party,[family],family
...,...,...
Amethyst,[family],family
The legacy of Adam,[animation],animation
PBC Face to Face,[sport],sport
PBC Countdown,[sport],sport


In [10]:
df.drop(columns = [col for col in df.columns if col!= 'genre list'], inplace = True)

In [11]:
df.reset_index()

Unnamed: 0,primaryTitle,genre list
0,The Green Archer,action adventure crime
1,The German Weekly Review,documentary news
2,You Are an Artist,talk-show
3,Americana,family game-show
4,Birthday Party,family
...,...,...
39995,Amethyst,family
39996,The legacy of Adam,animation
39997,PBC Face to Face,sport
39998,PBC Countdown,sport


In [11]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['genre list'])

In [12]:
count_matrix

<20000x32 sparse matrix of type '<class 'numpy.int64'>'
	with 36618 stored elements in Compressed Sparse Row format>

In [13]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [14]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.70710678],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.70710678, 0.        ,
        1.        ]])

In [15]:
cosine_sim.shape

(20000, 20000)

In [16]:
data = cosine_sim

In [17]:
df2 = pd.DataFrame(data)

In [19]:
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19990,19991,19992,19993,19994,19995,19996,19997,19998,19999
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.408248,0.333333,0.0,0.0,1.0,0.333333,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.408248,1.0,0.57735,0.0,0.0,0.0,0.0,0.333333,...,0.816497,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248
4,0.0,0.0,0.0,0.57735,1.0,0.0,0.0,0.0,0.0,0.57735,...,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107
