In [3]:
# Import Packages & Data(Corpus)
import pandas as pd
df = pd.read_csv("Movies_Dataset.csv")

In [4]:
df.head()

Unnamed: 0,id,title,overview
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,1,Jumanji,When siblings Judy and Peter discover an encha...
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [5]:
# Explore Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        45466 non-null  int64 
 1   title     45460 non-null  object
 2   overview  44507 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [6]:
# Data preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [7]:
documents = df['overview'].values.astype("U")

In [8]:
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)

In [9]:
# Running Kmeans
k = 10
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
model.fit(features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [10]:
# Cluster Analysis
df['cluster'] = model.labels_

In [11]:
df.head()

Unnamed: 0,id,title,overview,cluster
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1
1,1,Jumanji,When siblings Judy and Peter discover an encha...,1
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...,3
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",9
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...,1


In [12]:
# Output the result to a text file.

clusters = df.groupby('cluster')    

for cluster in clusters.groups:
    f = open('cluster'+str(cluster)+ '.csv', 'w') # create csv file
    data = clusters.get_group(cluster)[['title','overview']] # get title and overview columns
    f.write(data.to_csv(index_label='id')) # set index to id
    f.close()

In [41]:
print("Cluster centroids: \n")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(k):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster centroids: 

Cluster 0:
 police
 murder
 killer
 detective
 serial
 cop
 case
 officer
 crime
 young
------------
Cluster 1:
 life
 new
 world
 young
 story
 time
 group
 overview
 years
 friends
------------
Cluster 2:
 film
 documentary
 story
 director
 life
 directed
 feature
 based
 movie
 world
------------
Cluster 3:
 family
 life
 father
 home
 mother
 son
 new
 young
 old
 man
------------
Cluster 4:
 love
 woman
 young
 falls
 life
 story
 husband
 girl
 fall
 new
------------
Cluster 5:
 nan
 ݣ1890
 frazier
 fraw
 fray
 frayed
 fraying
 frayn
 frazer
 frazzled
------------
Cluster 6:
 war
 world
 ii
 civil
 american
 soldiers
 story
 german
 army
 soldier
------------
Cluster 7:
 school
 old
 year
 high
 girl
 life
 new
 teacher
 boy
 students
------------
Cluster 8:
 town
 small
 new
 local
 sheriff
 young
 life
 girl
 man
 family
------------
Cluster 9:
 man
 young
 life
 woman
 wife
 story
 finds
 old
 father
 new
------------
