In [1]:
import pandas as pd

# Load the pipe-delimited file, discard rows with missing values,
# then lower-case every remaining cell.
df_idf = (
    pd.read_csv("movies_merged.csv", sep='|')
    .dropna()
    .applymap(str.lower)
)

In [2]:
# show the lower-cased 'Text' column; it is the document collection used below
df_idf['Text']

0        too few films take on the art of arguing as a ...
1                     the film leaves a tremendous impact.
3        from 1957 and first-time director sidney lumet...
4        mechanically written, but within its own middl...
5                          a strangely realistic thriller.
6        this movie is a masterpiece. that term gets th...
7        in the 60 years since its release, sidney lume...
8        with each new viewing i come away feelings as ...
9        lumet in his first film records it as a meticu...
10       ...the film has aged surprisingly well in the ...
11       there is real value is how it allows each memb...
12       the film takes a confined, almost completely b...
13       the cast is incredible, the writing superb, an...
14       this was sidney lumet's first movie and it's a...
15                           an incisive and gripping film
16       the explosive qualities and historical importa...
17       lumet keeps things tense, sweaty, suspenseful .

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
    """Load stop words from a text file holding one word per line.

    Args:
        stop_file_path: Path of a UTF-8 encoded text file.

    Returns:
        frozenset of the whitespace-stripped, non-empty lines of the file.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising readlines(), and
        # skip blank lines so the empty string is never registered as a stop word.
        stripped_lines = (line.strip() for line in f)
        return frozenset(word for word in stripped_lines if word)

#load a set of stop words
stopwords=get_stop_words("stopwords.txt")

#get the text column as a plain list of documents
docs=df_idf['Text'].tolist()

#create a vocabulary of words,
#ignore words that appear in more than 85% of documents,
#eliminate stop words
#(stop_words is documented as 'english', a list or None; newer scikit-learn
# releases validate this and reject a frozenset, so pass a list)
cv=CountVectorizer(max_df=0.85,stop_words=sorted(stopwords))
word_count_vector=cv.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [4]:
# (number of documents, vocabulary size) of the fitted term-count matrix
word_count_vector.shape

(62008, 35025)

In [5]:
# re-fit the vectorizer with the vocabulary capped at 10,000 terms;
# stop_words is passed as a list because newer scikit-learn releases
# reject a frozenset for this parameter
cv=CountVectorizer(max_df=0.85,stop_words=sorted(stopwords),max_features=10000)
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(62008, 10000)

In [6]:
# iterating the vocabulary mapping yields its keys (the terms); show the first ten
list(cv.vocabulary_)[:10]

['films',
 'take',
 'art',
 'arguing',
 'subject',
 'certainly',
 'lumet',
 'window',
 'strained',
 'civic']

We can also get the vocabulary by using `get_feature_names()` (deprecated in scikit-learn 1.0 and removed in 1.2 in favour of `get_feature_names_out()`):

In [7]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it
list(
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)[2000:2015]

['creaky',
 'cream',
 'create',
 'created',
 'creates',
 'creating',
 'creation',
 'creations',
 'creative',
 'creatively',
 'creativity',
 'creator',
 'creators',
 'creature',
 'creatures']

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# fit the TF-IDF transformer on the term-count matrix; the learned IDF
# weights are exposed afterwards through tfidf_transformer.idf_
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

Let's look at some of the IDF values:

In [9]:
# IDF weights learned by the fit on word_count_vector
tfidf_transformer.idf_

array([9.32698461, 6.89923638, 8.02770163, ..., 7.410062  , 8.56929891,
       8.0837911 ])

In [10]:
# get test docs into a list
# NOTE: this is the same 'Text' column the vectorizer was fitted on, not a held-out set
docs_test=df_idf['Text'].tolist()

## Computing TF-IDF and Extracting Keywords

In [11]:
def sort_coo(coo_matrix):
    """Return the matrix's (column, value) pairs ordered by value, then column, both descending."""
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # A reversed 2-tuple is (value, column): value is the primary key, column breaks ties.
    pairs.sort(key=lambda pair: pair[::-1], reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items.

    Args:
        feature_names: Sequence indexable by feature index that gives each
            feature's name.
        sorted_items: Sequence of (feature index, score) pairs, already
            ordered best-first.
        topn: Number of leading pairs to keep.

    Returns:
        dict mapping feature name -> score rounded to 3 decimals, in the same
        order as the kept pairs.
    """
    # Build the mapping directly from the leading pairs; no intermediate
    # parallel lists are needed.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

In [13]:
# the feature names only need to be fetched once;
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when the installed version provides it
if hasattr(cv, "get_feature_names_out"):
    feature_names=cv.get_feature_names_out()
else:
    feature_names=cv.get_feature_names()

# get the document that we want to extract keywords from
doc=docs_test[0]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])


===Keywords===
civic 0.332
arguing 0.329
mightily 0.318
strained 0.314
lumet 0.314
duty 0.304
window 0.276
serve 0.275
continue 0.254
certainly 0.206


In [16]:
# put the common code into several methods
def get_keywords(idx, topn=10):
    """Extract the top TF-IDF keywords for one document of ``docs_test``.

    Relies on the notebook-level ``cv``, ``tfidf_transformer``, ``docs_test``
    and ``feature_names`` objects.

    Args:
        idx: Position of the document inside ``docs_test``.
        topn: Maximum number of keywords to return (defaults to 10, the
            value that used to be hard-coded).

    Returns:
        dict mapping keyword -> TF-IDF score rounded to 3 decimals,
        highest score first.
    """
    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs_test[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #keep only the topn best-scoring terms
    return extract_topn_from_vector(feature_names,sorted_items,topn)

def print_results(idx,keywords):
    """Print a keywords header followed by one "keyword score" line per entry.

    ``idx`` is accepted but not used by the body.
    """
    print("\n===Keywords===")
    for term, score in keywords.items():
        print(term, score)



Now let's look at the keywords generated for another review:


In [17]:
# pick another document by its position in docs_test and print its keywords
idx=120
keywords=get_keywords(idx)
print_results(idx,keywords)


===Keywords===
mankiewicz 0.409
joseph 0.393
eve 0.384
qualities 0.348
positive 0.341
list 0.323
amazing 0.276
top 0.26
performances 0.213
