# A Basic Introduction to Topic Modeling with Python 

Let's import the Pandas Data Analysis library to help us organize the text in the dataset.

In [1]:
import pandas as pd

Let's import and initialize the Natural Language Toolkit libraries we need for this task, together with the Regular Expressions library.

In [2]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
import re

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

### Preparing the dataset

Let's import the Harry Potter dataset from Kaggle. (You can install Kaggle on your system using [these](https://www.kaggle.com/docs/api) instructions.) The dataset contains separate .csv files for each Harry Potter film. 

In [4]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    'kornflex/harry-potter-movies-dataset',
    unzip=True)

The file named "movies.csv" contains information about the movies. I want to extract the list of the eight movie titles.

In [5]:
movies_info = pd.read_csv("datasets/movies.csv", sep=";")
print(movies_info)

                                         movie  released_year  running_time  \
0     Harry Potter and the Philosopher's Stone           2001           152   
1      Harry Potter and the Chamber of Secrets           2002           161   
2     Harry Potter and the Prisoner of Azkaban           2004           142   
3          Harry Potter and the Goblet of Fire           2005           157   
4    Harry Potter and the Order of the Phoenix           2007           138   
5       Harry Potter and the Half-Blood Prince           2009           153   
6  Harry Potter and the Deathly Hallows Part 1           2010           146   
7  Harry Potter and the Deathly Hallows Part 2           2011           130   

      budget  box_office     file  
0  125000000  1002000000  hp1.csv  
1  100000000   880300000  hp2.csv  
2  130000000   796700000  hp3.csv  
3  150000000   896400000  hp4.csv  
4  150000000   942000000  hp5.csv  
5  250000000   943200000  hp6.csv  
6  200000000   976900000  hp7.csv  


In [6]:
movie_titles = movies_info.movie.tolist()
print(movie_titles)

["Harry Potter and the Philosopher's Stone", 'Harry Potter and the Chamber of Secrets', 'Harry Potter and the Prisoner of Azkaban', 'Harry Potter and the Goblet of Fire', 'Harry Potter and the Order of the Phoenix', 'Harry Potter and the Half-Blood Prince', 'Harry Potter and the Deathly Hallows Part 1', 'Harry Potter and the Deathly Hallows Part 2']


Now I will create a list of lists containing the dialogue for each movie.

In [7]:
all_scripts = []
for i in range(1,9):
    all_scripts.append(" ".join(pd.read_csv(f"datasets/hp{str(i)}.csv").fillna("").dialog.tolist()))

### Preprocessing the texts

In [8]:
stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [9]:
def clean_text(text):
    return re.sub(r"[\W]+", " ", text)

def lemmatize_text(text):
    tag_dictionary = {'NN':wordnet.NOUN,'JJ':wordnet.ADJ,'VB':wordnet.VERB,'RB':wordnet.ADV}
    lemmatized_text = ""
    for word, tag in pos_tag(word_tokenize(text)):
        if word in stopwords:
            pass
        elif tag[:2] in tag_dictionary:
            lemmatized_text += str(lemmatizer.lemmatize(word, tag_dictionary[tag[:2]])) + " "
        else:
            lemmatized_text += str(lemmatizer.lemmatize(word)) + " "
    return lemmatized_text

def preprocess_text(text):
    return lemmatize_text(clean_text(text))

In [10]:
preprocessed_scripts = [preprocess_text(script) for script in all_scripts]

## Topic Modeling with Bag-of-words Model

In [11]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(preprocessed_scripts)
feature_names = vectorizer.get_feature_names()
df_bow = pd.DataFrame(counts.T.todense(),
                      index=feature_names,
                      columns=movie_titles)

In [12]:
print(df_bow.iloc[:,0].sort_values(ascending=False).head(10))

go       102
harry     95
know      73
you       72
it        72
get       68
oh        54
see       51
one       49
what      48
Name: Harry Potter and the Philosopher's Stone, dtype: int64


In [28]:
def print_top_words(df):
    for title in movie_titles:
        top_words = df.loc[:, title].sort_values(ascending=False).index.tolist()[:5]
        top_scores = df.loc[:, title].sort_values(ascending=False).values.tolist()[:5]
        print("- " + title + ":",
              "\n",
             [f"{top_words[i]} ({round(top_scores[i], 2)})" for i in range(5)],
             "\n")

In [29]:
print("# Main topics in Harry Potter scripts using Bag-of-word model #\n")
print_top_words(df_bow)

# Main topics in Harry Potter scripts using Bag-of-word model #

- Harry Potter and the Philosopher's Stone: 
 ['go (102)', 'harry (95)', 'know (73)', 'you (72)', 'it (72)'] 

- Harry Potter and the Chamber of Secrets: 
 ['harry (106)', 'you (96)', 'go (88)', 'it (68)', 'get (66)'] 

- Harry Potter and the Prisoner of Azkaban: 
 ['harry (95)', 'you (93)', 'come (87)', 'it (71)', 'go (64)'] 

- Harry Potter and the Goblet of Fire: 
 ['harry (80)', 'you (68)', 'know (58)', 'the (50)', 'what (45)'] 

- Harry Potter and the Order of the Phoenix: 
 ['harry (123)', 'you (104)', 'potter (74)', 'go (73)', 'get (70)'] 

- Harry Potter and the Half-Blood Prince: 
 ['you (109)', 'know (102)', 'it (82)', 'harry (80)', 'one (74)'] 

- Harry Potter and the Deathly Hallows Part 1: 
 ['it (96)', 'you (94)', 'know (94)', 'harry (78)', 'think (67)'] 

- Harry Potter and the Deathly Hallows Part 2: 
 ['you (85)', 'it (60)', 'harry (55)', 'know (46)', 'but (36)'] 



## Topic modeling with Bag-of-words and Tf-idf scores

In [30]:
transformer = TfidfTransformer(norm=None,
                              smooth_idf=True,
                              use_idf=True,
                              sublinear_tf=True)
tfidf_scores_transformed = transformer.fit_transform(counts)
df_tf_idf = pd.DataFrame(tfidf_scores_transformed.T.todense(),
                        index=feature_names,
                        columns=movie_titles)

In [31]:
print("# Main topics in Harry Potter scripts using Bag-of-word model and TFIDF #\n")
print_top_words(df_tf_idf)

# Main topics in Harry Potter scripts using Bag-of-word model and TFIDF #

- Harry Potter and the Philosopher's Stone: 
 ['flamel (8.51)', 'fluffy (8.27)', 'whoa (7.78)', 'philosopher (7.71)', 'quirrell (7.71)'] 

- Harry Potter and the Chamber of Secrets: 
 ['dobby (9.28)', 'heir (8.73)', 'ho (8.01)', 'petrified (8.01)', 'hee (8.01)'] 

- Harry Potter and the Prisoner of Azkaban: 
 ['pettigrew (9.11)', 'buckbeak (8.28)', 'peter (8.01)', 'riddikulus (7.71)', 'boggart (7.71)'] 

- Harry Potter and the Goblet of Fire: 
 ['champion (9.74)', 'krum (9.29)', 'tri (8.73)', 'barty (8.01)', 'cedric (7.78)'] 

- Harry Potter and the Order of the Phoenix: 
 ['prophesy (9.74)', 'mysteries (9.45)', 'stupify (8.01)', 'umbridge (7.64)', 'kreacher (7.31)'] 

- Harry Potter and the Half-Blood Prince: 
 ['slughorn (9.6)', 'horace (8.93)', 'cormac (8.01)', 'cabinet (7.71)', 'vow (7.38)'] 

- Harry Potter and the Deathly Hallows Part 1: 
 ['bathilda (8.27)', 'bagshot (8.01)', 'kreacher (7.64)', 'dobby (7.