# Inferring Topics from IMDB Reviews

In [3]:
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import matplotlib.pyplot as plt

## Exploring the Dataset: [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)

In [4]:
# Directory whose files are read in as reviews by the loading loop below.
# The path is relative, so it resolves against the notebook's working directory.
ROOT = 'aclImdb/aclImdb/train/pos/'

In [5]:
# Read every regular file in ROOT into memory, one string per review.
#
# sorted(): os.listdir() returns entries in arbitrary order, and later cells
# refer to reviews by position (e.g. reviews[23]), so the order must be pinned
# for the notebook to be reproducible.
#
# encoding='utf-8': without it open() uses the platform's locale encoding, so
# the same files can decode differently (or fail) on another machine.
# NOTE(review): UTF-8 is assumed for the dataset files -- confirm.
reviews = []
for file in sorted(os.listdir(ROOT)):
    path = os.path.join(ROOT, file)
    # Skip sub-directories and anything else that is not a regular file.
    if os.path.isfile(path):
        with open(path, 'r', encoding='utf-8') as fin:
            reviews.append(fin.read())

In [6]:
# Number of reviews collected by the loading loop (one entry per file read).
len(reviews)

12500

In [11]:
# Print the first five reviews, each followed by a 150-character rule so the
# boundary between consecutive reviews is easy to see.
# Iterating over a slice (instead of indexing with range(5)) also works when
# fewer than five reviews were loaded, where indexing would raise IndexError.
for review in reviews[:5]:
    print(review)
    print('=' * 150)

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!
Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to

## Feature Extraction

In [8]:
# Build the TF-IDF document-term matrix from the raw review strings,
# dropping scikit-learn's built-in English stop words.
vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(reviews)

# Preview the matrix as a labelled table (one row per review, one column per
# vocabulary term). X is sparse; calling X.toarray() would allocate a dense
# rows x vocabulary float array just for display, so build a sparse-backed
# DataFrame straight from X instead.
pd.DataFrame.sparse.from_spmatrix(X, columns=vect.get_feature_names_out())

Unnamed: 0,00,000,000s,003830,006,007,0079,0080,0083,0093638,...,élan,émigré,émigrés,était,état,étc,êxtase,ís,østbye,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF Decomposition

In [12]:
# Factorise the TF-IDF matrix X into N_TOPICS non-negative components.
N_TOPICS = 15
# random_state is fixed so repeated runs give the same factorisation: the
# cells below attach labels to topics by their number, which is only
# meaningful if topic numbering is stable from run to run.
# NOTE(review): after pinning the seed, re-check that the hand-written topic
# labels still match the top words of each topic.
nmf = NMF(n_components=N_TOPICS, random_state=0)
W = nmf.fit_transform(X)  # Document-topic matrix
H = nmf.components_       # Topic-term matrix



In [15]:
# Top 10 words per topic

# How many of the highest-weighted terms to list for each topic.
N_TOP_WORDS = 10

# get_feature_names() is no longer available in current scikit-learn releases;
# get_feature_names_out() is the replacement and is what the feature-extraction
# cell already uses.
words = np.asarray(vect.get_feature_names_out())

# For every topic (row of H), order term indices by descending weight and keep
# the first N_TOP_WORDS of them.
top_ix = np.argsort(H, axis=1)[:, ::-1][:, :N_TOP_WORDS]

# Fancy-indexing `words` with the (topics x N_TOP_WORDS) index array yields the
# table of words directly, so no placeholder frame needs to be filled row by row.
topic_words = pd.DataFrame(
    words[top_ix],
    index=[f'Topic {i + 1}' for i in range(N_TOPICS)],
    columns=[f'Word {i + 1}' for i in range(N_TOP_WORDS)],
)

topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,ll,spoilers,end,simply,yes,spoiler,quite,just
Topic 2,movie,movies,watch,recommend,10,seen,saw,best,actors,definitely
Topic 3,film,films,director,characters,seen,cinema,festival,work,scenes,art
Topic 4,series,episode,episodes,season,tv,characters,trek,seasons,shows,television
Topic 5,man,role,character,performance,best,plays,john,played,does,actor
Topic 6,good,pretty,story,bad,acting,really,job,liked,nice,little
Topic 7,war,world,documentary,people,american,history,soldiers,men,women,hitler
Topic 8,funny,comedy,laugh,hilarious,eddie,fun,jokes,humor,funniest,murphy
Topic 9,like,think,really,just,don,people,know,say,didn,lot
Topic 10,time,years,saw,seen,dvd,old,remember,ve,music,disney


In [13]:
# Human-readable labels for a subset of the topics, keyed by the topic's
# row/column name ('Topic <number>'). Topics that are absent from this
# mapping simply have no label.
topic_mapping = dict([
    ('Topic 4', 'TV'),
    ('Topic 7', 'War'),
    ('Topic 8', 'Comedy'),
    ('Topic 12', 'Book Adaptation'),
    ('Topic 13', 'Horror'),
    ('Topic 15', 'Martial Arts / Action'),
])

In [19]:
# Recall the document-topic matrix, W

# Wrap W in a DataFrame with one named column per topic so that the column
# labels line up with the keys of topic_mapping.
topic_columns = [f'Topic {i + 1}' for i in range(N_TOPICS)]
W = pd.DataFrame(W, columns=topic_columns)

# Label each document with the name of its highest-weighted topic.
# idxmax(axis=1) finds the winning column for all rows in one vectorised call
# (instead of a Python lambda per row); restricting it to topic_columns keeps
# any non-topic column out of the comparison. Mapping through
# topic_mapping.get leaves None for documents whose top topic has no label.
W['max_topic'] = W[topic_columns].idxmax(axis=1).map(topic_mapping.get)

# Show the first ten documents that received a label.
W[pd.notnull(W['max_topic'])].head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
18,0.03284,0.001606,0.029848,0.000523,0.01123,0.014157,0.0,0.0,0.008036,0.0,0.00352,0.042767,0.006794,0.0,0.0,Book Adaptation
23,0.0,0.0,0.0,0.0,0.018293,0.002263,0.0,0.0,0.001097,0.0,0.00541,0.0,0.042975,0.001779,0.0,Horror
24,0.026185,0.002999,0.008563,0.000693,0.00752,0.0,0.0,0.000347,0.0,0.033115,0.005137,0.0,0.06473,0.0,9e-05,Horror
33,0.02485,0.0,0.020555,0.002875,0.002399,0.002216,0.0,2e-06,0.011658,0.004302,0.0,0.007074,0.064364,0.001002,0.003901,Horror
44,0.000273,0.0,0.0,0.0,0.020231,0.003413,0.012703,0.0,0.002757,0.022486,0.020912,0.0,0.037059,0.0,0.001353,Horror
55,0.037855,0.003525,0.030094,0.014966,0.025385,0.017682,0.002882,0.0,0.002527,0.007348,0.007329,0.0,0.070148,0.002537,0.0,Horror
66,0.034383,0.014843,0.000254,3.9e-05,0.00244,0.013318,0.003707,0.000823,0.003222,0.0,0.0,0.057809,0.0,0.0,0.006336,Book Adaptation
70,6.9e-05,0.0,0.013443,0.0,0.00796,0.001259,0.005076,0.0,0.0,0.0,0.015355,0.001556,0.0,0.018036,0.021396,Martial Arts / Action
72,0.048072,0.006536,0.026512,0.0,0.0,0.0,0.067874,0.0,0.01371,0.017972,0.004302,0.0,0.0,0.0,0.001539,War
77,0.014749,0.0,0.035474,0.101275,0.0,0.00847,0.0,0.0,0.006707,0.0,0.0,0.078955,0.0,0.0,0.0,TV


In [21]:
reviews[23]

'The legendary Boris Karloff ended his illustrious career by making four cheapie fright flick clunkers in Mexico. This is the token moody period Gothic horror entry from the bunch. Karloff gives a typically spry and dignified performance as Matthias Morteval, an elderly eccentric patriarch who invites several of his petty, greedy and backbiting no-count relatives to his creepy rundown castle for the reading of a will. Pretty soon the hateful guests are getting bumped off by lethal life-sized toy people who populate the place. Onetime Mexican sex symbol Andres Garcia of "Tintorera" infamy portrays the dashing police officer hero and Julissa looks absolutely ravishing as the sole likable female character. The clunky, plodding (non)direction, trite by-the-numbers script, ugly, washed-out cinematography, ridiculous murder set pieces (a gross fat slob gets blasted right in the face by a miniature cannon!), overwrought string score, morbid gloom-doom atmosphere, largely lousy acting (Karloff