In [1]:
# 设置 LSL/LDA 的主题数
nTopics = 10

# 散点图的最大点数
points = 1000

# 设置分析主题
subject = 'Star Wars EP4'

In [2]:
# 科学计算基础包
import numpy as np
# 数据处理及导入导出
import pandas as pd

# 数据可视化
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
# 更好的可视化效果
import seaborn as sns
# 创建交互式图形
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# 矢量化文本
from sklearn.feature_extraction.text import CountVectorizer
# 分解文本
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
# 可视化高维数据集
from sklearn.manifold import TSNE

# 标记单词
from textblob import TextBlob

# 使用新的数据类型
from collections import Counter

# 停用词
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [3]:
# 导入数据
df = pd.read_table('data/SW_EpisodeIV.txt', delim_whitespace=True, header=0, escapechar='\\').rename(columns={'dialogue': 'text'}).dropna(subset=['text'], axis=0)

In [4]:
print('DataFrame Shape: {}'.format(df.shape))

DataFrame Shape: (1010, 2)


In [5]:
df.sample(5)

Unnamed: 0,character,text
633,LUKE,See-Threepio! See-Threepio!
601,HAN,Go get him!
494,HAN,Not for long...
550,LUKE,I didn't hear you give any ideas...
182,LUKE,He knew my father?


In [6]:
# 矢量化文本
countVectorizer = CountVectorizer(stop_words=stop)
vectorizedText = countVectorizer.fit_transform(df['text'].str.replace("'", '').values)
print('Shape Vectorized Text: {}'.format(vectorizedText.shape))

Shape Vectorized Text: (1010, 1686)


In [7]:
# 设置常见词数量
n = 20

In [8]:
def nMostFrequentWords(n, countVectorizer, vectorizedText): 
    """
        得出最常见的单词及其出现的次数
    
        Args:
            n: n most frequent words, int
            countVectorizer: CountVectorizer
            vectorizedText: vectorized text, string
            
        Returns:
            words: most frequent words, list
            wordCounts: count word appearences, list
    """
    # 计算单词在文本中出现的次数
    vectorizedCount = np.sum(vectorizedText, axis=0)
    
    # 获取单词索引和计数
    wordIndices = np.flip(np.argsort(vectorizedCount), 1)
    wordCounts = np.flip(np.sort(vectorizedCount),1)

    # 创建单词向量
    wordVectors = np.zeros((n, vectorizedText.shape[1]))
    for i in range(n):
        wordVectors[i, wordIndices[0,i]] = 1

    # 逆转换单词向量
    words = [word[0].encode('ascii').decode('utf-8') for word in countVectorizer.inverse_transform(wordVectors)]

    # 返回最常见的单词及其出现的次数
    return (words, wordCounts[0, :n].tolist()[0])

In [9]:
words, wordCounts = nMostFrequentWords(n=n, countVectorizer=countVectorizer, vectorizedText=vectorizedText)

In [10]:
# 创建色彩映射
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

In [11]:
# 生成柱状图
data = go.Bar(x = words,
              y = wordCounts,
              marker = dict(color = colors))

layout = go.Layout(title = 'Most Frequent {} Words In {}'.format(n, subject),
                   xaxis = dict(title = 'Words'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [17]:
# 创建 LSI
lsiModel = TruncatedSVD(n_components=nTopics)
lsiTopicMatrix = lsiModel.fit_transform(vectorizedText)
print('Shape LSI Topic Matrix: {}'.format(lsiTopicMatrix.shape))

Shape LSI Topic Matrix: (1010, 10)


In [18]:
lsiKeys = lsiTopicMatrix.argmax(axis=1)
lsiCategories, lsiCounts = zip(*Counter(lsiKeys).items())

In [19]:
def getTopWords(n, lsiKeys, vectorizedText, countVectorizer):
    # Create empty array for mean
    wordMean = np.zeros((nTopics, vectorizedText.shape[1]))
    # Iterate over each topic
    for i in np.unique(lsiKeys):
        wordMean[i] += vectorizedText.toarray()[lsiKeys==i].mean(axis=0)
        
    # Sort and get the most frequent n words for each topic
    topWordsIndices = np.flip(np.argsort(wordMean, axis=1)[:, -n:], axis=1)
    topWordsPercentage = (np.divide(np.flip(np.sort(wordMean, axis=1)[:, -n:], axis=1), (np.sum(wordMean, axis=1)+0.0000001)[:, None])*100).astype(int)


    # Store all words for all topics
    topWords = []

    # Iterate over the topics with its indices
    for i, (topic, percentage) in enumerate(zip(topWordsIndices, topWordsPercentage)):
        # Store all words for one topic
        topicWords = []

        if i in np.unique(lsiKeys):
            # Iterate over the indices for the topic
            for index, percent in zip(topic, percentage):
                # Create a wordvector for the index
                wordVector = np.zeros((vectorizedText.shape[1]))
                wordVector[index] = 1
                # Inverse-transfor the wordvector
                word = countVectorizer.inverse_transform(wordVector)[0][0]
                # Store the word
                topicWords.append('{}% '.format(percent) + word.encode('ascii').decode('utf-8'))
        # Store all words for the topic
        topWords.append(', '.join(topicWords))

    return topWords

In [20]:
# Get top n words
topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)

# Print the topics and its words
for i, words in enumerate(topWords):
    print('Topic {}: {}'.format(i, words))

Topic 0: 2% dont, 1% going, 1% well, 1% know, 1% youre
Topic 1: 18% im, 3% going, 1% sorry, 1% sir, 1% luke
Topic 2: 5% wan, 5% obi, 4% kenobi, 3% help, 2% hes
Topic 3: 23% luke, 3% hold, 1% force, 1% pull, 1% keep
Topic 4: 12% going, 5% blast, 4% close, 3% youre, 2% open
Topic 5: 3% one, 3% sir, 2% red, 2% right, 1% ive
Topic 6: 12% get, 2% back, 1% well, 1% ship, 1% youll
Topic 7: 2% us, 1% station, 1% rebel, 1% ship, 1% base
Topic 8: 14% come, 7% threepio, 3% whats, 2% see, 2% could
Topic 9: 7% artoo, 6% see, 4% lost, 4% yes, 3% along


In [21]:
# Sort data
lsiCategoriesSorted, lsiCountsSorted = zip(*sorted(zip(lsiCategories, lsiCounts)))

# Create labels
topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in lsiCategoriesSorted]

# Create colormap
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Create plot
data = go.Bar(x = labels,
              y = lsiCountsSorted,
              text = [word for word in topWords if word],
              marker = dict(color = colors))

layout = go.Layout(title = 'Most Frequent LSI Topics In {}'.format(subject),
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [22]:
# Transform high dimensional dataset to visualize in 2D
tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsneModelVectors = tsneModel.fit_transform(lsiTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1010 samples in 0.040s...
[t-SNE] Computed neighbors for 1010 samples in 0.065s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1010
[t-SNE] Computed conditional probabilities for sample 1010 / 1010
[t-SNE] Mean sigma: 0.028632
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.878483
[t-SNE] Error after 2000 iterations: 0.567075


In [23]:
# Create colormap
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Get n top words
topWords = getTopWords(3, lsiKeys, vectorizedText, countVectorizer)


# Create plot
data = []
# Iterate over each topic
for topic in range(nTopics):
    # Mask for a single topic
    mask = lsiKeys==topic
    # Mask for sampling
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(points/nTopics)] = True
    np.random.shuffle(sample_mask)
    
    scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                         y = tsneModelVectors[mask,1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         text = df[mask]['text'][sample_mask],
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} LSI Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [24]:
# Create LDA and fit
ldaModel = LatentDirichletAllocation(n_components=nTopics, learning_method='online', random_state=0, verbose=0)
ldaTopicMatrix = ldaModel.fit_transform(vectorizedText)
print('Shape LSI Topic Matrix: {}'.format(ldaTopicMatrix.shape))

Shape LSI Topic Matrix: (1010, 10)


In [25]:
ldaKeys = ldaTopicMatrix.argmax(axis=1)
ldaCategories, ldaCounts = zip(*Counter(ldaKeys).items())

In [26]:
# Get top n words
topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)

# Print the topics and its words
for i, words in enumerate(topWords):
    print('Topic {}: {}'.format(i, words))

Topic 0: 5% im, 2% oh, 2% dont, 2% think, 1% sir
Topic 1: 3% get, 3% ship, 2% one, 1% blast, 0% well
Topic 2: 6% whats, 3% im, 3% three, 2% sorry, 2% red
Topic 3: 3% theyre, 2% hold, 2% didnt, 2% star, 1% death
Topic 4: 2% sir, 1% dont, 1% youre, 1% back, 1% imperial
Topic 5: 3% going, 2% right, 2% well, 1% youre, 1% look
Topic 6: 2% station, 2% yes, 2% battle, 1% sir, 1% side
Topic 7: 3% come, 2% cant, 2% see, 2% go, 2% got
Topic 8: 4% help, 3% obi, 3% wan, 3% kenobi, 2% long
Topic 9: 3% luke, 2% dont, 1% us, 1% force, 1% think


In [27]:
# Sort data
ldaCategoriesSorted, ldaCountsSorted = zip(*sorted(zip(ldaCategories, ldaCounts)))

# Create labels
topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in ldaCategoriesSorted]

# Create colormap
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Create plot
data = go.Bar(x = labels,
              y = ldaCountsSorted,
              text = [word for word in topWords if word],
              marker = dict(color = colors))

layout = go.Layout(title = 'Most Frequent LDA Topics In {}'.format(subject),
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [28]:
tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsneModelVectors = tsneModel.fit_transform(ldaTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1010 samples in 0.001s...
[t-SNE] Computed neighbors for 1010 samples in 0.062s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1010
[t-SNE] Computed conditional probabilities for sample 1010 / 1010
[t-SNE] Mean sigma: 0.120236
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.681850
[t-SNE] Error after 1750 iterations: 0.304519


In [29]:
# Create colormap
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Get n top words
topWords = getTopWords(3, ldaKeys, vectorizedText, countVectorizer)


# Create plot
data = []
# Iterate over each topic
for topic in range(nTopics):
    # Mask for a single topic
    mask = ldaKeys==topic
    # Mask for sampling
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(points/nTopics)] = True
    np.random.shuffle(sample_mask)
    
    scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                         y = tsneModelVectors[mask,1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         text = df[mask]['text'][sample_mask],
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} LDA Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [30]:
# Create PCA and fit
pcaModel = PCA(n_components=nTopics, random_state=0)
pcaTopicMatrix = pcaModel.fit_transform(vectorizedText.toarray())
print('Shape PCA Topic Matrix: {}'.format(pcaTopicMatrix.shape))

Shape PCA Topic Matrix: (1010, 10)


In [31]:
pcaKeys = pcaTopicMatrix.argmax(axis=1)
pcaCategories, pcaCounts = zip(*Counter(pcaKeys).items())

In [32]:
# Get top n words
topWords = getTopWords(5, pcaKeys, vectorizedText, countVectorizer)

# Print the topics and its words
for i, words in enumerate(topWords):
    print('Topic {}: {}'.format(i, words))

Topic 0: 11% im, 3% going, 1% dont, 1% think, 1% sir
Topic 1: 10% dont, 4% come, 2% know, 1% see, 1% like
Topic 2: 1% sir, 1% see, 1% cant, 1% obi, 1% wan
Topic 3: 19% luke, 1% going, 1% come, 1% force, 1% empire
Topic 4: 10% going, 3% youre, 2% theyre, 2% look, 1% uncle
Topic 5: 5% one, 3% got, 2% red, 2% ive, 2% right
Topic 6: 11% get, 3% back, 1% ship, 1% right, 1% got
Topic 7: 3% hes, 3% sir, 1% thats, 1% oh, 1% ive
Topic 8: 2% us, 1% ship, 1% rebel, 1% station, 1% base
Topic 9: 3% well, 2% right, 1% go, 1% youre, 1% ill


In [33]:
# Sort data
pcaCategoriesSorted, pcaCountsSorted = zip(*sorted(zip(pcaCategories, pcaCounts)))

# Create labels
topWords = getTopWords(5, pcaKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in pcaCategoriesSorted]

# Create colormap
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Create plot
data = go.Bar(x = labels,
              y = pcaCountsSorted,
              text = [word for word in topWords if word],
              marker = dict(color = colors))

layout = go.Layout(title = 'Most Frequent PCA Topics In {}'.format(subject),
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [34]:
tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsneModelVectors = tsneModel.fit_transform(pcaTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1010 samples in 0.001s...
[t-SNE] Computed neighbors for 1010 samples in 0.070s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1010
[t-SNE] Computed conditional probabilities for sample 1010 / 1010
[t-SNE] Mean sigma: 0.030573
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.658203
[t-SNE] Error after 2000 iterations: 0.554718


In [35]:
# Create colormap
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Get n top words
topWords = getTopWords(3, pcaKeys, vectorizedText, countVectorizer)


# Create plot
data = []
# Iterate over each topic
for topic in range(nTopics):
    # Mask for a single topic
    mask = pcaKeys==topic
    # Mask for sampling
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(points/nTopics)] = True
    np.random.shuffle(sample_mask)
    
    scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                         y = tsneModelVectors[mask,1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         text = df[mask]['text'][sample_mask],
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} PCA Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [36]:
spcaModel = SparsePCA(n_components=nTopics, random_state=0)
spcaTopicMatrix = spcaModel.fit_transform(vectorizedText.toarray())
print('Shape SPCA Topic Matrix: {}'.format(spcaTopicMatrix.shape))

Shape SPCA Topic Matrix: (1010, 10)


In [37]:
spcaKeys = spcaTopicMatrix.argmax(axis=1)
spcaCategories, spcaCounts = zip(*Counter(spcaKeys).items())

In [38]:
# Get top n words
topWords = getTopWords(5, spcaKeys, vectorizedText, countVectorizer)

# Print the topics and its words
for i, words in enumerate(topWords):
    print('Topic {}: {}'.format(i, words))

Topic 0: 1% going, 1% luke, 1% get, 0% right, 0% come
Topic 1: 10% im, 6% sir, 1% sure, 1% luke, 0% thats
Topic 2: 4% youre, 4% wan, 4% obi, 4% kenobi, 3% help
Topic 3: 
Topic 4: 9% dont, 4% think, 3% know, 1% get, 1% worry
Topic 5: 10% one, 1% youre, 1% got, 0% see, 0% take
Topic 6: 
Topic 7: 
Topic 8: 
Topic 9: 10% well, 1% get, 1% going, 1% hes, 1% come


In [39]:
# Transform high dimensional dataset to visualize in 2D
tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsneModelVectors = tsneModel.fit_transform(spcaTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1010 samples in 0.002s...
[t-SNE] Computed neighbors for 1010 samples in 0.053s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1010
[t-SNE] Computed conditional probabilities for sample 1010 / 1010
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.623669
[t-SNE] Error after 2000 iterations: 0.514924


In [40]:
# Create colormap
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Get n top words
topWords = getTopWords(3, spcaKeys, vectorizedText, countVectorizer)


# Create plot
data = []
# Iterate over each topic
for topic in range(nTopics):
    # Mask for a single topic
    mask = spcaKeys==topic
    # Mask for sampling
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(points/nTopics)] = True
    np.random.shuffle(sample_mask)
    
    scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                         y = tsneModelVectors[mask,1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         text = df[mask]['text'][sample_mask],
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} SPCA Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [41]:
text = "Hey, Han Solo what's up?"

textVector = countVectorizer.transform([text])
newTransformedVector = spcaModel.transform(textVector.toarray())
topic = np.argmax(newTransformedVector)
print('Topic {}: {} '.format(topic, text))

Topic 0: Hey, Han Solo what's up? 
