In [1]:
import numpy as np
import pandas as pd
import pickle
import re
from tqdm import tqdm
from numpy.random import multinomial
from numpy import log, exp
from numpy import argmax
import json

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from nltk import word_tokenize, sent_tokenize
import unicodedata

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#**Step 1: Data Cleaning and Preprocessing**

In [3]:
dataset = pd.read_csv("/content/video_df.csv")

In [29]:
dataset.columns

Index(['Unnamed: 0', 'video_id', 'channelTitle', 'title', 'description',
       'tags', 'publishedAt', 'viewCount', 'likeCount', 'favouriteCount',
       'commentCount', 'duration', 'definition', 'caption'],
      dtype='object')

In [4]:
def clean_text(text):
  text = text.lower().replace("'","").replace('[^\w\s]', ' ').replace(" \d+", " ").strip()
  return text

In [5]:
from nltk.corpus import stopwords
stop_words_list = stopwords.words('english') + ['surfer', 'surf', 'surfing', 'th', 'rd']

In [6]:
def basic_clean(original):
    word = original.lower().strip()
    word = unicodedata.normalize('NFKD', word)\
                                .encode('ascii', 'ignore')\
                                .decode('utf-8', 'ignore')
    word = re.sub(r"[^a-z'\s]", '', word)
    word = word.replace('\n',' ')
    word = word.replace('\t',' ')
    word = word.replace("'","")
    return word

def remove_stopwords(original, extra_words=[], exclude_words=[]):
    stopword_list = stopwords.words('english')

    for word in extra_words:
        stopword_list.append(word)
    for word in exclude_words:
        stopword_list.remove(word)

    words = original.split()
    filtered_words = [w for w in words if w not in stop_words_list]
    filtered_words1 = [w for w in filtered_words if len(w) > 1]

    original_nostop = ' '.join(filtered_words1)

    return original_nostop


def stem(original):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in original.split()]
    original_stemmed = ' '.join(stems)
    return original_stemmed

In [7]:
for i in dataset['title'][:10]:
  print(i)

Surfing Will Change Your Life For The Better, In Ways You Have Never Imagined
Surfing + Corporate Development = The Perfect Combination
Using Surf Therapy To Help Frontline Workers Thrive | Aussie Frontline
Surfer Escapes Into Nature
Surfer Improves Their Pop Up x100 In One Surf
Party Waves Are The Best Waves!
Progress At An All Time Rate In Here
Wave Pools Are A Surfers Greatest Cheat Code
Special Session Under The Stars
How To Surf From A Beginner To Intermediate In FIVE Days


In [8]:
docs = []
for sentence in dataset['title']:
    words = word_tokenize(stem(remove_stopwords(basic_clean(sentence))))
    docs.append(words)

In [9]:
docs[:10]

[['chang', 'life', 'better', 'way', 'never', 'imagin'],
 ['corpor', 'develop', 'perfect', 'combin'],
 ['use',
  'therapi',
  'help',
  'frontlin',
  'worker',
  'thrive',
  'aussi',
  'frontlin'],
 ['escap', 'natur'],
 ['improv', 'pop', 'one'],
 ['parti', 'wave', 'best', 'wave'],
 ['progress', 'time', 'rate'],
 ['wave', 'pool', 'surfer', 'greatest', 'cheat', 'code'],
 ['special', 'session', 'star'],
 ['beginn', 'intermedi', 'five', 'day']]

#**Step 2: GSMM Topic Modeling**

In [10]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [11]:
class SurfingVideoCoreProcess:
    def __init__(self, K=8, alpha=0.1, beta=0.1, n_iters=30):
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.n_iters = n_iters

        # slots for computed variables
        self.number_docs = None
        self.vocab_size = None
        self.cluster_doc_count = [0 for _ in range(K)]
        self.cluster_word_count = [0 for _ in range(K)]
        self.cluster_word_distribution = [{} for i in range(K)]

    @staticmethod
    def from_data(K, alpha, beta, D, vocab_size, cluster_doc_count, cluster_word_count, cluster_word_distribution):
        '''
        Reconstitute a SurfingVideoCoreProcess from previously fit data
        :param K:
        :param alpha:
        :param beta:
        :param D:
        :param vocab_size:
        :param cluster_doc_count:
        :param cluster_word_count:
        :param cluster_word_distribution:
        :return:
        '''
        svcp = SurfingVideoCoreProcess(K, alpha, beta, n_iters=30)
        svcp.number_docs = D
        svcp.vocab_size = vocab_size
        svcp.cluster_doc_count = cluster_doc_count
        svcp.cluster_word_count = cluster_word_count
        svcp.cluster_word_distribution = cluster_word_distribution
        return svcp

    @staticmethod
    def _sample(p):
        '''
        Sample with probability vector p from a multinomial distribution
        :param p: list
            List of probabilities representing probability vector for the multinomial distribution
        :return: int
            index of randomly selected output
        '''
        return [i for i, entry in enumerate(multinomial(1, p)) if entry != 0][0]

    def fit(self, docs, vocab_size):
        '''
        Cluster the input documents
        :param docs: list of list
            list of lists containing the unique token set of each document
        :param V: total vocabulary size for each document
        :return: list of length len(doc)
            cluster label for each document
        '''
        alpha, beta, K, n_iters, V = self.alpha, self.beta, self.K, self.n_iters, vocab_size

        D = len(docs)
        self.number_docs = D
        self.vocab_size = vocab_size

        # unpack to easy var names
        m_z, n_z, n_z_w = self.cluster_doc_count, self.cluster_word_count, self.cluster_word_distribution
        cluster_count = K
        d_z = [None for i in range(len(docs))]

        # initialize the clusters
        for i, doc in enumerate(docs):

            # choose a random  initial cluster for the doc
            z = self._sample([1.0 / K for _ in range(K)])
            d_z[i] = z
            m_z[z] += 1
            n_z[z] += len(doc)

            for word in doc:
                if word not in n_z_w[z]:
                    n_z_w[z][word] = 0
                n_z_w[z][word] += 1

        for _iter in range(n_iters):
            total_transfers = 0

            for i, doc in enumerate(docs):

                # remove the doc from it's current cluster
                z_old = d_z[i]

                m_z[z_old] -= 1
                n_z[z_old] -= len(doc)

                for word in doc:
                    n_z_w[z_old][word] -= 1

                    # compact dictionary to save space
                    if n_z_w[z_old][word] == 0:
                        del n_z_w[z_old][word]

                # draw sample from distribution to find new cluster
                p = self.score(doc)
                z_new = self._sample(p)

                # transfer doc to the new cluster
                if z_new != z_old:
                    total_transfers += 1

                d_z[i] = z_new
                m_z[z_new] += 1
                n_z[z_new] += len(doc)

                for word in doc:
                    if word not in n_z_w[z_new]:
                        n_z_w[z_new][word] = 0
                    n_z_w[z_new][word] += 1

            cluster_count_new = sum([1 for v in m_z if v > 0])
            print("In stage %d: transferred %d clusters with %d clusters populated" % (
            _iter, total_transfers, cluster_count_new))
            if total_transfers == 0 and cluster_count_new == cluster_count and _iter>25:
                print("Converged.  Breaking out.")
                break
            cluster_count = cluster_count_new
        self.cluster_word_distribution = n_z_w
        return d_z

    def score(self, doc):
        '''
        Score a document

        Implements formula (3) of Yin and Wang 2014.
        http://dbgroup.cs.tsinghua.edu.cn/wangjy/papers/KDD14-GSDMM.pdf

        :param doc: list[str]: The doc token stream
        :return: list[float]: A length K probability vector where each component represents
                              the probability of the document appearing in a particular cluster
        '''
        alpha, beta, K, V, D = self.alpha, self.beta, self.K, self.vocab_size, self.number_docs
        m_z, n_z, n_z_w = self.cluster_doc_count, self.cluster_word_count, self.cluster_word_distribution

        p = [0 for _ in range(K)]

        #  We break the formula into the following pieces
        #  p = N1*N2/(D1*D2) = exp(lN1 - lD1 + lN2 - lD2)
        #  lN1 = log(m_z[z] + alpha)
        #  lN2 = log(D - 1 + K*alpha)
        #  lN2 = log(product(n_z_w[w] + beta)) = sum(log(n_z_w[w] + beta))
        #  lD2 = log(product(n_z[d] + V*beta + i -1)) = sum(log(n_z[d] + V*beta + i -1))

        lD1 = log(D - 1 + K * alpha)
        doc_size = len(doc)
        for label in range(K):
            lN1 = log(m_z[label] + alpha)
            lN2 = 0
            lD2 = 0
            for word in doc:
                lN2 += log(n_z_w[label].get(word, 0) + beta)
            for j in range(1, doc_size +1):
                lD2 += log(n_z[label] + V * beta + j - 1)
            p[label] = exp(lN1 - lD1 + lN2 - lD2)

        # normalize the probability vector
        pnorm = sum(p)
        pnorm = pnorm if pnorm>0 else 1
        return [pp/pnorm for pp in p]

    def choose_best_label(self, doc):
        '''
        Choose the highest probability label for the input document
        :param doc: list[str]: The doc token stream
        :return:
        '''
        p = self.score(doc)
        return argmax(p),max(p)

In [12]:
svcp = SurfingVideoCoreProcess(K=10, alpha=0.1, beta=1, n_iters=30)

vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)

y = svcp.fit(docs, n_terms)

In stage 0: transferred 7594 clusters with 10 clusters populated
In stage 1: transferred 5007 clusters with 10 clusters populated
In stage 2: transferred 2786 clusters with 10 clusters populated
In stage 3: transferred 1767 clusters with 10 clusters populated
In stage 4: transferred 1361 clusters with 10 clusters populated
In stage 5: transferred 1138 clusters with 10 clusters populated
In stage 6: transferred 1007 clusters with 10 clusters populated
In stage 7: transferred 966 clusters with 9 clusters populated
In stage 8: transferred 950 clusters with 9 clusters populated
In stage 9: transferred 920 clusters with 9 clusters populated
In stage 10: transferred 850 clusters with 8 clusters populated
In stage 11: transferred 891 clusters with 8 clusters populated
In stage 12: transferred 904 clusters with 8 clusters populated
In stage 13: transferred 864 clusters with 9 clusters populated
In stage 14: transferred 858 clusters with 7 clusters populated
In stage 15: transferred 805 cluster

In [13]:
doc_count = np.array(svcp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

Number of documents per topic : [   0    5  549    0 1926    0 3631 2013  505  192]


In [14]:
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [6 7 4 2 8 9 1 5 3 0]


In [15]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(svcp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print('-'*120)

In [16]:
top_words(svcp.cluster_word_distribution, top_index,10)

Cluster 6 : [('wave', 675), ('session', 242), ('day', 214), ('big', 214), ('ep', 186), ('best', 158), ('barrel', 157), ('surfer', 156), ('portug', 139), ('kai', 138)]
------------------------------------------------------------------------------------------------------------------------
Cluster 7 : [('rip', 1409), ('curl', 1390), ('pro', 421), ('gromsearch', 289), ('day', 180), ('final', 179), ('search', 176), ('mirag', 175), ('highlight', 170), ('present', 153)]
------------------------------------------------------------------------------------------------------------------------
Cluster 4 : [('pipelin', 1132), ('jan', 373), ('dec', 369), ('angl', 354), ('backdoor', 315), ('john', 203), ('wave', 199), ('januari', 175), ('florenc', 147), ('feb', 144)]
------------------------------------------------------------------------------------------------------------------------
Cluster 2 : [('cam', 90), ('surflin', 80), ('rewind', 63), ('swell', 60), ('north', 60), ('live', 54), ('shore', 53)

In [17]:
topic_dict = {}
topic_names = ['Topic #1',
               'Topic #2',
               'Topic #3',
               'Topic #4',
               'Topic #5',
               'Topic #6',
               'Topic #7',
               'Topic #8',
               'Topic #9',
               'Topic #10',
               'Topic #11',
               'Topic #12',
               'Topic #13',
               'Topic #14',
               'Topic #15'
              ]
for i, topic_num in enumerate(top_index):
    topic_dict[topic_num]=topic_names[i]

In [18]:
topic_dict

{6: 'Topic #1',
 7: 'Topic #2',
 4: 'Topic #3',
 2: 'Topic #4',
 8: 'Topic #5',
 9: 'Topic #6',
 1: 'Topic #7',
 5: 'Topic #8',
 3: 'Topic #9',
 0: 'Topic #10'}

In [21]:
def create_topics_dataframe(data_text=dataset['title'],  svcp=svcp, threshold=0.3, topic_dict=topic_dict, stem_text=docs):
    result = pd.DataFrame(columns=['text', 'topic', 'stems'])
    for i, text in enumerate(data_text):
        result.at[i, 'text'] = text
        result.at[i, 'stems'] = stem_text[i]
        prob = svcp.choose_best_label(stem_text[i])
        if prob[1] >= threshold:
            result.at[i, 'topic'] = topic_dict[prob[0]]
        else:
            result.at[i, 'topic'] = 'Other'
    return result

In [22]:
dfx = create_topics_dataframe(data_text=dataset['title'],  svcp=svcp, threshold=0.3, topic_dict=topic_dict, stem_text=docs)

In [23]:
dfx

Unnamed: 0,text,topic,stems
0,"Surfing Will Change Your Life For The Better, In Ways You Have Never Imagined",Topic #1,"[chang, life, better, way, never, imagin]"
1,Surfing + Corporate Development = The Perfect Combination,Topic #2,"[corpor, develop, perfect, combin]"
2,Using Surf Therapy To Help Frontline Workers Thrive | Aussie Frontline,Topic #4,"[use, therapi, help, frontlin, worker, thrive, aussi, frontlin]"
3,Surfer Escapes Into Nature,Topic #2,"[escap, natur]"
4,Surfer Improves Their Pop Up x100 In One Surf,Topic #1,"[improv, pop, one]"
5,Party Waves Are The Best Waves!,Topic #1,"[parti, wave, best, wave]"
6,Progress At An All Time Rate In Here,Topic #1,"[progress, time, rate]"
7,Wave Pools Are A Surfers Greatest Cheat Code,Topic #1,"[wave, pool, surfer, greatest, cheat, code]"
8,Special Session Under The Stars,Topic #1,"[special, session, star]"
9,How To Surf From A Beginner To Intermediate In FIVE Days,Topic #1,"[beginn, intermedi, five, day]"


In [24]:
dfx.topic.value_counts(dropna=False)

Topic #1    3738
Topic #2    1988
Topic #3    1896
Topic #4     533
Topic #5     480
Topic #6     180
Other          3
Topic #7       3
Name: topic, dtype: int64

###**Conclusions:**###
The number of topics highlighted by the GSMM analysis makes more sense to me given that there are 6 main categories.

Looking at the words associated with the categories, these are likely to be what the topics are about:

- Topic 1: Big wave surfing at Nazare
- Topic 2: RipCurl Commpetition
- Topic 3: Surfing at Pipeline
- Topic 4: Fun surfing in the States
- Topic 5: Competitions ([Kelly Slater](https://en.wikipedia.org/wiki/Kelly_Slater) (the GOAT) features strongly)
- Topic 6: Fun surfing in Bali
- Topic 7: Surf Training and techniques


Again, pro surfers and locations show the categories strongly.