# Community Detection
### Stage 1
***

## Importing packages <a class="anchor" id="import"></a>

In [2]:
#files packages
from os import listdir
from os.path import isfile, join
import pathlib

import pandas as pd
import nltk
import string

#stemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import WordNetLemmatizer, SnowballStemmer

#stopWords
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words("english")


channels = ['techchap', "dave2d", "ijustine", "mkbhd", "unboxtherapy"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
df = pd.DataFrame()
for channel in channels:
    files = [f for f in listdir("Youtube-Data/"+ channel + "/"+ channel + "/" + channel + "-subtitles")]
    for file in files: 
        try:
            with open (join("Youtube-Data/"+ channel + "/"+ channel + "/" + channel + "-subtitles", file), "r") as myfile:
                data=myfile.read().replace('\n', ' ')
            # clean the data from all punctuation 
            data = data.translate(str.maketrans('', '', string.punctuation))

            nltk_tokens = nltk.word_tokenize(data)
            cleanTokens = [x for x in nltk_tokens if not x in sw]

            stems = []
            ps = PorterStemmer()
            for w in cleanTokens:
                stems.append(ps.stem(WordNetLemmatizer().lemmatize(w, pos='v'))) 


            tempData = pd.DataFrame([[file, stems]])

            df = df.append(tempData, ignore_index=True)
        except:
            print(join("Youtube-Data/"+ channel + "/"+ channel + "/" + channel + "-subtitles", file))



Youtube-Data/dave2d/dave2d/dave2d-subtitles\RARKNr6ru-c.txt
Youtube-Data/dave2d/dave2d/dave2d-subtitles\sMib1nMCdfc.txt
Youtube-Data/ijustine/ijustine/ijustine-subtitles\B5a_k25lSAE.txt
Youtube-Data/unboxtherapy/unboxtherapy/unboxtherapy-subtitles\0JzXL_uAHfo.txt
Youtube-Data/unboxtherapy/unboxtherapy/unboxtherapy-subtitles\F1ZT8XyxyH4.txt
Youtube-Data/unboxtherapy/unboxtherapy/unboxtherapy-subtitles\G2ZE5W97kXE.txt
Youtube-Data/unboxtherapy/unboxtherapy/unboxtherapy-subtitles\kdQM2dUHuDc.txt
Youtube-Data/unboxtherapy/unboxtherapy/unboxtherapy-subtitles\l87M93p7PC0.txt
Youtube-Data/unboxtherapy/unboxtherapy/unboxtherapy-subtitles\T1wkbDxCa6o.txt


In [13]:
def preprocess(data):
    data = data.translate(str.maketrans('', '', string.punctuation))

    nltk_tokens = nltk.word_tokenize(data)
    cleanTokens = [x for x in nltk_tokens if not x in sw]

    stems = []
    ps = PorterStemmer()
    for w in cleanTokens:
        stems.append(ps.stem(WordNetLemmatizer().lemmatize(w, pos='v'))) 
        
    return stems


In [14]:
df.columns = ["file", "stems"]

In [15]:
df

Unnamed: 0,file,stems
0,-7gyHZEving.txt,"[hey, guy, anton, tech, chap, new, msi, GF, 65..."
1,-cOYX11AfPc.txt,"[hey, guy, Im, tom, tech, chap, someth, littl,..."
2,-g1mHQwkpQY.txt,"[hi, guy, welcom, back, tech, chap, peopl, thi..."
3,-lCQMFC2D5Q.txt,"[oh, that, way, heavier, I, expect, actual, Im..."
4,-My0ls6Da-c.txt,"[hey, guy, im, tummi, tech, chapman, ive, get,..."
...,...,...
6483,_RU8FktAnlU.txt,"[welcom, back, favorit, seri, unbox, therapi, ..."
6484,_T3uDK90PvA.txt,"[chair, mean, din, read, newspap, still, other..."
6485,_uLIiWSqAAg.txt,"[come, check, So, guy, follow, instagram, youv..."
6486,_VSC4iGYGQA.txt,"[music, what, guy, lew, see, behind, final, ge..."


In [17]:
import gensim
processed_docs = df['stems']

dictionary = gensim.corpora.Dictionary(processed_docs)


In [18]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [19]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [20]:
corpus_tfidf

<gensim.interfaces.TransformedCorpus at 0x13694112fc8>

In [31]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=30, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))


Topic: 0 Word: 0.005*"phone" + 0.003*"camera" + 0.003*"iphon" + 0.002*"galaxi" + 0.002*"ipad" + 0.002*"android" + 0.002*"display" + 0.002*"screen" + 0.002*"pixel" + 0.002*"nexu"
Topic: 1 Word: 0.004*"So" + 0.003*"thi" + 0.002*"and" + 0.002*"it" + 0.002*"iphon" + 0.002*"Oh" + 0.002*"laptop" + 0.002*"music" + 0.002*"the" + 0.002*"you"
Topic: 2 Word: 0.003*"So" + 0.002*"googl" + 0.002*"oh" + 0.002*"game" + 0.002*"thi" + 0.002*"Oh" + 0.002*"phone" + 0.002*"emoji" + 0.002*"he" + 0.002*"app"
Topic: 3 Word: 0.003*"So" + 0.002*"phone" + 0.002*"car" + 0.002*"it" + 0.002*"question" + 0.002*"thi" + 0.002*"iphon" + 0.002*"danc" + 0.002*"Oh" + 0.002*"and"
Topic: 4 Word: 0.005*"So" + 0.004*"thi" + 0.003*"and" + 0.003*"Oh" + 0.003*"it" + 0.002*"phone" + 0.002*"you" + 0.002*"camera" + 0.002*"iphon" + 0.002*"the"
Topic: 5 Word: 0.003*"laptop" + 0.003*"phone" + 0.002*"game" + 0.002*"camera" + 0.002*"So" + 0.002*"batteri" + 0.002*"pixel" + 0.002*"screen" + 0.001*"devic" + 0.001*"appl"
Topic: 6 Word: 0.00

In [32]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_tfidf.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.



Perplexity:  -8.229672417381492


In [94]:
# import pyLDAvis
# import pyLDAvis.gensim  # don't skip this

# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
# vis


In [65]:
unseen_document = '''The phones are great but I think they should have made the prices lower, to compete with apple.( coming from a Samsung fanboy)'''
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))


Score: 0.7007796764373779	 Topic: 0.004*"phone" + 0.003*"camera" + 0.003*"iphon" + 0.002*"screen" + 0.002*"fold"
Score: 0.19530367851257324	 Topic: 0.004*"So" + 0.003*"and" + 0.003*"phone" + 0.002*"thi" + 0.002*"headphon"
Score: 0.0037113104481250048	 Topic: 0.005*"phone" + 0.003*"camera" + 0.003*"iphon" + 0.002*"galaxi" + 0.002*"ipad"
Score: 0.0037113104481250048	 Topic: 0.004*"So" + 0.003*"thi" + 0.002*"and" + 0.002*"it" + 0.002*"iphon"
Score: 0.0037113104481250048	 Topic: 0.003*"So" + 0.002*"googl" + 0.002*"oh" + 0.002*"game" + 0.002*"thi"
Score: 0.0037113104481250048	 Topic: 0.003*"So" + 0.002*"phone" + 0.002*"car" + 0.002*"it" + 0.002*"question"
Score: 0.0037113104481250048	 Topic: 0.005*"So" + 0.004*"thi" + 0.003*"and" + 0.003*"Oh" + 0.003*"it"
Score: 0.0037113104481250048	 Topic: 0.003*"laptop" + 0.003*"phone" + 0.002*"game" + 0.002*"camera" + 0.002*"So"
Score: 0.0037113104481250048	 Topic: 0.003*"iphon" + 0.002*"phone" + 0.002*"devic" + 0.002*"laptop" + 0.002*"app"
Score: 0.003

In [66]:
lda_model_tfidf[bow_vector]


[(0, 0.0037117111),
 (1, 0.0037117111),
 (2, 0.0037117111),
 (3, 0.0037117111),
 (4, 0.0037117111),
 (5, 0.0037117111),
 (6, 0.0037117111),
 (7, 0.0037117111),
 (8, 0.0037117111),
 (9, 0.0037117111),
 (10, 0.0037117111),
 (11, 0.0037117111),
 (12, 0.17545666),
 (13, 0.0037117111),
 (14, 0.0037117111),
 (15, 0.0037117111),
 (16, 0.0037117111),
 (17, 0.0037117111),
 (18, 0.0037117111),
 (19, 0.0037117111),
 (20, 0.0037117111),
 (21, 0.0037117111),
 (22, 0.0037117111),
 (23, 0.0037117111),
 (24, 0.0037117111),
 (25, 0.0037117111),
 (26, 0.0037117111),
 (27, 0.0037117111),
 (28, 0.72061545),
 (29, 0.0037117111)]

In [47]:
lda_model_tfidf

<gensim.models.ldamulticore.LdaMulticore at 0x1991f06f708>

In [48]:
bow_vector

[(428, 1), (912, 1), (919, 1), (1185, 1), (1785, 1)]

In [25]:
from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("C:/Users/Rama/nlp-datamining/communityDetection/LDAmodel")
lda_model_tfidf.save(temp_file)

#save dictionary to disk
dictionary.save(datapath("C:/Users/Rama/nlp-datamining/communityDetection/dictionary"))

In [26]:
temp_file

'C:/Users/Rama/nlp-datamining/communityDetection/LDAmodel'

In [67]:
df

Unnamed: 0,file,stems
0,-7gyHZEving.txt,"[hey, guy, anton, tech, chap, new, msi, GF, 65..."
1,-cOYX11AfPc.txt,"[hey, guy, Im, tom, tech, chap, someth, littl,..."
2,-g1mHQwkpQY.txt,"[hi, guy, welcom, back, tech, chap, peopl, thi..."
3,-lCQMFC2D5Q.txt,"[oh, that, way, heavier, I, expect, actual, Im..."
4,-My0ls6Da-c.txt,"[hey, guy, im, tummi, tech, chapman, ive, get,..."
...,...,...
6483,_RU8FktAnlU.txt,"[welcom, back, favorit, seri, unbox, therapi, ..."
6484,_T3uDK90PvA.txt,"[chair, mean, din, read, newspap, still, other..."
6485,_uLIiWSqAAg.txt,"[come, check, So, guy, follow, instagram, youv..."
6486,_VSC4iGYGQA.txt,"[music, what, guy, lew, see, behind, final, ge..."


In [72]:
df["topic"] = df['stems'].apply(lambda processedDoc:
    sorted(lda_model_tfidf[dictionary.doc2bow(processedDoc)], key=lambda tup: -1*tup[1])[0]
)


In [73]:
df

Unnamed: 0,file,stems,topic
0,-7gyHZEving.txt,"[hey, guy, anton, tech, chap, new, msi, GF, 65...","(26, 0.5355452)"
1,-cOYX11AfPc.txt,"[hey, guy, Im, tom, tech, chap, someth, littl,...","(14, 0.42125815)"
2,-g1mHQwkpQY.txt,"[hi, guy, welcom, back, tech, chap, peopl, thi...","(26, 0.53523713)"
3,-lCQMFC2D5Q.txt,"[oh, that, way, heavier, I, expect, actual, Im...","(26, 0.5944962)"
4,-My0ls6Da-c.txt,"[hey, guy, im, tummi, tech, chapman, ive, get,...","(26, 0.45443052)"
...,...,...,...
6483,_RU8FktAnlU.txt,"[welcom, back, favorit, seri, unbox, therapi, ...","(29, 0.7858938)"
6484,_T3uDK90PvA.txt,"[chair, mean, din, read, newspap, still, other...","(14, 0.4869414)"
6485,_uLIiWSqAAg.txt,"[come, check, So, guy, follow, instagram, youv...","(28, 0.5572463)"
6486,_VSC4iGYGQA.txt,"[music, what, guy, lew, see, behind, final, ge...","(19, 0.44476616)"


In [79]:
df["topicID"] = df["topic"].apply(lambda x: x[0])
df["confidence"] = df["topic"].apply(lambda x: x[1])

In [83]:
df = df.drop(["topic"],1)
df

Unnamed: 0,file,stems,topicID,confidence
0,-7gyHZEving.txt,"[hey, guy, anton, tech, chap, new, msi, GF, 65...",26,0.535545
1,-cOYX11AfPc.txt,"[hey, guy, Im, tom, tech, chap, someth, littl,...",14,0.421258
2,-g1mHQwkpQY.txt,"[hi, guy, welcom, back, tech, chap, peopl, thi...",26,0.535237
3,-lCQMFC2D5Q.txt,"[oh, that, way, heavier, I, expect, actual, Im...",26,0.594496
4,-My0ls6Da-c.txt,"[hey, guy, im, tummi, tech, chapman, ive, get,...",26,0.454431
...,...,...,...,...
6483,_RU8FktAnlU.txt,"[welcom, back, favorit, seri, unbox, therapi, ...",29,0.785894
6484,_T3uDK90PvA.txt,"[chair, mean, din, read, newspap, still, other...",14,0.486941
6485,_uLIiWSqAAg.txt,"[come, check, So, guy, follow, instagram, youv...",28,0.557246
6486,_VSC4iGYGQA.txt,"[music, what, guy, lew, see, behind, final, ge...",19,0.444766


In [86]:
df["file"] = df["file"].apply(lambda x: x[:-4]) 
df.head()

Unnamed: 0,file,stems,topicID,confidence
0,-7gyHZEving,"[hey, guy, anton, tech, chap, new, msi, GF, 65...",26,0.535545
1,-cOYX11AfPc,"[hey, guy, Im, tom, tech, chap, someth, littl,...",14,0.421258
2,-g1mHQwkpQY,"[hi, guy, welcom, back, tech, chap, peopl, thi...",26,0.535237
3,-lCQMFC2D5Q,"[oh, that, way, heavier, I, expect, actual, Im...",26,0.594496
4,-My0ls6Da-c,"[hey, guy, im, tummi, tech, chapman, ive, get,...",26,0.454431


In [90]:
df = df.rename({"file":"videoId"}, axis='columns')

In [92]:
df.to_csv("videosAndTopics.csv")