In [1]:
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# data
from pymongo import MongoClient

In [2]:
#connect to database
mongo_client = MongoClient()
text_db = mongo_client.s2t

In [3]:
text_db.collection_names()

['videos']

In [4]:
videos = text_db.videos

In [5]:
videos.count()

2166

## Error
It's clear that something went wrong during the transfer of information from the speech API into text and passing those documents into MongoDB. There were 53 videos, so there should be 53 documents. This is something I need to adddress and reprocess/

In [6]:
#extract all of the text so we can put it into Gensim
text_list = []
for doc in videos.find():
    text_list.append(doc['text'])

In [8]:
from sklearn.feature_extraction import text 

#add some extra stop words that we don't want to become features
basic_words = ['number', 'numbers', 'just', 'solution', 'answer',
               'negative', 'know', 'don', 'going', 'want', 'like',
               'work', 'question', 'say', 'think', 'maybe',
              'problem', 'problems', 'equal', 'right', 'actually',
              'really', 'let', 'need', 've', 'real', 'way', 'lot',
              'good', 'equals', 'time', 'things', 'come', 'make', 'll'
              'thing', 'pretty']
stop_words = text.ENGLISH_STOP_WORDS.union(basic_words)

In [9]:
# creating word count vector
count_vectorizer = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words=stop_words, token_pattern="\\b[a-z][a-z]+\\b")
count_vectorizer.fit(text_list)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None,
        stop_words=frozenset({'ours', 'seem', 'about', 'actually', 'been', 'several', 'problems', 'found', 'towards', 'still', 'pretty', 'along', 'equals', 'front', 'everywhere', 'things', 'alone', 'this', 'neither', 'solution', 'hereby', 'noone', 'as', 'whether', 'other', 'couldnt', 'thereby', 'might', 'hi...'besides', 'lot', 'everything', 'during', 'interest', 'three', 'anything', 'our', 'against', 'see'}),
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
counts = count_vectorizer.transform(text_list).transpose()

In [11]:
counts.shape

(48544, 2166)

In [12]:
# sparse matrix to corpus object
corpus = matutils.Sparse2Corpus(counts)

In [13]:
# store row # and the word/n-gram it contains
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [14]:
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=20, random_state=0)

In [15]:
lda.print_topics()

[(0,
  '0.007*"people" + 0.002*"data" + 0.002*"interview" + 0.002*"talk" + 0.001*"solve" + 0.001*"interviewing" + 0.001*"natural" + 0.001*"did" + 0.001*"interviews" + 0.001*"job"'),
 (1,
  '0.004*"people" + 0.003*"data" + 0.002*"questions" + 0.002*"interview" + 0.001*"ask" + 0.001*"point" + 0.001*"kind" + 0.001*"ll" + 0.001*"understand" + 0.001*"easy"'),
 (2,
  '0.003*"list" + 0.002*"start" + 0.002*"value" + 0.002*"end" + 0.002*"case" + 0.002*"data" + 0.002*"interview" + 0.002*"doing" + 0.002*"return" + 0.002*"candidates"')]

## Conclusion
In retrospect, the topics generated by my choice of content are almost exclusively determined by the title/subject of the video. For example, if you were to explain linear regression in an interview question, you may only hear/say the phrase "linear regression" once or twice. It will be significantly outweighed by the smaller concepts that make up its building blocks. This is interesting in its own regard, but is part of the reaon this project was not able to accomplish what it initially set out to do. 

Further more, garbage in = garbage out. The number and quality of the videos used in this project was not compatible. Had I used many, many more the variety/scope of the questions/interviews would have been acceptable, but with a smaller sample size meaningful results could have only been generated with very targeted video selection. 

If I could alter this project, I would most likely use the Speech API for (real-time) sentiment analysis. Sentiment analysis is very straightforward and the "quality" of content is not under such stringent requirements. 

Also, there is a huge opportunity to make a YouTube channel that focuses on data science interview questions, coding problems, explaining different topics, etc etc. 