In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm
import pandas as pd
import numpy as np
import configparser
import json
import requests
import xmltodict
from bs4 import BeautifulSoup
import time
import pickle
import os
import gensim
import csv
import seaborn as sns
import smart_open
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import warnings
warnings.filterwarnings('ignore')

# retrieve episode descriptions

In [3]:
fname = 'feed.xml'
url = 'http://dataskeptic.com/feed.rss'

if not(os.path.isfile(fname)):
    print('fetching')
    r = requests.get(url)
    f = open(fname, 'wb')
    f.write(r.text.encode('utf-8'))
    f.close()

with open(fname) as fd:
    xml = xmltodict.parse(fd.read())

In [4]:
test = xml['rss']['channel']['item'][72]['description']
a = BeautifulSoup(test,'lxml').text
a

'Bargaining is the process of two (or more) parties attempting to\nagree on the price for a transaction. \xa0Game theoretic\napproaches attempt to find two strategies from which neither party\nis motivated to deviate. \xa0These strategies are said to be in\nequilibrium with one another. \xa0The equilibriums available in\nbargaining depend on the the transaction mechanism and the\ninformation of the parties. \xa0Discounting (how long parties are\nwilling to wait) has a significant effect in this process.\n\xa0This episode discusses some of the choices Kyle and Linh Da\nmade in deciding what offer to make on a house.'

In [5]:
episodes = xml['rss']['channel']['item']
descriptions = []
descToTitle = {}
descToLink = {}
descToNum = {}
l = len(episodes)
for episode in episodes:
    enclosure = episode['enclosure']
    
    desc = episode['description']
    desc = desc.replace(u'\xa0', u' ')
    desc = desc.replace(u'\n', u' ')
    desc = desc.replace(u'\xc2', u' ')

    
    desc = BeautifulSoup(desc, "lxml").text
    descriptions.append(desc)
    
    descToTitle[desc] = episode['title']
    descToLink[desc] = episode['link']
    descToNum[desc] = l
    l = l - 1

In [6]:
episodes[0].keys()

odict_keys(['title', 'pubDate', 'guid', 'link', 'itunes:image', 'description', 'content:encoded', 'enclosure', 'itunes:duration', 'itunes:explicit', 'itunes:keywords', 'itunes:subtitle', 'itunes:episodeType'])

In [7]:
for desc in descriptions:
    print(descToNum[desc])
    print(descToTitle[desc])
    print(descToLink[desc])    

178
[MINI] One Shot Learning
https://dataskeptic.com/blog/episodes/2017/one-shot-learning
177
Recommender Systems Live from FARCON 2017
https://dataskeptic.com/blog/episodes/2017/recommender-systems-live-from-farcon
176
[MINI] Long Short Term Memory
https://dataskeptic.com/blog/episodes/2017/long-short-term-memory
175
Zillow Zestimate
https://dataskeptic.com/blog/episodes/2017/zillow-zestimate
174
Cardiologist Level Arrhythmia Detection with CNNs
https://dataskeptic.com/blog/episodes/2017/cardiologist-level-arrhythmia-detection-with-cnns
173
[MINI] Recurrent Neural Networks
https://dataskeptic.com/blog/episodes/2017/recurrent-neural-networks
172
Project Common Voice
https://dataskeptic.com/blog/episodes/2017/project-common-voice
171
[MINI] Bayesian Belief Networks
http://dataskeptic.com/blog/episodes/2017/bayesian-belief-networks
170
pix2code
https://dataskeptic.com/blog/episodes/2017/pix2code
169
[MINI] Conditional Independence
http://dataskeptic.com/blog/episodes/2017/conditional-ind

# Save description in txt file.

In [23]:
thefile = open('./text/episode_descs_titles.txt', 'w')

# for i, desc in enumerate(descriptions):
#     desc = desc.encode('utf-8').strip()
    
#     desc = "*"+ str(i)+str(desc).replace('\n', "") 
#     thefile.write("%s\n" % desc)

for i in range(len(descriptions)):
    desc = descriptions[i]
    title = descToTitle[desc]
    
    desc = desc.encode('utf-8').strip()
    desc = str(desc).replace('\n', "") 
    #print(desc)
    title = title.replace('[MINI]', "")
    title = title.encode('utf-8').strip()
    title = "*"+ str(i)+str(title).replace('\n', "") 
    #print(title)
    thefile.write("%s\n" % str(title+", "+desc))
    #print(title+" "+desc)
    

In [24]:
with open('./text/episode_descs_titles.txt', 'r') as f:
    i=0
    for line in f:
        i+=1
        print(i)
        print(line[0:40])

1
*0b'One Shot Learning', b'One Shot Learn
2
*1b'Recommender Systems Live from FARCON
3
*2b'Long Short Term Memory', b'Thanks to
4
*3b'Zillow Zestimate', b'Zillow is a lea
5
*4b'Cardiologist Level Arrhythmia Detect
6
*5b'Recurrent Neural Networks', b'RNNs a
7
*6b'Project Common Voice', b"Thanks to o
8
*7b'Bayesian Belief Networks', b"A Bayes
9
*8b'pix2code', b'In this episode, Tony B
10
*9b'Conditional Independence', b"In stat
11
*10b'Estimating Sheep Pain with Facial R
12
*11b'CosmosDB', b'This episode collects 
13
*12b'The Vanishing Gradient', b'This epi
14
*13b'Doctor AI', b'hen faced with medica
15
*14b'Activation Functions', b'In a neura
16
*15b'MS Build 2017', b'This episode reca
17
*16b'Max-pooling', b"Max-pooling is a pr
18
*17b'Unsupervised Depth Perception', b'T
19
*18b'Convolutional Neural Networks', b"C
20
*19b'Multi-Agent Diverse Generative Adve
21
*20b'Generative Adversarial Networks', b
22
*21b'Opinion Polls for Presidential Elec
23
*22b'OpenHouse', b"No reliable, comple

In [25]:
i # it should be 178 before 9/28/2017.

178

# Use the word vectors trained from SO to represent episode descriptions.

## get word vectors trained from SO

In [26]:
key = 'word2vector_model_question_answer_200_6_2'
fname = './word_vec/'+key+".csv"
word_vecs_df = pd.read_csv(fname,index_col=0)
word_vecs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
a_,-0.713271,-0.077981,0.010418,-0.697069,-2.598913,-1.080629,-1.033209,0.885287,0.872299,-1.011196,...,0.674927,0.418498,-0.977101,-1.157536,-0.027437,-0.224027,-0.545397,-0.176365,2.447978,0.182951
a__,0.013685,-1.3e-05,-0.025492,-0.012469,-0.00977,-0.009924,-0.004867,0.006001,0.038898,-0.027364,...,0.053458,0.000878,0.002365,-0.048481,-0.054677,-0.030503,-0.012346,0.029457,0.056696,-0.02926
a_a,0.020415,0.156389,-0.060513,-0.028313,-0.116366,-0.042931,-0.130101,-0.006461,0.008261,-0.065245,...,-0.067757,0.007425,-0.111685,0.160803,0.014258,-0.01206,0.078916,0.054315,0.021641,-0.026724
a_adjusted,-0.046735,0.041834,-0.061836,0.02116,0.046803,-0.024107,-0.032979,0.053604,0.037605,-0.08014,...,0.083108,-0.026857,0.024127,0.04053,-0.040803,-0.038954,0.027903,0.024604,-0.02159,-0.05146
a_after_est,-0.011185,0.017912,-0.080105,-0.030186,0.052851,-0.02268,-0.036077,0.007863,-0.008886,-0.044048,...,0.066457,-0.011853,-0.007649,0.017082,-0.00657,-0.020994,0.003839,0.006295,-0.0089,-0.035567


In [27]:
vocab = word_vecs_df.index
len(vocab)

100269

In [28]:
fname = './vocab_dict/vocab_dict_question_answer_200_6_2.csv'
with open(fname, 'r') as csv_file:
    reader = csv.reader(csv_file)
    vocab_dic = dict(reader)

In [29]:
for k, value in vocab_dic.items():
    vocab_dic[k] = int(value)
    

In [30]:
vocab_dic['a_a']

2

# Preprocessing the text in episode descriptions

In [31]:
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                
                yield gensim.utils.simple_preprocess(line)
                #This lowercases, tokenizes, de-accents (optional). – the output are final tokens = unicode strings, that won’t be processed any further.
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])


In [33]:
fname = './text/episode_descs_titles.txt'
episode_desc_title_corpus = list(read_corpus(fname, tokens_only= True))

In [34]:
corpus = []
for desc in episode_desc_title_corpus:
    corpus.append(" ".join(desc))

In [35]:
len(corpus) 

178

In [36]:
corpus[0]

'one shot learning one shot learning is the class of machine learning procedures that focuses learning something from small number of examples this is in contrast to traditional machine learning which typically requires very large training set to build reasonable model in this episode kyle presents coded message to linhda who is able to recognize that many of these new symbols created are likely to be the same symbol despite having extremely few examples of each why can the human brain recognize new symbol with relative ease while most machine learning algorithms require large training data we discuss some of the reasons why and approaches to one shot learning'

# Get tf_idf features of episode descriptions

In [37]:
vectorizer = TfidfVectorizer(min_df=1,vocabulary = vocab_dic)
X = vectorizer.fit_transform(corpus)


Note: [vocabulary] Mapping or iterable, optional
Either a Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix, or an iterable over terms. If not given, a vocabulary is determined from the input documents.

Which words are in episode descriptions but not in the vocab of SO?


In [38]:
# What words are not in SO?
for i in range(178):
#     print(len(X[i,:].nonzero()[1]))
#     print(len(set(corpus[i].split(' '))))
    diff = set(corpus[i].split(' ')).difference(set(vocab))
    if len(diff) >5:
        print('*******************************************************')
        print(i)
        print(diff)  
        print(descriptions[i])

*******************************************************
22
{'rosevere', 'iamzareenf', 'openhouse', 'zareen', 'joytafty', 'blueplastic', 'dashboarding', 'zehr', 'aglpjrmp', 'periscopedata', 'jawbone'}
No reliable, complete database cataloging home sales data at a transaction level is available for the average person to access. To a data scientist interesting in studying this data, our hands are complete tied. Opportunities like testing sociological theories, exploring economic impacts, study market forces, or simply research the value of an investment when buying a home are all blocked by the lack of easy access to this dataset. OpenHouse seeks to correct that by centralizing and standardizing all publicly available home sales transactional data. In this episode, we discuss the achievements of OpenHouse to date, and what plans exist for the future.     Check out the OpenHouse gallery.    I also encourage everyone to check out the project Zareen mentioned which was her Harry Potter word2

*******************************************************
160
{'outro', 'dicecollector', 'swears', 'zocchi', 'satanic', 'unnoticably', 'awesomedice'}
In this bonus episode, guest Louis Zocchi discusses his background in the gaming industry, specifically, how he became a manufacturer of dice designed to produce statistically uniform outcomes. During the show Louis mentioned a two part video listeners might enjoy: part 1 and part 2 can both be found on youtube. Kyle mentioned a robot capable of unnoticably cheating at Rock Paper Scissors / Ro Sham Bo. More details can be found here. Louis mentioned dice collector Kevin Cook whose website is DiceCollector.com While we're on the subject of table top role playing games, Kyle recommends these two related podcasts listeners might enjoy: The Conspiracy Skeptic podcast (on which host Kyle was recently a guest) had a great episode "Dungeons and Dragons - The Devil's Game?" which explores claims of D&Ds alleged ties to skepticism. Also, Kyle swears

Those words are either people's names, website names and they don't affect the main ideas and I feel it is ok to filter them. 

In [39]:
X.shape

(178, 100269)

# Get weighted doc vectors for all episode description

In [40]:
i=0
episode_desc_corpus[i]

['one',
 'shot',
 'learning',
 'one',
 'shot',
 'learning',
 'is',
 'the',
 'class',
 'of',
 'machine',
 'learning',
 'procedures',
 'that',
 'focuses',
 'learning',
 'something',
 'from',
 'small',
 'number',
 'of',
 'examples',
 'this',
 'is',
 'in',
 'contrast',
 'to',
 'traditional',
 'machine',
 'learning',
 'which',
 'typically',
 'requires',
 'very',
 'large',
 'training',
 'set',
 'to',
 'build',
 'reasonable',
 'model',
 'in',
 'this',
 'episode',
 'kyle',
 'presents',
 'coded',
 'message',
 'to',
 'linhda',
 'who',
 'is',
 'able',
 'to',
 'recognize',
 'that',
 'many',
 'of',
 'these',
 'new',
 'symbols',
 'created',
 'are',
 'likely',
 'to',
 'be',
 'the',
 'same',
 'symbol',
 'despite',
 'having',
 'extremely',
 'few',
 'examples',
 'of',
 'each',
 'why',
 'can',
 'the',
 'human',
 'brain',
 'recognize',
 'new',
 'symbol',
 'with',
 'relative',
 'ease',
 'while',
 'most',
 'machine',
 'learning',
 'algorithms',
 'require',
 'large',
 'training',
 'data',
 'we',
 'discuss',


### How to get the weighted vectors of the episode descriptions?

....


<img src="pictures/tf_idf_matrix.png">
<img src="pictures/word_vec_df.png">


- For example, doc has three words: doc = [word1, word2, word3].
- vec_word_i = [d1, d2, ..., dn] 
- n = size in hidden layer.

- tf_idf_ji = tf_idf of word i in doc_j; Scale them such that sum_i tf_idf_ji = 1. 

- Then the vector of doc_j = sum_i (vec_word_i * tf_dif_ji) which is a vector with the same len as vec_word_i.

In [52]:
def get_doc_weighted_vec(i, doc_corpus , tf_idf = X, weighted = True): # ith documents. doc_corpus a list of words
    
    df = word_vecs_df 
    related_rows = df.loc[sorted(list(set(doc_corpus).intersection(set(vocab)))), :] 
    
    if weighted:
        weights = []
        ind = sorted(tf_idf[i,:].nonzero()[1])
        if sum([vectorizer.vocabulary_[related_rows.index[j]] != ind[j] for j in range(len(ind))]) != 0:
            print("words position don't match")
            return 
        for j in ind:
            weights.append(tf_idf[i,j])
        weights = np.array(weights)/sum(weights)
    else:
        weights = [1/related_rows.shape[0]] * related_rows.shape[0]
    
    if related_rows.shape[0] != len(weights):
        print(i)
        print(related_rows.shape[0])
        print(len(weights))
    
    result = related_rows.T * weights
    return result.sum(axis = 1)

In [53]:
episode_vec_weighted = []
total = len(descriptions)
for i in range(total):
    doc_corpus = episode_desc_title_corpus[i]
    episode_vec_weighted.append(get_doc_weighted_vec(i,doc_corpus))


### Save the episode weighted vectors

In [54]:
def save_obj(obj, name ):
    with open('episode_vec/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

def load_obj(name ):
    with open('episode_vec/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [55]:
save_obj(episode_vec_weighted, "episode_vec_weighted_with_title")


In [56]:
episode_vec_weighted_df = pd.DataFrame(episode_vec_weighted)
episode_vec_weighted_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.337494,-0.115356,1.026785,-0.107489,-0.15957,0.359527,-0.204941,0.35471,0.242505,-0.057575,...,0.046193,0.212454,0.271135,0.136556,0.312334,0.064833,-0.219439,-0.167319,-0.217326,-0.125009
1,0.255842,0.453281,0.159527,-0.083451,-0.255821,-0.049511,-0.393035,-0.088309,0.519387,-0.228815,...,0.105291,0.503548,0.27545,-0.112635,0.110989,-0.046437,-0.245155,-0.118622,-0.183394,-0.309248
2,0.0195,0.008531,0.632613,0.258493,-0.546683,0.081739,-0.480826,0.177944,-0.235011,0.023797,...,-0.10622,-0.435328,0.06464,-0.163387,0.121281,-0.212039,0.092798,-0.134013,0.199213,-0.109539
3,0.339768,-0.258667,0.356026,0.006127,0.015253,-0.142278,-0.256672,0.135071,0.493667,-0.256848,...,-0.107504,0.074048,0.203147,0.139729,0.031928,0.128705,-0.039589,-0.156665,-0.439147,-0.071857
4,0.286279,0.266006,0.343129,0.231349,-0.225216,-0.198094,-0.13347,0.084279,0.201282,-0.073215,...,-0.251906,-0.02665,0.073587,-0.253643,0.072823,-0.232809,-0.090379,-0.316556,-0.038844,-0.346698


In [57]:
episode_vec_weighted_df.shape

(178, 200)

# Make recomendation: find related episode

## user's strings: 

some examples

In [58]:

user_requests = [
    "",
    "Are there any episodes on Facial Recognition?",
    "How to know whether the data I am using is valid for my purpose?",
    "Could you recommend some episodes on decision tree and random forests?",
    "Can you talk about Convolutional neural network and recurrent neural network?",
    "Could you recommend some episodes on data science projects for beginners?",
    "artificial intelligence",
    "How can beginners in machine learning, who have finished their MOOCs in machine learning and deep learning, take it to the next level and get to the point of being able to read research papers & productively contribute in an industry?",
    "What can artificial intelligence do for human beings? What is the future of artificial intelligence?",
    "What is natural language processing? ",
    "The error percentage of regression changes with change in the train and test data which I am deciding randomly. Cross validation can overcome this but how do I apply it for my regression model?",
    "I have a precision recall curve for two separate algorithms. If I want to calculate the F-Measure I have to use the precision and recall values at a particular point on each curve. How is this point decided? For example on curve one there is a point where recall is 0.9 and precision is 0.87 and the other curve there is a point of recall at 0.95 and precision at 0.84. Alternatively, should I plot a F-measure curve for every precision recall value?",
    "Suppose I want to make predictions of a response from predictors but I have some autocorrelation in the response variable. Under OLS this would be a problem as the residuals would have autocorrelation. What if I just want to predict the response and I use regularized least squares, like lasso or ridge or elastic net? I don't care about variances of the coefficients or anything of that nature as I'm not testing any hypotheses but I feel like I might be missing something.",
    "Evaluating the quality of data.",
    "I am interested in knowing musical stuff.",
    "Is there any podcast on musical data and musical projects?",
    "What is the trend of big data? What is big data? How to learn big data?",
    "How to learn machine learning? What books or website do you recommend?",
    "Looking for projects on criminal analysis? ",
    "How to take advantage of Internet, computer,  cloud and other  platform in an effective way?",
    "What are the most important knowledge in statistics or probability when doing machine learning?"
]  


    

## Cosine Similarity

In [59]:
all_episode = episode_vec_weighted_df.values
with open('some examples with titles.txt', 'w') as f:
    for j in range(len(user_requests)):
        f.write("*****************************************************" + "\n")
        user_request = user_requests[j]
        user_request_corpus = gensim.utils.simple_preprocess(user_request)
        X_user = vectorizer.fit_transform([" ".join(user_request_corpus)])
        f.write(str(X_user.shape) + "\n")
        user_weighted_vec = get_doc_weighted_vec(0, user_request_corpus , tf_idf = X_user, weighted = True)
        cos_similarities = cosine_similarity(X=user_weighted_vec, Y=all_episode)

        cos_similarities = cos_similarities[0]
        cos_similarities.shape


        most_similar = cos_similarities.argsort()[-4:][::-1]
        f.write(str(most_similar) + "\n")

        threshold = 0.60
        f.write("User's request is: " + user_request + "\n" )
        for i in most_similar:

            if cos_similarities[i] > threshold:
                f.write("--------------------------"+str(cos_similarities[i])+"-----------------------------------\n")
                f.write( "\n")
                f.write(str(descToTitle[descriptions[i]]) + "\n")
                f.write(str(descToLink[descriptions[i]]) + "\n")
                f.write(str(descriptions[i].encode('utf-8')) + "\n")



    

Find the result at some example.txt.  

Since the number of all episode is handlable, let's have a look at the similarity between all episodes. By this, I also want to know the levels of the cosine similarities. 

In [None]:
A = cosine_similarity(X=all_episode)
A.shape

In [None]:
im = plt.imshow(A[20:40,20:40])
plt.colorbar(im)
plt.show()

# very diversity. so it is good.

## New string:

In [None]:
user_request = input('what topics are interesting to you? ')


In [None]:
print("Hello.", user_request)

In [None]:
# to-do: reorganize the code and write a function recommend_episode.

def recommend_episode(string):
    all_episode = episode_vec_weighted_df.values

    
    print("*****************************************************" + "\n")
    user_request = string
    user_request_corpus = gensim.utils.simple_preprocess(user_request)
    X_user = vectorizer.fit_transform([" ".join(user_request_corpus)])
    #print(str(X_user.shape) + "\n")
    user_weighted_vec = get_doc_weighted_vec(0, user_request_corpus , tf_idf = X_user, weighted = True)
    cos_similarities = cosine_similarity(X=user_weighted_vec, Y=all_episode)

    cos_similarities = cos_similarities[0]
    cos_similarities.shape


    most_similar = cos_similarities.argsort()[-4:][::-1]
    #print(str(most_similar) + "\n")

    threshold = 0.60
    print("User's request is: " + user_request + "\n" )
    for i in most_similar:

        if cos_similarities[i] > threshold:
            print("--------------------The episode has cosine similarity is "+str(cos_similarities[i])+" with user's request-------------------------\n")
            print( "\n")
            print(str(descToTitle[descriptions[i]]) + "\n")
            print(str(descToLink[descriptions[i]]) + "\n")
            print(str(descriptions[i].encode('utf-8')) + "\n")



    
    return episode

In [None]:
recommend_episode(user_request)

In [None]:
'adboost' in vocab # so no matter how many times the word 'adboost' is in the string, it won't find 