In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
import re
import os
import codecs
from sklearn import feature_extraction

# from bs4 import BeautifulSoup
# import mpld3

In [3]:
# Directory that holds the dataset. Set the CS156_DATA_DIR environment variable to run
# the notebook on another machine; the fallback is the original hard-coded location.
DATA_DIR = os.environ.get('CS156_DATA_DIR', '/Users/swimmingcircle/Documents/CS156')
# Tab-separated file; rows that cannot be parsed are skipped (on_bad_lines='skip').
df = pd.read_csv(os.path.join(DATA_DIR, 'fccmediumTitles_Cleaned_Data.tsv'), on_bad_lines='skip', sep='\t')

In [4]:
df

Unnamed: 0,Quartier,date,Title,Recommends,Read ratio
0,4,October 2016,Upgrading to MacOS Sierra will break your SSH ...,58,48%
1,4,October 2016,How Crowd Curation Improved Our Search Quality...,28,37%
2,4,October 2016,Code Briefing: What I learned from reviewing 5...,56,59%
3,4,October 2016,What I learned from reviewing 50 portfolios on...,635,49%
4,4,October 2016,JavaScript Fatigue Fatigue In Free Code CampVi...,1100,55%
...,...,...,...,...,...
448,1,October 2015,Gulp! I Improved my Workflow! In Free Code Cam...,49,30%
449,1,October 2015,Beginners Guide to Big O Notation In Free Code...,108,54%
450,1,October 2015,7 Ways Streaming Makes you a Better Coder In F...,142,61%
451,1,October 2015,Jump Start Your Local Campsite with Coffee-and...,50,70%


In [5]:
# English stop words from NLTK: very common words ('i', 'me', 'my', ...) that carry
# little meaning on their own and are usually dropped from search queries / analysis.
# The corpus must exist locally, so fetch it once if it is missing; this keeps the cell
# runnable on a fresh environment instead of failing with LookupError.
# NOTE: nltk.sent_tokenize / nltk.word_tokenize used later need the Punkt tokenizer
# data as well (installed the same way with nltk.download()).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# NLTK's Snowball stemmer for English, stored as 'stemmer' for the tokenizing helpers
stemmer = SnowballStemmer("english")

In [6]:
stopwords[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

### Tokenize

- Word tokenize: We use the word_tokenize() method to split a sentence into tokens or words. 
- Sentence tokenize: We use the sent_tokenize() method to split a document or paragraph into sentences. 

In [13]:
# Sentence-tokenize the first title
sents = list(nltk.sent_tokenize(df['Title'][0]))
sents

['Upgrading to MacOS Sierra will break your SSH keys and lock you out of your own servers.',
 'In Free Code CampView storyReferrers']

In [15]:
# Word-tokenize the first sentence; punctuation comes out as separate tokens
words = list(nltk.word_tokenize(sents[0]))
words

['Upgrading',
 'to',
 'MacOS',
 'Sierra',
 'will',
 'break',
 'your',
 'SSH',
 'keys',
 'and',
 'lock',
 'you',
 'out',
 'of',
 'your',
 'own',
 'servers',
 '.']

In [16]:

# Keep only the tokens that contain at least one ASCII letter; this drops
# purely numeric tokens and bare punctuation such as '.'
filtered_words = [word for word in words if re.search('[a-zA-Z]', word)]
filtered_words

['Upgrading',
 'to',
 'MacOS',
 'Sierra',
 'will',
 'break',
 'your',
 'SSH',
 'keys',
 'and',
 'lock',
 'you',
 'out',
 'of',
 'your',
 'own',
 'servers']

### Stem: find the root of the word

- Stemming reduces inflected words to a common root form: a group of related words is mapped to the same stem, even if the stem itself is not a valid word in the language.

In [17]:
# Stems are not always dictionary words: "Upgrading" becomes "upgrad" and "MacOS" becomes "maco"
stems = list(map(stemmer.stem, filtered_words))
stems

['upgrad',
 'to',
 'maco',
 'sierra',
 'will',
 'break',
 'your',
 'ssh',
 'key',
 'and',
 'lock',
 'you',
 'out',
 'of',
 'your',
 'own',
 'server']

In [21]:
def tokenize_and_stem(text):
    """Return the stemmed word tokens of `text`.

    The text is split into sentences first and each sentence into words, so
    that punctuation ends up as its own token. Tokens without any ASCII letter
    (numbers, raw punctuation) are dropped; the rest are passed through the
    module-level `stemmer`.
    """
    has_letter = re.compile('[a-zA-Z]').search
    stemmed = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if has_letter(word):
                stemmed.append(stemmer.stem(word))
    return stemmed


def tokenize_only(text):
    """Return the lower-cased word tokens of `text`, without stemming.

    The text is split into sentences first and each sentence into words, so
    that punctuation ends up as its own token. Every token is lower-cased and
    then kept only if it contains at least one ASCII letter, which removes
    numeric tokens and raw punctuation.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [42]:
tokenize_and_stem(df['Title'][0])

['upgrad',
 'to',
 'maco',
 'sierra',
 'will',
 'break',
 'your',
 'ssh',
 'key',
 'and',
 'lock',
 'you',
 'out',
 'of',
 'your',
 'own',
 'server',
 'in',
 'free',
 'code',
 'campview',
 'storyreferr']

In [23]:
# Build two parallel vocabularies over all titles: one stemmed, one only tokenized
totalvocab_stemmed, totalvocab_tokenized = [], []
for title in df['Title']:
    # stemmed tokens of this title
    totalvocab_stemmed.extend(tokenize_and_stem(title))
    # lower-cased, unstemmed tokens of the same title
    totalvocab_tokenized.extend(tokenize_only(title))

In [29]:
# Number of stemmed tokens, then number of plain tokens (one per line)
print(len(totalvocab_stemmed), len(totalvocab_tokenized), sep='\n')


5867
5867


In [30]:
# Lookup table from stem (index) to the lower-cased token it was derived from
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
n_items = vocab_frame.shape[0]
print(f'there are {n_items} items in vocab_frame')
print(vocab_frame.head())

there are 5867 items in vocab_frame
            words
upgrad  upgrading
to             to
maco        macos
sierra     sierra
will         will


## Use tf-idf to find the common words

- max_df: this is the maximum document frequency a given feature can have and still be used in the tf-idf matrix. If a term appears in more than 80% of the documents it probably carries little meaning for telling the documents (here, article titles) apart.

- min_df: this could be an integer (e.g. 5), in which case the term would have to be in at least 5 of the documents to be considered. The main vectorizer below passes 0.05, so a term must appear in at least 5% of the titles; the two-sentence example passes 0.2. With a lower min_df, rare tokens can end up driving the clustering: in film synopses, for example, names like "Michael" or "Tom" are found in several of the movies and are used frequently, but the names carry no real meaning.

- ngram_range: this just means I'll look at unigrams, bigrams and trigrams

In [62]:
# Note that the result of this block takes a while to show
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer parameters:
#   max_df=0.8   ignore terms that appear in more than 80% of the titles
#   min_df=0.05  ignore terms that appear in fewer than 5% of the titles
#   ngram_range  build unigrams, bigrams and trigrams from the stemmed tokens
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.05, stop_words='english',
                                 use_idf=True, tokenizer= tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Title']) # fit the vectorizer to the article titles

# the shape is (number of titles, number of retained terms)
print(tfidf_matrix.shape)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, hence .tolist()
terms = tfidf_vectorizer.get_feature_names_out().tolist()
len(terms)

(453, 8)




8

In [63]:
tfidf_matrix

<453x8 sparse matrix of type '<class 'numpy.float64'>'
	with 263 stored elements in Compressed Sparse Row format>

In [37]:
terms

['best', 'build', 'design', 'develop', 'javascript', 'learn', 's', 'use']

In [57]:
from sklearn.metrics.pairwise import cosine_similarity
# A short example using the sentences of the first title (`sents`)
words_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

words_matrix = words_vectorizer.fit_transform(sents) # one row per sentence, one column per term

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, hence .tolist()
tfidf_tokens = words_vectorizer.get_feature_names_out().tolist()
# Derive the row labels from the number of sentences instead of hard-coding two of them
sentence_labels = ['Sentence ' + str(k) for k in range(1, len(sents) + 1)]
df_tfidfvect = pd.DataFrame(data = words_matrix.toarray(),index = sentence_labels, columns = tfidf_tokens)


print('Sents', sents)
print('Shape of words_matrix', words_matrix.shape)


Sents ['Upgrading to MacOS Sierra will break your SSH keys and lock you out of your own servers.', 'In Free Code CampView storyReferrers']
Shape of words_matrix (2, 30)




In [52]:
df_tfidfvect

Unnamed: 0,break,break ssh,break ssh key,campview,campview storyreferr,code,code campview,code campview storyreferr,free,free code,...,sierra,sierra break,sierra break ssh,ssh,ssh key,ssh key lock,storyreferr,upgrad,upgrad maco,upgrad maco sierra
Sentence 1,0.218218,0.218218,0.218218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.218218,0.218218,0.218218,0.218218,0.218218,0.218218,0.0,0.218218,0.218218,0.218218
Sentence 2,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0


`build_analyzer()` returns a callable that lets you extract the tokenizing step from the transformation pipeline wrapped in the CountVectorizer or TfidfVectorizer. You can do something like this:

`analyze = vectorizer.build_analyzer()`

`df['Text'].apply(lambda x: analyze(x)) #or df['Text'].apply(analyze)`


In [54]:
# build_analyzer() returns the callable the vectorizer applies to each document
# (preprocessing, tokenizing and n-gram generation).
# Calling e.g. analyze(sents[0]) lists the terms that sentence contributes.
analyze = words_vectorizer.build_analyzer()

# Pairwise cosine similarity between the rows of words_matrix (one row per sentence).
# The two example sentences have no term in common, so the off-diagonal entries are 0.
example_similarity = cosine_similarity(words_matrix)
example_similarity

array([[1., 0.],
       [0., 1.]])

In [58]:
tfidf_matrix

<453x8 sparse matrix of type '<class 'numpy.float64'>'
	with 263 stored elements in Compressed Sparse Row format>

Next step: embedding the words 
- pretrained embedding: GloVe
- 