In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
import sklearn
import mpld3

In [2]:
df = pd.read_table("./careerData.txt")

In [3]:
df = df[['Title','Description']]

In [4]:
df.columns = ['title','description']

In [5]:
stopwords = nltk.corpus.stopwords.words('english')

In [6]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')

In [7]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [8]:
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [9]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in df['description'].values:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [10]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=200000,
    stop_words='english',
    use_idf=True, 
    tokenizer=tokenize_and_stem, 
    ngram_range=(1,3)
)

In [13]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'].values)

In [14]:
terms = tfidf_vectorizer.get_feature_names()

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
dist = 1 - cosine_similarity(tfidf_matrix)

In [17]:
from sklearn.cluster import DBSCAN

In [49]:
db = DBSCAN(metric='precomputed',eps=0.95,min_samples=5)

In [38]:
db.fit(dist)

DBSCAN(algorithm='auto', eps=0.95, leaf_size=30, metric='precomputed',
    metric_params=None, min_samples=100, n_jobs=1, p=None)

In [43]:
df['label']=db.labels_

In [46]:
df['label'].value_counts()

-1    1110
Name: label, dtype: int64

In [50]:
for i in range(1,99):
    db = DBSCAN(metric='precomputed',eps=i*0.01,min_samples=100)
    db.fit(dist)
    df['label']=db.labels_
    print(df['label'].value_counts())

-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-1    1110
Name: label, dtype: int64
-