In [189]:
import pandas as pd
import nltk
import ast
import numpy as np

#nltk.download('stopwords') #Only needed if exception thrown
#nltk.download('punkt')#Only needed if exception thrown
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize 

#Using a GLobal Variable to Filter Stop Words
stop_words = set(stopwords.words('english')) 

import contractions 
import itertools

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

#### Launch Spark

In [52]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "capstone_eda"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

#### Helper Function for Filtering Questions

In [228]:
def filter_stop_words(sentence):
    """
        Removes Stop Words / I / Punctuation 
        
        Input:
            Sentence: String 
        
        Ouput:
            Set of Unique Words in input
    """
    sentence = contractions.fix(sentence)#Converts contractsion to real words i.e. "don't" to "do not" 
    word_tokens = tknzr.tokenize(sentence) 
    filtered_sentence = [w.lower() for w in word_tokens if not w in stop_words and w.isalpha() and w.lower()!='i'] 
    filtered_sentence = list(set(filtered_sentence))
    return filtered_sentence

In [250]:
def print_results(data_frame,number_to_print=5,metric_type='Jaccard',filter_function=filter_stop_words):
    for i in range(number_to_print):
        Q1 = int(data_frame[i][0][0])
        Q2 = int(data_frame[i][0][1])
        Score = data_frame[i][1]
        print(metric_type+' Similarity Ranking: '+str(i+1)+' Score: {:0.05f}'.format(Score))
        print('\n') 
        print("Question 1")
        print(questions_data_df.full_question[Q1])
        print('\n') 
        print(filter_function(questions_data_df.full_question[Q1]))
        print('\n')
        print("Question 2")
        print(questions_data_df.full_question[Q2])
        print('\n')
        print(filter_function(questions_data_df.full_question[Q2]))
        print('__________________________________________________')
        print('\n')

#### Loads Questions Data

In [106]:
questions_data = './data/questions.csv'
questions_data_df = pd.read_csv(questions_data)
questions_data_df['full_question'] = questions_data_df.questions_title+' '+questions_data_df.questions_body
questions_data_df.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,full_question
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26 UTC+0000,Teacher career question,What is a maths teacher? what is a ma...,Teacher career question What is a maths...
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25 UTC+0000,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...,I want to become an army officer. What can I d...
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38 UTC+0000,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....,Will going abroad for your first job increase ...
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32 UTC+0000,To become a specialist in business management...,i hear business management is a hard way to ge...,To become a specialist in business management...
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54 UTC+0000,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...,Are there any scholarships out there for stude...


## Does Small Sample to Test code

In [107]:
sentence_data = questions_data_df['full_question'][1:500].apply(filter_stop_words)
sentence_data.to_csv(r'./data/sample_sentence_data.txt', header=None, sep=' ', mode='a')

In [234]:
# load data into Spark
data = sc.textFile("./data/sample_sentence_data.txt")

In [126]:
#Creates Helper Mapper Functions for computing similarity scores
def splitWords(pair):
    """Tokenize each question and emit postings."""
    doc, text,_ = pair
    words = ast.literal_eval(text)
    for w in words:
        yield (w, [(doc,len(words))])
        
def makeCompositeKey(inverted_index):
    """Loop over postings and yield pairs."""
    word, postings = inverted_index
    # taking advantage of symmetry, output only (a,b), but not (b,a)
    for subset in itertools.combinations(sorted(postings), 2):
        yield (str(subset), 1)

def jaccard(line):
    """Compute similarity scores"""
    (doc1, n1), (doc2, n2) = ast.literal_eval(line[0])
    total = int(line[1])
    jaccard = total / float(int(n1) + int(n2) - total)
    yield (doc1,doc2), jaccard
    

## Caclulate Similarity

In [235]:
result = data.map(lambda line: line.split('\"')) \
             .flatMap(splitWords) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(makeCompositeKey) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(jaccard) \
             .takeOrdered(10, key=lambda x: -x[1])

result

[(('182 ', '489 '), 0.5714285714285714),
 (('443 ', '489 '), 0.5294117647058824),
 (('182 ', '443 '), 0.48214285714285715),
 (('245 ', '476 '), 0.3333333333333333),
 (('457 ', '476 '), 0.3333333333333333),
 (('165 ', '447 '), 0.3125),
 (('357 ', '457 '), 0.3125),
 (('271 ', '490 '), 0.3076923076923077),
 (('150 ', '277 '), 0.3076923076923077),
 (('197 ', '354 '), 0.29411764705882354)]

In [251]:
print_results(result)

Jaccard Similarity Ranking: 1 Score: 0.57143


Question 1
Is geology a good major? Things you can consider for this specific question...

Are there many job opportunities in geology?
Is there a lot of room for growth in the field of geology?
If you majored in geology, are you happy with your choice of major?

I'm a CareerVillage staff member and I'm posting this because we know that many young people are looking for the answer to this question. This is among the most popular questions searched by youth, and we're hoping you will take a moment to share your response to it. Thank you! #geology #science #earth #earth-science 


['choice', 'popular', 'response', 'hoping', 'is', 'if', 'happy', 'geology', 'thank', 'questions', 'many', 'posting', 'job', 'growth', 'answer', 'question', 'are', 'searched', 'consider', 'majored', 'this', 'good', 'staff', 'room', 'share', 'specific', 'careervillage', 'lot', 'member', 'looking', 'major', 'opportunities', 'moment', 'among', 'young', 'take', 'know', 

While these results show that our Jaccard Score is working properly, we can see that is being incorrectly linking questions because of the presence of the phrases:

<ul>
<li><i>   "I'm a CareerVillage staff member and I'm posting this because we know that many young people are looking for the answer to this question. This is among the most popular questions searched by youth, and we're hoping you will take a moment to share your response to it. Thank you!"</i>

<li><i>Things you can consider for this specific question...</i>
</ul>
As a result, we need to remove this phrase and redo our pre-processing....

In [169]:
#Updates Code to Remove these Bad Phrases
bad_phrases = [
                "I'm a CareerVillage staff member and I'm posting this because we know that many young people are looking for the answer to this question. This is among the most popular questions searched by youth, and we're hoping you will take a moment to share your response to it. Thank you!",
                "Things you can consider for this specific question..."
              ]

def filter_stop_words(sentence,bad_phrases=bad_phrases):
    """
        Removes Stop Words / I / Punctuation 
        
        Input:
            Sentence: String 
        
        Ouput:
            Set of Unique Words in input
    """
    for i in bad_phrases: 
        sentence = sentence.replace(i,'')
    sentence = contractions.fix(sentence)#Converts contractsion to real words i.e. "don't" to "do not" 
    word_tokens = tknzr.tokenize(sentence) 
    filtered_sentence = [w.lower() for w in word_tokens if not w in stop_words and w.isalpha() and w.lower()!='i'] 
    filtered_sentence = list(set(filtered_sentence))
    return filtered_sentence

## Redoes Small Sample to Test code

In [170]:
sentence_data = questions_data_df['full_question'][1:500].apply(filter_stop_words)
sentence_data.to_csv(r'./data/sample_sentence_datav2.txt', header=None, sep=' ', mode='a')

In [171]:
# load data into Spark
data2 = sc.textFile("./data/sample_sentence_datav2.txt")

In [240]:
result2 = data2.map(lambda line: line.split('\"')) \
             .flatMap(splitWords) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(makeCompositeKey) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(jaccard) \
             .takeOrdered(10, key=lambda x: -x[1])

result2

[(('245 ', '476 '), 0.3333333333333333),
 (('457 ', '476 '), 0.3333333333333333),
 (('165 ', '447 '), 0.3125),
 (('357 ', '457 '), 0.3125),
 (('271 ', '490 '), 0.3076923076923077),
 (('150 ', '277 '), 0.3076923076923077),
 (('197 ', '354 '), 0.29411764705882354),
 (('202 ', '467 '), 0.29411764705882354),
 (('467 ', '69 '), 0.2857142857142857),
 (('354 ', '490 '), 0.2857142857142857)]

In [241]:
print_results(result2)

Jaccard Similarity Ranking: 1 Score: 0.33333


Question 1
How do I successfully apply to college? college applications #college #career #graduate 


['applications', 'apply', 'successfully', 'college', 'how']


Question 2
How to study in College? #engineer 


['study', 'college', 'how']
__________________________________________________


Jaccard Similarity Ranking: 2 Score: 0.33333


Question 1
What are some good study habits to have when you are in college? How can I improve my studying once I get to college. #help 


['how', 'studying', 'improve', 'get', 'what', 'habits', 'study', 'college', 'good']


Question 2
How to study in College? #engineer 


['study', 'college', 'how']
__________________________________________________


Jaccard Similarity Ranking: 3 Score: 0.31250


Question 1
How does a master degree affect your career and pay? Accounting student at Towson university  #accounting


['university', 'degree', 'master', 'pay', 'towson', 'career', 'student', 'affect', 'how', 'a

These results are more in line with expectations as while some high variability in releavance between the two "two similar" questions. For example in the first example, the second question has three relevant words (Study,How, College) of which 2 of those appear in Q1, so the score is high as 2/3 words appear in the first question even in the relavance is not overly high.

Then looking at the last sentence we can see that both have high school in them, but the meaning of the two is very different.

How would the results change if we used a different similarity metric, such as Cosine similarity.

### Cosine Similarity

In [190]:
def cosine(line):
    """Compute similarity scores"""
    (doc1, n1), (doc2, n2) = ast.literal_eval(line[0])
    total = int(line[1])
    cosine = total / np.sqrt(int(n1)*int(n2))
    yield (doc1,doc2), cosine

In [243]:
result3 = data2.map(lambda line: line.split('\"')) \
             .flatMap(splitWords) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(makeCompositeKey) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(cosine) \
             .takeOrdered(10, key=lambda x: -x[1])

#Evaluate Our top 3 Similar Documents
print_results(result3,metric_type="Cosine")

Cosine Similarity Ranking: 1 Score: 0.57735


Question 1
What are some good study habits to have when you are in college? How can I improve my studying once I get to college. #help 


['how', 'studying', 'improve', 'get', 'what', 'habits', 'study', 'college', 'good']


Question 2
How to study in College? #engineer 


['study', 'college', 'how']
__________________________________________________


Cosine Similarity Ranking: 2 Score: 0.51640


Question 1
How do I successfully apply to college? college applications #college #career #graduate 


['applications', 'apply', 'successfully', 'college', 'how']


Question 2
How to study in College? #engineer 


['study', 'college', 'how']
__________________________________________________


Cosine Similarity Ranking: 3 Score: 0.48113


Question 1
What study habits are most important in college? As a homeschool student, I get concerned that I won't meet expectations in college.
#studyhabits 


['meet', 'important', 'concerned', 'as', 'homeschool',

As we can see, some inconsitency remains. As a new step, let's shift from using unigrams to bigrams and see how things change.

In [261]:
#Updates Code to Remove these Bad Phrases
bad_phrases = [
                "I'm a CareerVillage staff member and I'm posting this because we know that many young people are looking for the answer to this question. This is among the most popular questions searched by youth, and we're hoping you will take a moment to share your response to it. Thank you!",
                "Things you can consider for this specific question..."
              ]

def create_bigrams(sentences,bad_phrases=bad_phrases,gram_size=2):
    """
        Removes Stop Words / I / Punctuation 
        
        Input:
            Sentence: String 
        
        Ouput:
            Set of Unique Words in input
    """
    for i in bad_phrases: 
        sentences = sentences.replace(i,'')
    sentences = nltk.sent_tokenize(sentences)
    grams = []
    for sentence in sentences:
        sentence = contractions.fix(sentence)
        word_tokens = tknzr.tokenize(sentence) 
        filtered_sentence = [w.lower() for w in word_tokens if not w in stop_words and w.isalpha() and w.lower()!='i'] 
        ngrams = zip(*[filtered_sentence[i:] for i in range(gram_size)])
        grams += [" ".join(ngram) for ngram in ngrams]
    return list(set(grams))

In [277]:
sentence_data = questions_data_df['full_question'][1:500].apply(create_bigrams)
sentence_data.to_csv(r'./data/sample_sentence_datav3.txt', header=None, sep=' ', mode='a')
data3 = sc.textFile("./data/sample_sentence_datav3.txt")

In [264]:
result4 = data3.map(lambda line: line.split('\"')) \
             .flatMap(splitWords) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(makeCompositeKey) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(cosine) \
             .takeOrdered(10, key=lambda x: -x[1])

In [265]:
#Evaluate Our top 3 Similar Documents
print_results(result4,metric_type="Cosine",filter_function=create_bigrams)

Cosine Similarity Ranking: 1 Score: 1.37199


Question 1
How do I apply to become a flight attendant? I want to work and travel for a couple of years before applying to college. This question was posted by a CareerVillage administrator on behalf of the students of CareerVillage. #aviation #airline-industry #flight-attendant


['this question', 'want work', 'work travel', 'apply become', 'careervillage administrator', 'students careervillage', 'become flight', 'applying college', 'administrator behalf', 'posted careervillage', 'years applying', 'how apply', 'flight attendant', 'behalf students', 'couple years', 'travel couple', 'question posted']


Question 2
What made you want to work in the airline industry? <p>I have no idea what I want to do when I grow up. I don't really even know where to start. I do love to travel, so I thought I'd ask, what made you want to work in this industry? If you have any advice as to how I should go about picking a career within the industry, please let 

Using questions with bigrams vs. unigrams results in questions thaqt align, when you look at thinhs that become how many years does it take, but does not correctly distinguish between specialties. 

How do things change if we used Jaccard rather than Cosine here?

In [266]:
result5 = data3.map(lambda line: line.split('\"')) \
             .flatMap(splitWords) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(makeCompositeKey) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(jaccard) \
             .takeOrdered(10, key=lambda x: -x[1])

print_results(result5,metric_type="Jaccard",filter_function=create_bigrams)

Jaccard Similarity Ranking: 1 Score: 2.00000


Question 1
How does a master degree affect your career and pay? Accounting student at Towson university  #accounting


['master degree', 'student towson', 'affect career', 'degree affect', 'towson university', 'career pay', 'how master', 'accounting student']


Question 2
What is the difference between the certificates and associate in science (AS) degree in accounting? Accounting student at Towson university  #accounting


['degree accounting', 'what difference', 'science as', 'certificates associate', 'student towson', 'as degree', 'associate science', 'towson university', 'difference certificates', 'accounting student']
__________________________________________________


Jaccard Similarity Ranking: 2 Score: 1.88235


Question 1
How do I apply to become a flight attendant? I want to work and travel for a couple of years before applying to college. This question was posted by a CareerVillage administrator on behalf of the students of Car

There is some improvement, but still somethings are lacking. Lets look at a combination of unigrams and bigrams to see if that provides meaningful improvement.

In [271]:
def create_multi_grams(sentences,bad_phrases=bad_phrases,gram_size=[1,2]):
    """
        Removes Stop Words / I / Punctuation 
        
        Input:
            Sentence: String 
        
        Ouput:
            Set of Unique Words in input
    """
    for i in bad_phrases: 
        sentences = sentences.replace(i,'')
    sentences = nltk.sent_tokenize(sentences)
    grams = []
    for sentence in sentences:
        sentence = contractions.fix(sentence)
        word_tokens = tknzr.tokenize(sentence) 
        filtered_sentence = [w.lower() for w in word_tokens if not w in stop_words and w.isalpha() and w.lower()!='i'] 
        for i in gram_size:
            ngrams = zip(*[filtered_sentence[i:] for i in range(i)])
            grams += [" ".join(ngram) for ngram in ngrams]
    return list(set(grams))

In [279]:
sentence_data = questions_data_df['full_question'][1:500].apply(create_multi_grams)
sentence_data.to_csv(r'./data/sample_sentence_datav4.txt', header=None, sep=' ', mode='a')
data4 = sc.textFile("./data/sample_sentence_datav4.txt")

In [284]:
result6 = data4.map(lambda line: line.split('\"')) \
             .flatMap(splitWords) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(makeCompositeKey) \
             .reduceByKey(lambda x,y : x+y) \
             .flatMap(jaccard) \
             .takeOrdered(10, key=lambda x: -x[1])

In [285]:
print_results(result6,metric_type="Jaccard",filter_function=create_multi_grams)

Jaccard Similarity Ranking: 1 Score: 0.25806


Question 1
How does a master degree affect your career and pay? Accounting student at Towson university  #accounting


['university', 'degree', 'master', 'pay', 'towson', 'master degree', 'student towson', 'career', 'affect career', 'student', 'degree affect', 'affect', 'towson university', 'career pay', 'how master', 'accounting student', 'how', 'accounting']


Question 2
What is the difference between the certificates and associate in science (AS) degree in accounting? Accounting student at Towson university  #accounting


['university', 'certificates associate', 'as', 'as degree', 'towson university', 'degree accounting', 'degree', 'science', 'what difference', 'towson', 'difference', 'certificates', 'what', 'associate science', 'student', 'difference certificates', 'associate', 'accounting', 'science as', 'student towson', 'accounting student']
__________________________________________________


Jaccard Similarity Ranking: 2 Score: 0.

We can see a significant improvement here, but things are still lacking as we see "how many years does it take" is rated similar regardless of career path.