# Web Mining Final Project
## Dalton J. Francis


In [21]:
import pickle
import requests
from bs4 import BeautifulSoup
import nltk
import numpy
from nl import score_sentences
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
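# One-time setup: uncomment to download the NLTK resources this script relies on
# (tokenizer models, the English stopword list, and the VADER sentiment lexicon).
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('vader_lexicon')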


def tf(term,doc):
    """Return the raw term frequency of `term` in `doc`.

    The document is lower-cased and split on whitespace; a token counts only
    when it equals the lower-cased term exactly, so punctuation attached to a
    word (e.g. "cat.") prevents a match.
    """
    needle = term.lower()
    return sum(1 for token in doc.lower().split() if token == needle)
    
def _scrapeReviewPages(url, query, cssClass, pages, timeout):
    """Fetch `pages` consecutive result pages and collect the review texts.

    Each page is requested as url + '?page=<n>' + query (n starts at 1) and
    every <div> carrying `cssClass` is treated as one review.

    Returns a (reviews, sentences) pair: the raw review texts and the same
    texts split into sentences with sent_tokenize.
    """
    reviews = []
    sentences = []
    for page in range(1, pages + 1):
        # A timeout keeps one stalled connection from hanging the whole run;
        # requests waits indefinitely when none is given.
        response = requests.get(url + '?page=' + str(page) + query, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html5lib')
        for div in soup.findAll('div', {"class": cssClass}):
            reviews.append(div.text)
            sentences.extend(sent_tokenize(div.text))
    return reviews, sentences


def pullReviews(url, pages=3, timeout=10):
    """Scrape critic and audience reviews from a paginated review listing.

    Parameters:
        url: base URL of the review listing; the page/query string is appended.
        pages: number of pages fetched for each review type (default 3, the
            count the original hard-coded loops used).
        timeout: seconds to wait for each HTTP request before requests raises.

    Returns:
        (criticReviews, criticSentences, audienceReviews, audienceSentences),
        each a list of strings.

    Critic reviews are read from div.the_review on '?page=N&sort=' pages and
    audience reviews from div.user_review on '?page=N&type=user' pages.
    NOTE(review): the original comment mentioned 20 reviews per page; that
    depends on the site and is not enforced here.
    """
    critReviews, critSents = _scrapeReviewPages(url, '&sort=', 'the_review', pages, timeout)
    audiReviews, audiSents = _scrapeReviewPages(url, '&type=user', 'user_review', pages, timeout)
    return critReviews, critSents, audiReviews, audiSents

def vocabCheck(reviews):
    """Return up to ten of the most frequent non-stopword tokens in `reviews`.

    Every text is tokenized with nltk's word_tokenize and lower-cased. Tokens
    are ranked by descending frequency; English stopwords, a few tokenizer
    artifacts and single punctuation characters are skipped.
    """
    ignored = set(nltk.corpus.stopwords.words('english'))
    # Tokenizer artifacts (clitics, quote markers, ellipsis) that carry no meaning.
    ignored.update(["'s", "n't", "``", "...", "''"])

    tokens = [
        token.lower()
        for text in reviews
        for token in nltk.tokenize.word_tokenize(text)
    ]

    # Updating a set from a string adds each character individually, so every
    # punctuation mark below (including the curly quotes) becomes a stopword.
    ignored.update(",.'?''`:;’”“`]['-")

    counts = nltk.FreqDist(tokens)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    topTerms = []
    for term, _ in ranked:
        if term in ignored:
            continue
        topTerms.append(term)
        if len(topTerms) == 10:
            break
    return topTerms

def sentimentCheck(reviews):
    """Print a VADER-based sentiment summary for a collection of reviews.

    A review is counted as positive when its 'pos' score exceeds its 'neg'
    score, and as negative in the opposite case. Reviews whose 'pos' and
    'neg' scores tie are counted in neither list and contribute nothing to
    the polarity sum.

    The printed average is the sum of the 'compound' scores of the
    positive/negative reviews divided by the total number of reviews (ties
    included in the denominator). For an empty collection the average is
    reported as 0 instead of raising ZeroDivisionError.

    Prints three lines (positive count, negative count, average polarity)
    and returns None.
    """
    analyzer = SentimentIntensityAnalyzer()
    positiveReviews = []
    negativeReviews = []
    totalCompound = 0
    for review in reviews:
        # Score each review once; the analysis is the expensive step.
        scores = analyzer.polarity_scores(review)
        if scores['pos'] > scores['neg']:
            positiveReviews.append(review)
        elif scores['pos'] < scores['neg']:
            negativeReviews.append(review)
        else:
            # Exact tie between positive and negative: skipped entirely.
            continue
        totalCompound += scores['compound']

    reviewCount = len(reviews)
    avgPolarity = totalCompound / reviewCount if reviewCount else 0
    print('pos list: ',len(positiveReviews))
    print('neg list: ',len(negativeReviews))
    print('avg pol: ',avgPolarity)

def summarize(sentences, important_words, cthresh=5, top_sentences=5):
    """Build two extractive summaries from scored sentences.

    Scores come from score_sentences(sentences, important_words); presumably
    one numeric score per sentence, in sentence order -- verify against the
    `nl` module.

    Parameters:
        sentences: sequence of sentence strings, indexable by position.
        important_words: passed through to score_sentences.
        cthresh: accepted for compatibility; not used in this function.
        top_sentences: how many of the highest-scoring sentences to keep.

    Returns:
        dict with two space-joined strings:
          'top-n'      -- the `top_sentences` best-scoring sentences, in their
                          original order;
          'mean-score' -- every sentence scoring above
                          mean + 0.5 * standard deviation.
    """
    scores = score_sentences(sentences,important_words)
    cutoff = numpy.mean(scores) + 0.5 * numpy.std(scores)

    above_cutoff = []
    for position, score in enumerate(scores):
        if score > cutoff:
            above_cutoff.append(position)

    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Re-sort the winners by position so the summary reads in document order.
    best_positions = sorted(position for position, _ in ranked[:top_sentences])

    top_n_text = ' '.join(sentences[position] for position in best_positions)
    mean_score_text = ' '.join(sentences[position] for position in above_cutoff)
    return {'top-n': top_n_text, 'mean-score': mean_score_text}

# --- Driver script -----------------------------------------------------------
# Step 1 (disabled): scrape the reviews once and cache the four lists as pickle
# files, so later runs do not have to hit the website again. Uncomment this
# section to refresh the cache.
#criticReviews,criticSents,audienceReviews,audienceSents=pullReviews('https://www.rottentomatoes.com/m/us_2019/reviews/')
#with open('criticReviews.pkl','wb') as f:
#    pickle.dump(criticReviews,f)
#with open('criticSents.pkl','wb') as f:
#    pickle.dump(criticSents,f)
#with open('audienceReviews.pkl','wb') as f:
#    pickle.dump(audienceReviews,f)
#with open('audienceSents.pkl','wb') as f:
#    pickle.dump(audienceSents,f)

# Step 2: load the cached lists from the current working directory.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# .pkl files that were produced by the dump section above.
with open('criticReviews.pkl','rb') as f:
    criticReviews = pickle.load(f)
with open('criticSents.pkl','rb') as f:
    criticSents = pickle.load(f)
with open('audienceReviews.pkl','rb') as f:
    audienceReviews = pickle.load(f)
with open('audienceSents.pkl','rb') as f:
    audienceSents = pickle.load(f)
    
# Step 3: analyses. The vocabulary and sentiment checks are disabled; only the
# summaries run. Both summarize() results are bare expressions, so in a
# notebook only the last one (audience) is displayed and as a plain script
# neither is printed.
#vocabCheck(criticSents)
#vocabCheck(audienceReviews)
#sentimentCheck(criticReviews)
#sentimentCheck(audienceReviews)
summarize(criticSents,vocabCheck(criticReviews))
summarize(audienceSents,vocabCheck(audienceReviews))

{'top-n': '  Garbage recycled stuff like get out.not impressed again sorry Peele.   "Us" is an ambitiously intriguing horror film that touches upon complex themes to ponder upon, has sharp direction from Jordan Peele and a remarkable performance from Lupita Nyong\'o.   Us is a amazing and truly scary horror movie.   A very creepy and scary horror movie brimming with ideas which will populate your mind with thoughts that won\'t leave you alone. Instead, \'US\' manages to keep the plot very straight forward, while being a mystery, it uses standard horror film situations but fills them with tension in just the right way.',
 'mean-score': 'Unlike most horror movies these days, this film is not a garbage jump scare heavy or-stay up all night terrified-kind of film. A much more thought provoking and overall entertaining film with a nice twist.   Really stupid movie honestly. The lead actress did her best, acted well. "Get out" got to his head and tried making this movie smarter than what it 