In [3]:
import os
import pandas as pd
import numpy as np

# TextRank

In [None]:
# https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70

In [85]:
eng_reviews_df = pd.read_json("preprocessed_english_reviews.json")
eng_reviews_df.tail(1)

Unnamed: 0,comments,comments_cleaned,comments_l,comments_token,comments_token_str,date,id,index,language,listing_id,probability,review_length,reviewer_id,reviewer_name,sentence_length
362299,The host canceled this reservation 4 days befo...,host canceled reservation days arrival automat...,the host canceled this reservation 4 days befo...,"[host, cancel, reserv, day, arriv, autom, post]",host cancel reserv day arriv autom post,2019-05-04,447965634,452636,en,34384353,0.958796,87,49082420,Maxime,14


In [36]:
eng_reviews_df = eng_reviews_df[~eng_reviews_df.comments.str.contains("The host canceled this reservation")]

# Extraction Method

## Using Tfidf

https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/

In [37]:
from nltk.tokenize import sent_tokenize
eng_reviews_df['sentences'] = eng_reviews_df['comments'].apply(lambda x: sent_tokenize(x))

In [12]:
from datajanitor.text import create_ngram, remove_stopword, symbols_replaced, decontracted, remove_html_tags, split_words_and_punctuation, apply_text_normalisation
from utils import keep_token_pattern, chain

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

english_stop_words = stopwords.words('english')
sb = SnowballStemmer("english").stem

operations = [{"function": symbols_replaced},
              {"function": decontracted},
              {"function": remove_html_tags},
              {"function": split_words_and_punctuation},
              {"function": remove_stopword, "stopword_list": english_stop_words},
              {"function": keep_token_pattern, "pattern": '[a-zA-Z]{3,}'}
             ]

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(token):
    return token

count_vectorizer = TfidfVectorizer(
                        strip_accents='unicode',
                        preprocessor=dummy_fun,
                        analyzer='word',
                        ngram_range=(1, 1),
                        min_df=10,
                        use_idf=True, smooth_idf=True, 
                        max_features = 1000)

bag_of_words = count_vectorizer.fit(eng_reviews_df['comments_token_str'])

In [39]:
eng_reviews_df['sentences_cleaned'] = eng_reviews_df['sentences'].apply(lambda x: [chain(sentence.lower(), operations) for sentence in x])

In [46]:
grouped_eng_reviews = eng_reviews_df.groupby('listing_id')

In [83]:
listing_2818 = grouped_eng_reviews.get_group(2818)
listing_2818_review = np.array([review_sentence for review_sentences in listing_2818['sentences_cleaned'].tolist() for review_sentence in review_sentences])
listing_2818_review_weights = [count_vectorizer.transform(sentence).sum(axis=0) if sentence else np.array(0) for sentence in listing_2818_review]
listing_2818_sentence_weights = np.array([w.sum() for w in listing_2818_review_weights])
top_3_sentences_pos_listing_2818 = listing_2818_sentence_weights.argsort()[-3:][::-1]
top_3_listing_2818_sentences = np.array(listing_2818_review)[top_3_sentences_pos_listing_2818]

In [84]:
[" ".join(top_sentence) for top_sentence in top_3_listing_2818_sentences]

['room comfortable colourful light quiet equipped everything could possibly need daniel flat spotless beautifully furnished great location daniel perfect host spending first hour arrival talking amsterdam answering many questions showing get around',
 'also nicely laid tea coffee set room different tour books amsterdam helped plan days plastic map could carry throughout adventures bicycle available rent matching set shampoo conditioner lotion bathroom etc adapter plugs',
 'daniel supplied electric water heater room could consume tea coffee leisure added laundry showed use hidden airbnb map real time supplied lanyard phone number never used reassuring']

In [91]:
listing_df = grouped_eng_reviews.get_group(82482)
get_listing_summary(listing_df, count_vectorizer)

['nice room negative thing roof light made hard sleep wasnt dark witch two younger children hot sun came really small bathroom hard help kids shower small shower',
 'loved stay shawna flat lightfull better put eyes cover clean website hidden airbnb central next grocery shops kind website hidden airbnb nice family let make reservation van gogh museum make cue',
 'friendly host family amazing appartement amsterdam next visit would definitely like live supermarket next door open lots pubs caf restaurants cash machine min walk leidsplein aawww airport wifi ofcourse stereo equipment left headphone connector cable plug ipod iphone whatever']

In [89]:
def get_listing_summary(listing_df, count_vectorizer, top_n_sentence=3):
    """
    Importantly, to ensure long sentences do not have unnecessarily high scores over short sentences, we divided each score of a sentence by the number of words found in that sentence.
    """
    listing_review = np.array([review_sentence for review_sentences in listing_df['sentences_cleaned'].tolist() for review_sentence in review_sentences])
    listing_review_weights = [count_vectorizer.transform(sentence).sum(axis=0) if sentence else np.array(0) for sentence in listing_review]
    listing_sentence_weights = np.array([w.sum() for w in listing_review_weights])
    top_3_sentences_pos_listing = listing_sentence_weights.argsort()[-1*top_n_sentence:][::-1]
    top_3_listing_sentences = [" ".join(sentence) for sentence in np.array(listing_review)[top_3_sentences_pos_listing]]
    return top_3_listing_sentences

In [None]:
def _calculate_average_score(sentence_weight) -> int:
   
    # Calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    # Getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

# Abstraction Method