In [1]:
import pandas as pd
from rapidfuzz import fuzz
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
import swifter

In [2]:
def cleanliness_count(review,word_set):
    """Flag whether a tokenised review contains any keyword from a category.

    Parameters
    ----------
    review : iterable of str
        Tokens of a single review (the caller in this notebook passes
        ``text.split()``).
    word_set : iterable of str or None
        Keywords to look for. ``None`` or an empty collection means there is
        nothing to match.

    Returns
    -------
    int
        1 if at least one keyword is exactly equal to a token, otherwise 0.
    """
    if not word_set:
        return 0
    # Only membership matters here, so a set of tokens is enough.
    tokens = set(review)
    # Always return an int: previously a non-empty word_set with no match
    # fell off the end of the function and produced None.
    return 1 if any(word in tokens for word in word_set) else 0
def cleanliness_count_fuzzy(review,word_set):
    """Fuzzy variant of the keyword flag.

    Every token in ``review`` is compared with every entry of ``word_set``
    using ``fuzz.ratio``; a score of 80 or more counts as a hit.

    Returns 1 as soon as any (token, keyword) pair reaches the threshold,
    otherwise 0.
    """
    close_match_found = any(
        fuzz.ratio(category_word, token) >= 80
        for token in review
        for category_word in word_set
    )
    return int(close_match_found)
  
    
def _shared_sentiment_analyzer():
    """Return a single SentimentIntensityAnalyzer, constructing it on first use."""
    analyzer = getattr(_shared_sentiment_analyzer, '_instance', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _shared_sentiment_analyzer._instance = analyzer
    return analyzer


def get_sentiment(listing_id, reviews_df, category,word_set=None):
    """Median and mean VADER compound score of one listing's reviews.

    Parameters
    ----------
    listing_id
        Value matched against ``reviews_df['listing_id']``.
    reviews_df : pandas.DataFrame
        One row per review, with ``listing_id`` and ``review`` columns.
    category : str
        Suffix used in the returned labels.
    word_set : iterable of str, optional
        When given and non-empty, only reviews for which
        ``cleanliness_count`` reports a keyword hit are scored. When omitted
        or empty, every review of the listing is scored.

    Returns
    -------
    pandas.Series
        Entries ``median_sentiment_{category}`` and
        ``average_sentiment_{category}``. Both are 0 when no review qualifies.
    """
    median_label = f'median_sentiment_{category}'
    average_label = f'average_sentiment_{category}'

    # Filter reviews for the given listing_id
    listing_reviews = reviews_df[reviews_df['listing_id'] == listing_id]
    if word_set:
        # Boolean mask instead of a scratch column, so the slice is never
        # mutated and does not need to be copied. bool() maps a None result
        # to False; astype(bool) keeps the mask boolean for an empty slice.
        keyword_hit = listing_reviews['review'].apply(
            lambda text: bool(cleanliness_count(text.split(), word_set=word_set))
        ).astype(bool)
        reviews = listing_reviews[keyword_hit]
    else:
        reviews = listing_reviews

    if reviews.empty:
        # Nothing to score: return before touching the analyzer at all.
        return pd.Series({median_label: 0, average_label: 0})

    # Reuse one analyzer across calls instead of rebuilding it every time
    # this function is invoked.
    sid = _shared_sentiment_analyzer()
    sentiment = reviews['review'].apply(
        lambda text: sid.polarity_scores(text)['compound']
    )
    return pd.Series({median_label: sentiment.median(),
                      average_label: sentiment.mean()})


In [3]:
def _read_word_list(path):
    """Read a dictionary file into a list of stripped, lower-cased lines."""
    with open(path, 'r') as handle:
        return [entry.strip().lower() for entry in handle]


# One keyword dictionary per review category.
clean_words = _read_word_list('./Dictionaries/cleanliness_words.txt')
accuracy_words = _read_word_list('./Dictionaries/accuracy_words.txt')
checkin_words = _read_word_list('./Dictionaries/checkin_words.txt')
communication_words = _read_word_list('./Dictionaries/communication_words.txt')
location_words = _read_word_list('./Dictionaries/location_words.txt')
value_words = _read_word_list('./Dictionaries/value_words.txt')

# Listings table; rows without comments are dropped, then each comments
# string is split on the '``' separator into a list of individual reviews.
df = pd.read_csv('./Data/combined_gt20_words_en_only.csv')
df.dropna(subset=['comments'], inplace=True)
df['comments'] = df['comments'].map(lambda text: text.split('``'))
# The result is not assigned, so this only displays a dtype-converted view
# of the frame as the cell output; df itself is left as it was.
df.convert_dtypes()

Unnamed: 0,listing_id,comments,review_count,name,description,neighborhood_overview,host_since,host_about,host_response_time,host_response_rate,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,44077,[we enjoyed our stay very much the room was co...,257,cosy comfortable irish cottage twin,our house is a cosy comfortable cottage with a...,i like our neighbourhood as there is no shorta...,5112,i am a friendly outgoing irishwoman who loves ...,within an hour,0.96,...,4.92,4.92,4.67,4.82,0,2,0,2,0,2.11
1,85156,[teresa and family were lovely hosts they were...,211,cosy comfortable irish cottage 1 double bed,our cottage is a charming light filled cottage...,i love dundrum and its surrounding areas there...,5112,i am a friendly outgoing irishwoman who loves ...,within an hour,0.96,...,4.91,4.89,4.65,4.78,0,2,0,2,0,1.9
2,159889,[margaret is very helpful and attentive she an...,340,friendly single room,washing can be done at a cost of 5 per load a ...,plenty of buses into the city and the area is ...,4783,used to host students and now i am looking for...,within an hour,1.0,...,4.84,4.87,4.64,4.71,0,3,0,3,0,2.9
3,162809,[robbie was very welcoming and we had a very g...,290,5.0,a nice place to relax after the bustle of the ...,close to the sea hill walks and the city via b...,4780,i live here on the third floor of my house the...,within a few hours,0.87,...,4.94,4.97,4.78,4.85,0,2,0,2,0,3.52
4,165828,[we had a wonderful time staying in grainne s ...,52,p draig pearse apt kilmainham,don t just visit dublin experience dublin in a...,enjoy a walk along the grand canal or just obs...,4777,i m gr inne originally from co kildare and rai...,within a day,0.75,...,4.83,4.76,4.39,4.54,0,4,4,0,0,0.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4508,1173346250133533628,[guilherme was a great host his apartment is s...,3,luxury room on the heart,gorgeous and spacious apartment at dublin city...,,1308,,within an hour,1.0,...,5.0,5.0,5.0,5.0,0,2,1,1,0,3.0
4509,1174669132225502884,[deeply recommend liliane is a wonderful perso...,1,cosy room in duplex raheny,cosy single room in a 100 meter square suplex ...,,2720,hi lovely to meet you i am liliane i have ...,within an hour,1.0,...,5.0,5.0,5.0,5.0,0,2,0,2,0,1.0
4510,1175337031194584866,[5.0],1,room in a family home dublin near tram station,about us we are a friendly family of three inc...,,956,,within an hour,1.0,...,5.0,5.0,5.0,5.0,0,1,0,1,0,1.0
4511,1176027794787210610,[thank you blas everything went perfect house ...,1,beautiful rom dublin 8,enjoy a stylish experience at this centrally l...,,857,,within a day,0.64,...,5.0,5.0,5.0,5.0,0,1,0,1,0,1.0


In [4]:
# Collapse duplicate entries in each keyword dictionary.
(
    clean_words,
    accuracy_words,
    checkin_words,
    communication_words,
    location_words,
    value_words,
) = (
    list(set(words))
    for words in (
        clean_words,
        accuracy_words,
        checkin_words,
        communication_words,
        location_words,
        value_words,
    )
)

# Category label -> keyword list; 'overall' carries no keyword filter.
category_list = dict(
    clean=clean_words,
    accurate=accuracy_words,
    checkin=checkin_words,
    communication=communication_words,
    location=location_words,
    value=value_words,
    overall=None,
)

In [5]:
# Flatten the per-listing comment lists into one record per review.
# The two columns are zipped directly rather than walked with iterrows(),
# which builds a Series for every row and does not preserve column dtypes.
reviews_data = [
    {'listing_id': listing_id, 'review': review}
    for listing_id, comments in zip(df['listing_id'], df['comments'])
    for review in comments
]

# Naming the columns explicitly keeps the expected schema even when there are
# no records at all (this replaces the placeholder frame that used to be
# created and then immediately overwritten).
reviews_df = pd.DataFrame(reviews_data, columns=['listing_id', 'review'])

In [6]:
def calculate_sentiments(key, value, df, reviews_df):
    """Add the median/average sentiment columns for one category to ``df``.

    ``get_sentiment`` is applied to every ``listing_id`` in ``df`` with
    ``category=key`` and ``word_set=value``; the two resulting values are
    written to ``median_sentiment_{key}`` and ``average_sentiment_{key}``.
    ``df`` is modified in place and also returned.
    """
    target_columns = [f'median_sentiment_{key}', f'average_sentiment_{key}']

    def _score_listing(listing_id):
        return get_sentiment(listing_id, reviews_df=reviews_df, category=key, word_set=value)

    df[target_columns] = df['listing_id'].apply(_score_listing)
    return df

# with concurrent.futures.ThreadPoolExecutor() as executor:
#     futures = [executor.submit(calculate_sentiments, key, value, df, reviews_df) for key, value in category_list.items()]
#     for future in concurrent.futures.as_completed(futures):
#         df = future.result()

# For each category, score every listing and store the two sentiment columns.
for key, value in category_list.items():
    sentiment_columns = [f'median_sentiment_{key}', f'average_sentiment_{key}']
    df[sentiment_columns] = df['listing_id'].swifter.apply(
        lambda listing_id: get_sentiment(
            listing_id, reviews_df=reviews_df, category=key, word_set=value
        )
    )


Dask Apply:   0%|          | 0/25 [00:00<?, ?it/s]

  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self.method)(*args, **kwargs)
  return getattr(__obj, self

In [None]:
# Write the listings frame, including the sentiment columns, to disk.
df.to_csv(
    './Data/cleaned_with_sentiment_scores.csv',
    encoding='utf-8',
    index=False,
)

In [None]:
# Show the first rows of the accuracy sentiment next to the review scores.
preview_columns = [
    'review_count',
    'median_sentiment_accurate',
    'review_scores_rating',
    'review_scores_accuracy',
]
df[preview_columns].head()