In [0]:
# Utilities
from tqdm.auto import tqdm
tqdm.pandas()
import numpy as np
import pandas as pd

# NLP
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
from nltk import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("punkt")
nltk.download("vader_lexicon")

# Evaluation
from sklearn.metrics import accuracy_score,classification_report

PATH = "/content/drive/My Drive/sen_ana/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [19]:
df = pd.read_csv(PATH+"candidate_hospitals_translated.csv")
df

Unnamed: 0,comment,score,hospital,en
0,เคยได้ไปลองใช้บริการคลีนิคนอกเวลาของรามาเมื่อต...,5,ramathibodi-hospital,"Ever were going to try to use the service, the..."
1,ผมเคยไปทำเลสิกที่นี้ ดีมากครับตอนนี้ผ่ามาจะ2ปี...,4,ramathibodi-hospital,I've been to Lasik the this good now. coming t...
2,รักษามาหลายปีแล้วตั้งแต่อายุ14 ตอนนี้จะจบปริญญ...,5,ramathibodi-hospital,Treatment for years since the age of 14 will n...
3,ที่ตึกพระเทพชั้น 2 สถานที่เครื่องมือดูทันสมัย ...,3,ramathibodi-hospital,"Building the floor 2. place tools, modern look..."
4,Its a public hospital so service is bad (nurse...,4,ramathibodi-hospital,Its a public hospital so service is bad (the n...
...,...,...,...,...
702,ค่ำวันที่ 26 ตุลาคม 2559 เหตุการณ์ในวันนั้นฉัน...,5,king-chulalongkorn-memorial-hospital,"Evening of October 26, 2559 event on that day,..."
703,ทำเพื่ออะไรรีวิวนี้แต่เราไปรักษารพจุฬาก็ได้รับ...,5,king-chulalongkorn-memorial-hospital,"Do what for this review, but we to maintain th..."
704,ทาง รพ. บริการคนไข้ดีมาก ใส่ใจผู้ป่วย เป็นกันเ...,5,king-chulalongkorn-memorial-hospital,"The hospital. Services, patient, very attentiv..."
705,หมอเก่ง,3,king-chulalongkorn-memorial-hospital,The doctor brilliantly


In [0]:
# Stopword removal and Lemmanization

nlp = spacy.load("en_core_web_sm")

def stop(text):
  cleaned_token = []
  doc = nlp(text) 

  for word in doc:
    if word.lemma_ not in STOP_WORDS:
      cleaned_token.append(word.lemma_)
  return cleaned_token

In [0]:
# cleaned in sentences

sent_clean = pd.DataFrame(df.en.progress_apply(lambda x: stop(x)))
sent_clean = pd.DataFrame(sent_clean.en.progress_apply(lambda x:' '.join(x)))
sent_clean["score"] = df.score
sent_clean["hospital"] = df.hospital

HBox(children=(FloatProgress(value=0.0, max=707.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=707.0), HTML(value='')))




In [0]:
# score 3 ignored, score 1,2 as bad, score 4,5 as good

sent_clean = sent_clean.loc[sent_clean.score != 3]
sent_clean["sen"] = 0
sent_clean.loc[sent_clean.score.isin([4,5]),["sen"]] = 1
sent_clean

Unnamed: 0,en,score,hospital,sen
0,"try use service , clinic , Rama beginning trea...",5,ramathibodi-hospital,1
1,"-PRON- Lasik good . come 2 year , price good m...",4,ramathibodi-hospital,1
2,"treatment year age 14 finish -PRON- degree , s...",5,ramathibodi-hospital,1
4,-PRON- public hospital service bad ( nurse com...,4,ramathibodi-hospital,1
5,-PRON- friend disease pulmonary leak doctor ca...,5,ramathibodi-hospital,1
...,...,...,...,...
701,"large hospital , lot people , doctor nurse , g...",5,king-chulalongkorn-memorial-hospital,1
702,"evening October 26 , 2559 event day , -PRON- r...",5,king-chulalongkorn-memorial-hospital,1
703,"review , -PRON- maintain CHULA treat , . today...",5,king-chulalongkorn-memorial-hospital,1
704,"hospital . service , patient , attentive , pat...",5,king-chulalongkorn-memorial-hospital,1


In [0]:
sent_clean.to_csv(PATH+"sentence_clean.csv",index=False)

In [0]:
def comment_polarity(df):
  sa = SentimentIntensityAnalyzer()
  polar = []
  for i in df.index:
    # sentiment per sentence of each comment
    for sentence in sent_tokenize(df.loc[i, 'en']):
      sentiment_distribution = sa.polarity_scores(sentence)
      score = sentiment_distribution.get("compound")
      # total of the comment
      score += score
    if score >=0:
      polar.append(1)
    else:
      polar.append(0)
  return polar

In [0]:
polar = comment_polarity(sent_clean)

In [0]:
accuracy_score(sent_clean.sen,polar)

0.797191887675507

In [0]:
print(classification_report(sent_clean.sen,polar))

              precision    recall  f1-score   support

           0       0.49      0.39      0.43       127
           1       0.86      0.90      0.88       514

    accuracy                           0.80       641
   macro avg       0.67      0.64      0.65       641
weighted avg       0.78      0.80      0.79       641

