# Text-Based Sentiment Analysis

# IMPORTING NECESSARY MODULES

In [1]:
import numpy as np # For linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # For Visualisation
%matplotlib inline
import seaborn as sns  # For Visualisation
from bs4 import BeautifulSoup  # For Text Parsing
from ydata_profiling import ProfileReport  # For generating data report

# IMPORTING DATASET

In [2]:
# Read the raw training utterances from the CSV in the working directory
# and preview the first 15 rows.
train_csv_path = './train.csv'
data = pd.read_csv(train_csv_path)
data.head(15)

Unnamed: 0,conversation_id,utterance_id,speaker,text,saliency
0,742555640001,0,agent,"May I know your name, please?",
1,742555640001,1,customer,"Uh yes, my name is {full_name:***** ****}.",
2,742555640001,2,agent,", {first_name:*****}. How's your day going, {f...",
3,742555640001,3,customer,It's going good. Thank you for asking.,
4,742555640001,4,agent,That's great. So how can I help you?,
5,742555640001,5,agent,mhm.,
6,742555640001,6,customer,"Uh yes, sir. I'm a new Raven member and I was ...",Reason for Contact
7,742555640001,7,agent,mhm.,
8,742555640001,8,agent,"Yes, sir.",
9,742555640001,9,customer,That has to be canceled by the 23rd. It had a ...,Reason for Contact


## Data Visualization

In [3]:
# Build a profiling report for the loaded frame; leaving `profile` as the
# cell's last expression displays it in the notebook.
report_title = "Saliency Data Profiling Report"
profile = ProfileReport(data, title=report_title)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## Data Preparation

In [4]:
def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's built-in ``html.parser`` and
    only the extracted text is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
# Keep the original 'text' column and store its markup-free version
# alongside it as 'rich_text'.
markup_free_text = data['text'].apply(strip_html)
data['rich_text'] = markup_free_text

data

Unnamed: 0,conversation_id,utterance_id,speaker,text,saliency,rich_text
0,742555640001,0,agent,"May I know your name, please?",,"May I know your name, please?"
1,742555640001,1,customer,"Uh yes, my name is {full_name:***** ****}.",,"Uh yes, my name is {full_name:***** ****}."
2,742555640001,2,agent,", {first_name:*****}. How's your day going, {f...",,", {first_name:*****}. How's your day going, {f..."
3,742555640001,3,customer,It's going good. Thank you for asking.,,It's going good. Thank you for asking.
4,742555640001,4,agent,That's great. So how can I help you?,,That's great. So how can I help you?
...,...,...,...,...,...,...
774,742579770001,128,customer,I.,,I.
775,742579770001,129,agent,mhm.,,mhm.
776,742579770001,130,agent,A.,,A.
777,742579770001,131,customer,Ma'am?,,Ma'am?


In [7]:
import nltk  #Natural Language Processing Toolkit
def punc_clean(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Characters are compared against ``string.punctuation``; everything
    else (letters, digits, whitespace) is kept in its original order.
    """
    from string import punctuation
    return ''.join(ch for ch in text if ch not in punctuation)
# Strip punctuation from the markup-free text and store the result under
# 'Extracted text', the column the following cells read. The previous
# version read 'Extracted text' before any cell had created it and wrote
# the result to a column named ''.
# NOTE(review): 'rich_text' is used as the source because it is the only
# cleaned column that exists at this point — confirm this matches how
# train-cleaned.csv was originally produced.
data['Extracted text'] = data['rich_text'].apply(punc_clean)

In [8]:
def remove_stopword(text):
    """Tokenize ``text`` and drop English stop words, keeping 'not'.

    The text is split with ``nltk.word_tokenize``; tokens that exactly match
    an entry of NLTK's English stop-word list (with 'not' removed from that
    list) are discarded, and the remaining tokens are joined with single
    spaces. Matching is an exact string comparison, so it is case-sensitive.

    The stop-word set is built on the first call and cached on the function
    object, so applying this to a whole column does not reload the word list
    for every row.
    """
    stopword = getattr(remove_stopword, "_stopwords", None)
    if stopword is None:
        words = nltk.corpus.stopwords.words('english')
        words.remove('not')  # keep negation in the cleaned text
        stopword = frozenset(words)
        remove_stopword._stopwords = stopword
    return ' '.join(w for w in nltk.word_tokenize(text) if w not in stopword)
# Overwrite the 'Extracted text' column with its stop-word-filtered version.
extracted_text = data['Extracted text']
data['Extracted text'] = extracted_text.apply(remove_stopword)

In [6]:
# Load the pre-cleaned dataset from disk; this rebinds `data`, replacing
# the frame built by the cells above.
cleaned_csv_path = 'train-cleaned.csv'
data = pd.read_csv(cleaned_csv_path)
data

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,conversation_id,utterance_id,speaker,text,saliency,rich_text,Extracted text
0,0,0,742555640001,0,agent,"May I know your name, please?",0,"May I know name , please ?",May I know your name please
1,1,1,742555640001,1,customer,"Uh yes, my name is {full_name:***** ****}.",0,"Uh yes , name { full_name : * * * * * * * * * } .",Uh yes my name is fullname
2,2,2,742555640001,2,agent,", {first_name:*****}. How's your day going, {f...",0,", { first_name : * * * * * } . How 's day goin...",firstname Hows your day going firstname
3,3,3,742555640001,3,customer,It's going good. Thank you for asking.,0,It 's going good . Thank asking .,Its going good Thank you for asking
4,4,4,742555640001,4,agent,That's great. So how can I help you?,0,That 's great . So I help ?,Thats great So how can I help you
...,...,...,...,...,...,...,...,...,...
774,774,774,742579770001,128,customer,I.,0,I .,I
775,775,775,742579770001,129,agent,mhm.,0,mhm .,mhm
776,776,776,742579770001,130,agent,A.,0,A .,A
777,777,777,742579770001,131,customer,Ma'am?,0,Ma'am ?,Maam


## Model building

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary of unigrams and bigrams from the cleaned
# utterances, then encode that same column as the feature matrix.
corpus = data['Extracted text']

vectr = TfidfVectorizer(min_df=1, ngram_range=(1, 2))
vectr.fit(corpus)

vect_X = vectr.transform(corpus)

In [10]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Two base learners combined by majority ("hard") vote.
svm_classifier = SVC(kernel='linear', probability=True)
logistic_classifier = LogisticRegression()

ensemble_members = [
    ('svm', svm_classifier),
    ('logistic', logistic_classifier),
]
model = VotingClassifier(estimators=ensemble_members, voting='hard')

# Train on the full TF-IDF matrix with the 'saliency' column as the target.
clf = model.fit(vect_X, data['saliency'])

In [11]:
import json
import os
import time

import requests
from profanityfilter import ProfanityFilter

model_name = "Dabid/abusive-tagalog-profanity-detection"
endpoint = f"https://api-inference.huggingface.co/models/{model_name}"

input_text = "like"
payload = {
    "inputs": input_text,
    "options": {
        "max_length": 50,
        "temperature": 1,
    },
}

# Local profanity filter; the prediction function further down calls it via `pf`.
pf = ProfanityFilter()

# The API token must never be committed with the notebook: read it from the
# environment instead. Without a token the remote check is skipped so the
# rest of the notebook still runs.
hf_token = os.environ.get("HF_API_TOKEN")

MAX_ATTEMPTS = 3          # how many times to call the API while the model loads
RETRY_DELAY_SECONDS = 20  # pause between attempts

prof = ''
if not hf_token:
    print("HF_API_TOKEN is not set; skipping the Hugging Face API call.")
else:
    headers = {"Authorization": f"Bearer {hf_token}"}
    for attempt in range(1, MAX_ATTEMPTS + 1):
        response = requests.post(endpoint, json=payload, headers=headers)
        if response.status_code == 200:
            result = json.loads(response.text)
            # NOTE(review): assumes the response is a nested list whose first
            # inner entry is the label/score pair to use — confirm against
            # the API's response format.
            filtered_text = result[0]
            p = filtered_text[0]["label"]
            val = filtered_text[0]["score"]
            # Only a confident "Non-Abusive" label counts as non-profane;
            # anything else is treated as profane.
            if p == "Non-Abusive" and val > 0.75:
                prof = "Non Profane"
            else:
                prof = "Profane"
            break
        elif "Model is currently loading" in response.text:
            if attempt == MAX_ATTEMPTS:
                print("Model is still loading after", MAX_ATTEMPTS, "attempts; giving up.")
            else:
                print("Model is still loading. Retrying in a few seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
        else:
            print("API call failed with status code:", response.status_code)
            break

# PREDICTION

In [12]:
def predict(text):
    """Classify one utterance for saliency and profanity.

    Parameters
    ----------
    text : str
        The utterance to score. It is vectorized with the fitted ``vectr``
        and classified with ``clf``; profanity is checked with ``pf``.

    Returns
    -------
    dict
        ``"salient"``: ``"Salient"`` if the classifier's predicted label is
        non-zero, otherwise ``"Not salient"``.
        ``"profanity"``: ``True`` if ``pf.is_profane(text)`` is truthy,
        otherwise ``False``.
        ``"censored_text"``: the result of ``pf.censor(text)``.
    """
    senti = clf.predict(vectr.transform([text]))

    prof = bool(pf.is_profane(text))
    # The censored text is computed the same way whether or not the text is profane.
    censored_text = pf.censor(text)

    # `senti` is a one-element array; index it explicitly, because calling
    # int() on a whole array with ndim > 0 is deprecated in recent NumPy.
    if int(senti[0]):
        text_sent = "Salient"
    else:
        text_sent = "Not salient"

    return {
        "salient": text_sent,
        "profanity": prof,
        "censored_text": censored_text
    }

In [13]:
import pandas as pd
# data = pd.read_csv('./eval.csv')

# Result buckets filled while scoring every cleaned utterance.
ans, salient, non_salient, profane, non_profane = [], [], [], [], []

for utterance in data['Extracted text']:
    result = predict(utterance)

    # Each utterance goes into exactly one saliency bucket and is always
    # recorded in `ans`.
    saliency_bucket = salient if result['salient'] == "Salient" else non_salient
    saliency_bucket.append(utterance)
    ans.append(utterance)

    # Profane utterances are recorded in `ans` a second time.
    if result['profanity']:
        profane.append(utterance)
        ans.append(utterance)
    else:
        non_profane.append(utterance)

In [14]:
# Spot-check the classifier on a single hand-written utterance.
sample_features = vectr.transform(["uhhh...."])
clf.predict(sample_features)

array([1])

In [15]:
# Spot-check the classifier on a second hand-written utterance.
sample_features = vectr.transform(["mhm . Let check happening ."])
clf.predict(sample_features)

array([1])

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4d282b10-3250-4323-8e4d-a3cd11fe8ba3' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>