# Dependency Handling

In [1]:
# Install pytorch

!pip3 install torch torchvision torchaudio
#!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113



In [2]:
# Install Transformers (using for NLP model - allows for easily import model)
# Install Requests (allow requests to webpages for analysis)
# Install BeautifulSoup (extract relevant data from webpage)
# Install Pandas and Numpy (structure data)

!pip3 install requests transformers beautifulsoup4 pandas numpy



In [3]:
# AutoTokenizer - loads the tokenizer that converts a string into the token IDs the model consumes.
# AutoModelForSequenceClassification - loads the sequence-classification architecture and weights for a given checkpoint.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('max_colwidth', 400)

  from .autonotebook import tqdm as notebook_tqdm


# Setup Model (Initialization)

In [None]:
# Multilingual sentiment checkpoint: tokenizer and classifier are both loaded
# from the same Hugging Face model id.
tokenizer = AutoTokenizer.from_pretrained(
    'nlptown/bert-base-multilingual-uncased-sentiment'
)
model = AutoModelForSequenceClassification.from_pretrained(
    'nlptown/bert-base-multilingual-uncased-sentiment'
)

# Arabic-script sentiment pipelines, one per CAMeLBERT checkpoint
# (da / msa / ca / mix, as named in the model ids).
sa = pipeline(
    task='text-classification',
    model='CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment',
)
sa1 = pipeline(
    task='sentiment-analysis',
    model='CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment',
)
sa2 = pipeline(
    task='sentiment-analysis',
    model='CAMeL-Lab/bert-base-arabic-camelbert-ca-sentiment',
)
sa3 = pipeline(
    task='sentiment-analysis',
    model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment',
)

# Smoke test: classify a single short sentence with the `da` pipeline.
sentences = ['أنا بخير']
tmp = sa(sentences)

In [None]:

# Inspect the smoke-test prediction. A notebook cell only renders its final
# expression, so the three values are bundled into one tuple; written as three
# separate bare expressions, the first two would be evaluated and discarded.
len(tmp), tmp, tmp[0]['label']

# Encoding and Sentiment Calculation

In [None]:
# Encode three probe sentences with the multilingual tokenizer; each result is
# returned as a PyTorch tensor (`return_tensors='pt'`).
tokens0, tokens1, tokens2 = (
    tokenizer.encode(text, return_tensors='pt')
    for text in (
        'I am having a horrible day',
        'Yay!!',  # Means "happy"
        'اِک شخص بددُعا سی کر گیا مُجھ پر, وُہ عہدِ ترقِ وفَاسِی کرگیا مُجھ پر',  # Negative sentence
    )
)
tokens0

In [None]:
# Debug

# tokens
# tokens[0]
# tokenizer.decode(tokens[0])

In [None]:
# Forward passes for the three probe sentences. This is inference only, so
# gradient tracking is disabled: no autograd graph is built and the returned
# logits carry no `grad_fn`.
with torch.no_grad():
    result0 = model(tokens0)
    result1 = model(tokens1)
    result2 = model(tokens2)

In [None]:
# Dump the raw model outputs, one per line.
print(result0, result1, result2, sep='\n')

In [None]:
# The index of the largest logit is zero-based; adding 1 turns it into the
# rating reported below.
star_rating0 = result0.logits.argmax().item() + 1
star_rating1 = result1.logits.argmax().item() + 1
star_rating2 = result2.logits.argmax().item() + 1

print('Star Rating (result0): ', star_rating0)
print('Star Rating (result1): ', star_rating1)
print('Star Rating (result2): ', star_rating2)


# Parse Website

In [None]:
# Download the sad and funny poetry listing pages and collect the text of
# every <a> element whose class matches "UrduText".
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'}

# `requests` has no default timeout, so a stalled server would hang the cell
# forever; raise_for_status() makes an HTTP error fail here instead of letting
# an error page be parsed into an empty result further down.
req_sad = requests.get('https://hamariweb.com/poetries/sad_poetries2.aspx', headers=headers, timeout=30)
req_sad.raise_for_status()
req_funny = requests.get('https://hamariweb.com/poetries/funny_poetries4.aspx', headers=headers, timeout=30)
req_funny.raise_for_status()

soup_sad = BeautifulSoup(req_sad.text, 'html.parser')
soup_funny = BeautifulSoup(req_funny.text, 'html.parser')

# Both pages use the same class pattern; two names are kept so existing
# references to either one keep working.
regex_sad = re.compile('UrduText')
regex_funny = re.compile('UrduText')

result_website_sad = soup_sad.find_all('a', {'class': regex_sad})
result_website_funny = soup_funny.find_all('a', {'class': regex_funny})

reviews_sad = [tag.text for tag in result_website_sad]
reviews_funny = [tag.text for tag in result_website_funny]

# Debug
#reviews_sad
#reviews_funny


# Load Website Data into Dataframe and Get Star Rating

In [None]:
# Wrap each list of scraped texts in a single-column ('string') DataFrame.
df_sad, df_funny = (
    pd.DataFrame(np.array(texts), columns=['string'])
    for texts in (reviews_sad, reviews_funny)
)

# Debug
#soup
#result_website
#reviews
#df['string'].iloc[1]

In [None]:
def StringToScore(string):
    """Score `string` with the multilingual sentiment model.

    The text is encoded with the notebook-level `tokenizer`, run through the
    notebook-level `model`, and the 1-based index of the largest logit is
    returned (the "star rating" used throughout this notebook).

    Parameters
    ----------
    string : str
        Text to classify.

    Returns
    -------
    int
        Index of the highest-scoring class, plus one.
    """
    # Truncate at the token level: slicing characters in the caller does not
    # bound the token count (special tokens are added and a single character
    # can become its own token), and a BERT encoder cannot take more than 512
    # positions.
    tokens = tokenizer.encode(
        string,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(tokens)
    return int(torch.argmax(output.logits)) + 1

def StringToScore_Ar(string):
    """Classify `string` with the `sa3` sentiment pipeline.

    Parameters
    ----------
    string : str
        Text to classify.

    Returns
    -------
    The raw pipeline output, unchanged. Callers in this notebook index it as
    ``output[0]['label']`` and ``output[0]['score']``.
    """
    # Callers pass up to 1024 characters, which can tokenize to more positions
    # than the model accepts; let the pipeline's tokenizer truncate instead of
    # failing on long inputs. 512 assumes the usual BERT-base position limit
    # for this checkpoint -- TODO confirm against the model config.
    output = sa3(string, truncation=True, max_length=512)
    return output
    

In [None]:
# Debug
#StringToScore(df_sad['string'].iloc[0])

In [None]:
# Rate every scraped poem; only the first 512 characters of each text are
# handed to the scorer.
df_sad['sentiment'] = df_sad['string'].apply(
    lambda poem: StringToScore(poem[:512])
)
df_funny['sentiment'] = df_funny['string'].apply(
    lambda poem: StringToScore(poem[:512])
)

In [None]:
# Display the sad-poetry frame ('string' plus the computed 'sentiment' column).
df_sad

In [None]:
# Display the funny-poetry frame ('string' plus the computed 'sentiment' column).
df_funny

In [None]:
# Mean rating over all sad poems.
print(
    "Average Sentiment (Sad Poetry): ",
    df_sad.loc[:, 'sentiment'].mean(),
)

In [None]:
# Mean rating over all funny poems.
print(
    "Average Sentiment (Funny Poetry): ",
    df_funny.loc[:, 'sentiment'].mean(),
)

In [None]:
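# One-off export of the scraped poems, left commented out. The cells below read
# "text_sad.txt" and "text_funny.txt", so run these two lines once if those
# files do not exist yet.
#np.savetxt(r'text_sad.txt', df_sad['string'].values, fmt='%s', encoding='utf-8')
#np.savetxt(r'text_funny.txt', df_funny['string'].values, fmt='%s', encoding='utf-8')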

# Create Non-diacritized and Diacritized Dataframes

In [None]:
# Build the funny-poetry comparison frame: one row per line of
# "text_funny.txt" ('ND String') next to the same-index line of
# "predictions_d3_funny.txt" ('D String'), each with the label and score
# returned by StringToScore_Ar.
#
# The separator is a regular expression, which only the python parsing engine
# supports; naming the engine explicitly avoids the fallback ParserWarning.
df_funny_final = pd.read_csv("text_funny.txt", sep="\\n", names=['ND String'], encoding='utf8', engine='python')

# Classify each string once and reuse the prediction for both the label and
# the score column, instead of running the model twice per row.
funny_nd_pred = df_funny_final['ND String'].apply(lambda x: StringToScore_Ar(x[:1024])[0])
df_funny_final['ND Sentiment Label'] = funny_nd_pred.apply(lambda pred: pred['label'])
df_funny_final['ND Sentiment Score'] = funny_nd_pred.apply(lambda pred: pred['score'])

temp = pd.read_csv("predictions_d3_funny.txt", sep="\\n", names=['D String'], encoding='utf8', engine='python')
# Column assignment aligns on the row index, so both files are assumed to have
# the same number of lines -- TODO confirm for the prediction files.
df_funny_final['D String'] = temp['D String']
funny_d_pred = df_funny_final['D String'].apply(lambda x: StringToScore_Ar(x[:1024])[0])
df_funny_final['D Sentiment Label'] = funny_d_pred.apply(lambda pred: pred['label'])
df_funny_final['D Sentiment Score'] = funny_d_pred.apply(lambda pred: pred['score'])

In [None]:
# Display the funny-poetry comparison frame (ND vs. D strings with their labels and scores).
df_funny_final

In [None]:
# Build the sad-poetry comparison frame: one row per line of "text_sad.txt"
# ('ND String') next to the same-index line of "predictions_d3_sad.txt"
# ('D String'), each with the label and score returned by StringToScore_Ar.
#
# The separator is a regular expression, which only the python parsing engine
# supports; naming the engine explicitly avoids the fallback ParserWarning.
df_sad_final = pd.read_csv("text_sad.txt", sep="\\n", names=['ND String'], encoding='utf8', engine='python')

# Classify each string once and reuse the prediction for both the label and
# the score column, instead of running the model twice per row.
sad_nd_pred = df_sad_final['ND String'].apply(lambda x: StringToScore_Ar(x[:1024])[0])
df_sad_final['ND Sentiment Label'] = sad_nd_pred.apply(lambda pred: pred['label'])
df_sad_final['ND Sentiment Score'] = sad_nd_pred.apply(lambda pred: pred['score'])

temp = pd.read_csv("predictions_d3_sad.txt", sep="\\n", names=['D String'], encoding='utf8', engine='python')
# Column assignment aligns on the row index, so both files are assumed to have
# the same number of lines -- TODO confirm for the prediction files.
df_sad_final['D String'] = temp['D String']
sad_d_pred = df_sad_final['D String'].apply(lambda x: StringToScore_Ar(x[:1024])[0])
df_sad_final['D Sentiment Label'] = sad_d_pred.apply(lambda pred: pred['label'])
df_sad_final['D Sentiment Score'] = sad_d_pred.apply(lambda pred: pred['score'])

In [None]:
# Display the sad-poetry comparison frame (ND vs. D strings with their labels and scores).
df_sad_final

# Generate plots

In [None]:
# Draw the frame's plottable columns as separate subplots in a 6x6 inch
# figure, then (re)place the legend on the current axes.
df_funny_final.plot(subplots=True, figsize=(6, 6))
plt.legend(loc='best')