# Dependency Handling

In [1]:
# Install pytorch

!pip3 install torch torchvision torchaudio
#!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113



In [2]:
# Install Transformers (using for NLP model - allows for easily import model)
# Install Requests (allow requests to webpages for analysis)
# Install BeautifulSoup (extract relevant data from webpage)
# Install Pandas and Numpy (structure data)

!pip3 install requests transformers beautifulsoup4 pandas numpy



In [3]:
# AutoTokenizer - converts a string into the numeric token IDs the model takes as input.
# AutoModelForSequenceClassification - transformers class used to load the pretrained NLP classification model.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Setup Model (Initialization)

In [4]:
# Checkpoint id used for both the tokenizer and the classification model.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Encoding and Sentiment Calculation

In [17]:
# Sample sentences to encode; each one becomes a PyTorch tensor of token IDs.
sample_texts = (
    'I am having a great day',
    'خوش',  # Means "happy"
    'اِک شخص بددُعا سی کر گیا مُجھ پر, وُہ عہدِ ترقِ وفَاسِی کرگیا مُجھ پر',  # Negative sentence
)
tokens0, tokens1, tokens2 = [
    tokenizer.encode(text, return_tensors='pt') for text in sample_texts
]


In [18]:
# Debug

# tokens0
# tokens0[0]
# tokenizer.decode(tokens0[0])

In [19]:
# One forward pass per encoded sample, in order; each result exposes the
# class scores through its `.logits` attribute.
result0, result1, result2 = [
    model(encoded) for encoded in (tokens0, tokens1, tokens2)
]

In [20]:
# Dump the raw model outputs, one per line.
print(result0, result1, result2, sep='\n')

SequenceClassifierOutput(loss=None, logits=tensor([[-2.1343, -1.9140, -0.0362,  1.5475,  1.9454]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[-0.0923, -0.5614, -0.0990, -0.1329,  0.6934]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[-1.4725, -1.1947,  0.2650,  1.0080,  1.1522]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [21]:
# The position of the largest logit is zero-based, so add one to express it
# as a star rating.
star_rating0 = result0.logits.argmax().item() + 1
star_rating1 = result1.logits.argmax().item() + 1
star_rating2 = result2.logits.argmax().item() + 1

# NOTE(review): in the recorded run the sample labelled "Negative sentence"
# (result2) also scores 5 - presumably the checkpoint does not handle this
# language well; verify against the model card before trusting these scores.
print('Star Rating (result0): ', star_rating0)
print('Star Rating (result1): ', star_rating1)
print('Star Rating (result2): ', star_rating2)


Star Rating (result0):  5
Star Rating (result1):  5
Star Rating (result2):  5


#  Parse Website

In [10]:
# Send a browser-like User-Agent with the request.
# NOTE(review): presumably the site rejects the default python-requests agent - confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'}
# Without a timeout, requests waits indefinitely on an unresponsive server.
req = requests.get('https://www.niche.com/colleges/iowa-state-university/reviews/', headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page into zero reviews,
# which would otherwise surface later as an empty frame and a NaN average.
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')
# Select every <div> whose `itemprop` attribute matches 'reviewBody'.
regex = re.compile('reviewBody')
result_website = soup.find_all('div', {'itemprop': regex})
# Keep only the text content of each matched element.
reviews = [i.text for i in result_website]

# Debug
#reviews

# Load Website Data into Dataframe and Get Star Rating

In [11]:
# One row per scraped review, stored in the 'string' column. Building the
# frame straight from the list avoids a needless NumPy round-trip (which also
# turns an empty review list into a float64 column).
df = pd.DataFrame({'string': reviews})

# Debug
#soup
#result_website
#reviews
#df['string'].iloc[1]

In [12]:
def StringToScore(string, max_length=512):
    """Return the star rating the sentiment model predicts for a text.

    Args:
        string: Text to score.
        max_length: Maximum number of tokens handed to the model; longer
            inputs are truncated by the tokenizer. 512 is the usual
            BERT-base limit - confirm against
            ``model.config.max_position_embeddings``.

    Returns:
        int: Index of the largest logit plus one.
    """
    # Truncate at the token level: slicing characters in the caller does not
    # guarantee the encoded sequence fits the model's input size.
    tokens = tokenizer.encode(
        string,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(tokens)
    return int(torch.argmax(output.logits)) + 1
    

In [13]:
# Debug: score the first review on its own as a sanity check.
first_review = df['string'].iloc[0]
StringToScore(first_review)

5

In [14]:
# Only the first MAX_REVIEW_CHARS characters of each review are scored.
MAX_REVIEW_CHARS = 1024
df['sentiment'] = df['string'].map(lambda review: StringToScore(review[:MAX_REVIEW_CHARS]))

In [15]:
# Show each review next to its predicted sentiment score.
df

Unnamed: 0,string,sentiment
0,"Iowa State not only has a beautiful campus, bu...",5
1,Can't wait to start! I am going to be studying...,5
2,Very accepting and friendly environment. Lots ...,5
3,I visited the college for an engineering schol...,5
4,I love Iowa State. It has a big campus feel wi...,5
5,I have loved my experience here at Iowa state ...,5
6,It seems like a good school i wish i could go ...,4
7,"Overall, a good experience. Lots of clubs, org...",4
8,coming to iowa state has been the best decisio...,5
9,The campus is the perfect size. It is really e...,4


In [16]:
# Mean of the predicted scores across all scored reviews.
average_sentiment = df['sentiment'].mean()
print("Average Sentiment: ", average_sentiment)

Average Sentiment:  4.45
