# Dependency Handling

In [1]:
# Install pytorch

!pip3 install torch torchvision torchaudio



In [2]:
# Install Transformers (using for NLP model - allows for easily import model)
# Install Requests (allow requests to webpages for analysis)
# Install BeautifulSoup (extract relevant data from webpage)
# Install Pandas and Numpy (structure data)

!pip3 install requests transformers beautifulsoup4 pandas numpy



In [3]:
# AutoTokenizer - converts a string into the numeric token IDs the model consumes.
# AutoModelForSequenceClassification - transformers class used to load the NLP classification model.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd

# Setup Model (Initialization)

In [4]:
# Checkpoint identifier shared by the tokenizer and the model, kept in one
# place so the two can never be loaded from different checkpoints by accident.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model loaded from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Encoding and Sentiment Calculation

In [5]:
# Encode a sample sentence as a PyTorch tensor of token IDs for a quick check.
sample_review = 'I loved taking 482X with Dr. Duwe!'
tokens = tokenizer.encode(sample_review, return_tensors='pt')

In [6]:
# Debug

# tokens
# tokens[0]
# tokenizer.decode(tokens[0])

In [7]:
# Run the sample tokens through the model; the returned object exposes the
# per-class scores as `.logits`.
result = model(tokens)

In [8]:
# Display the raw model output to inspect the logits.
result

SequenceClassifierOutput(loss=None, logits=tensor([[-2.1400, -2.0940, -0.7388,  1.3721,  2.8214]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [9]:
# The index of the largest logit is zero-based, so add 1 to get the star rating.
best_class = torch.argmax(result.logits)
star_rating = best_class.item() + 1
print('Star Rating: ', star_rating)

Star Rating:  5


# Parse Website

In [10]:
# Browser-style User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'}
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
req = requests.get('https://www.niche.com/colleges/iowa-state-university/reviews/', headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
# into an empty list of reviews.
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')
# Select the <div> elements whose itemprop attribute matches 'reviewBody'.
regex = re.compile('reviewBody')
result_website = soup.find_all('div', {'itemprop':regex})
# Keep only the text content of each matched element.
reviews = [review_div.text for review_div in result_website]

# Debug
#reviews

# Load Website Data into Dataframe and Get Star Rating

In [11]:
# Put the scraped review texts into a single-column DataFrame named 'string'.
review_texts = np.array(reviews)
df = pd.DataFrame(data=review_texts, columns=['string'])

# Debug
#soup
#result_website
#reviews
#df['string'].iloc[1]

In [12]:
def StringToScore(string, max_length=512):
    """Return the star rating the model predicts for a piece of text.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        string: Text to score.
        max_length: Maximum number of tokens passed to the model; longer
            input is truncated by the tokenizer. The default of 512 is the
            usual BERT sequence limit (confirm against the checkpoint config
            if a different model is loaded).

    Returns:
        int: One plus the index of the largest logit.
    """
    # Truncate at the token level so over-long text cannot exceed the
    # sequence length the model accepts.
    tokens = tokenizer.encode(
        string,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(tokens)
    # argmax is zero-based; ratings start at 1.
    return int(torch.argmax(output.logits)) + 1

In [13]:
# Debug: score the first scraped review as a quick sanity check.
StringToScore(df['string'].iloc[0])

5

In [14]:
# Score every review, passing at most the first 1024 characters of each one
# (presumably to keep the input within the model's length limit).
MAX_REVIEW_CHARS = 1024
df['sentiment'] = df['string'].apply(
    lambda review: StringToScore(review[:MAX_REVIEW_CHARS])
)

In [15]:
# Show each review next to its predicted rating.
df

Unnamed: 0,string,sentiment
0,I am only a high school freshman but I am look...,5
1,"Great campus! Helpful professors, and most imp...",5
2,Iowa state doesn’t care about its students. Fo...,1
3,There are numerous things to love about Iowa S...,5
4,Iowa State University is a great community tha...,4
5,"It is a great school with great education, but...",3
6,"It is a good value school, compared with other...",4
7,Teachers are a bit harsh and don't seem to car...,4
8,Terrible university in everything. I did not g...,1
9,Overall Iowa State has been a positive experie...,4


In [16]:
# Mean of the predicted ratings across all scored reviews.
average_sentiment = df['sentiment'].mean()
print("Average Sentiment: ", average_sentiment)

Average Sentiment:  4.15
