# Generate Sentiment Scores for Subsetting Data

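This notebook scores each review in the raw data with a pretrained sentiment classifier (`nlptown/bert-base-multilingual-uncased-sentiment`), giving a rating from 1 to 5. These scores are used to subset the reviews so that less data has to be labelled manually.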


In [None]:
# Geography to process; interpolated into the raw reviews CSV path
# (f"../../data/raw/{GEO}_reviews.csv") when the data is loaded.
GEO = "texas"

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# One checkpoint name shared by the tokenizer and the classifier so the two
# can never drift apart.
SENTIMENT_CHECKPOINT = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import pandas as pd

# Read the raw reviews for the configured geography and preview the first rows.
raw_reviews_path = f"../../data/raw/{GEO}_reviews.csv"
reviews = pd.read_csv(raw_reviews_path)
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2265,963,2009-03-17,7538,Niall,I stayed here during SXSW and had a really ple...
1,2265,1057,2009-03-22,10029,Michael,"Great place, close enough to everything downto..."
2,2265,200418,2011-03-16,61677,Gustaf,We had a great time in Austin staying at Paddy...
3,2265,1001630,2012-03-15,1523753,Noah,We had a great stay at Zen East for South By S...
4,2265,1016390,2012-03-19,1547660,Melissa,I arrived late in the evening so did not meet ...


In [6]:
def get_sentiment(row, segment_chars=512, verbose=True):
    """Score one review row with the sentiment classifier.

    The comment is cut into consecutive pieces of at most ``segment_chars``
    characters, each piece is classified on its own, and the per-piece
    ratings (argmax class index + 1) are averaged.

    Parameters
    ----------
    row : pandas.Series
        A row with a ``'comments'`` entry. ``row.name`` is used in progress
        and error messages.
    segment_chars : int, default 512
        Maximum number of characters per piece sent to the model.
    verbose : bool, default True
        Print ``row.name`` before scoring, as a progress indicator.

    Returns
    -------
    float or None
        Mean rating over all pieces. ``None`` when the comment is not a
        string (e.g. a missing value) or when scoring the row fails.
    """
    comment = row['comments']
    if verbose:
        print(row.name)

    # Missing comments (NaN from read_csv) cannot be scored; skip them
    # explicitly instead of relying on len() raising inside a catch-all.
    if not isinstance(comment, str):
        return None

    # An empty comment yields no slices, so fall back to a single empty piece.
    segments = [
        comment[start:start + segment_chars]
        for start in range(0, len(comment), segment_chars)
    ] or [comment]

    try:
        preds = []
        # Inference only: skip autograd bookkeeping to save time and memory.
        with torch.no_grad():
            for segment in segments:
                # The split above is by characters, but the model limit is in
                # tokens (512 for BERT-base, including the special tokens), so
                # truncate to keep a dense segment from overflowing the model.
                tokens = tokenizer.encode(
                    segment,
                    return_tensors='pt',
                    truncation=True,
                    max_length=512,
                )
                result = model(tokens)
                preds.append(int(torch.argmax(result.logits)) + 1)
        return sum(preds) / len(preds)
    except Exception as exc:
        # Best-effort scoring: one bad row must not abort a long run, but say
        # which row failed and why. KeyboardInterrupt is no longer swallowed.
        print(f"error: could not score row {row.name}: {exc!r}")
        return None


# NOTE(review): only the first 10 rows are scored. Assigning the result back
# aligns on the index, so every other row gets NaN in 'sentiment', yet the
# whole frame is still written out. Drop .head(10) to score the full dataset.
sentiment_score = reviews.head(10).apply(get_sentiment, axis=1)
reviews['sentiment'] = sentiment_score

# Build the output name from GEO so it stays in step with the input path
# instead of being pinned to one geography.
reviews.to_csv(f"../../data/sentiment/{GEO}_w_sentiment.csv", index=False)
reviews.describe()

0
1
2
3
4
5
6
7
8
9


Unnamed: 0,listing_id,id,reviewer_id,sentiment
count,332098.0,332098.0,332098.0,10.0
mean,12759560.0,250425300.0,73304800.0,4.6
std,9806012.0,143481400.0,68746640.0,0.875595
min,2265.0,865.0,3.0,2.5
25%,3403516.0,130573900.0,17071550.0,5.0
50%,12260870.0,251784600.0,50448840.0,5.0
75%,20755990.0,365817300.0,115569500.0,5.0
max,36514850.0,485948600.0,274857500.0,5.0
