# How to load data, and loading times

In [3]:
import pandas as pd
import time
pd. __version__

'1.5.3'

## Load listings
For the listings, we can open the xlsx in Excel and convert the "listings" worksheet to a csv so it can then be loaded into Pandas no problem.

In [4]:

print("Loading listings...")

start = time.time()

print("Texas...")
texas_listings = pd.read_csv("../data/raw/texas_listings.csv", encoding="unicode_escape", low_memory=False)
print("Elapsed time: ", time.time() - start)

print("Florida...")
florida_listings = pd.read_csv("../data/raw/florida_listings.csv", encoding="unicode_escape", low_memory=False)
print("Elapsed time: ", time.time() - start)

print("California...")
california_listings = pd.read_csv("../data/raw/california_listings.csv", encoding="unicode_escape", low_memory=False)
print("Elapsed time: ", time.time() - start)

print("\nShapes of data:")
print("Texas: ", texas_listings.shape)
print("Florida: ", florida_listings.shape)
print("California: ", california_listings.shape)

Loading listings...
Texas...
Elapsed time:  1.2780261039733887
Florida...
Elapsed time:  2.385867118835449
California...
Elapsed time:  5.587490558624268

Shapes of data:
Texas:  (11882, 106)
Florida:  (10229, 106)
California:  (366643, 6)


## Load and Process Reviews

The reviews have newlines, tabs, and returns in the text fields, which corrupts the CSV format if you export directly from excel. Therefore, we need to clean the data before we can load it. We'll then export the results to a csv so we don't need to do this every time.

In [5]:
import pandas as pd

def clean_comment(comment: str) -> str:
    """Clean comment by removing line endings.
    Args:
        comment (str): The comment to clean.

    Returns:
        str: The cleaned comment.
    """

    if not isinstance(comment, str):
        return ""

    return comment.replace("\r", " ").replace("\n", " ").replace("\t", " ").replace("_x000D_", " ").strip()

def process_reviews(fp, target_fp, target_worksheet="Reviews"):
    """Process reviews from excel file and write to csv file for later, faster loading.

    Args:
        fp (str): The fp to the xlsx.
        target_fp (str): Target csv file path.
        target_worksheet (str, optional): The worksheet name to read from. Defaults to "Reviews".
    """

    print("Reading", fp)
    reviews = pd.read_excel(fp, sheet_name=target_worksheet)

    print("Removing line endings")
    reviews.comments = reviews.comments.apply(clean_comment)

    print("Writing to", target_fp)
    with open(target_fp, "w", encoding="utf-8") as f:
        f.write(reviews.to_csv(index=False, lineterminator='\r\n'))

In [6]:
process_reviews("../data/raw/TX Austin - All Data.xlsx", "../data/raw/texas_reviews.csv")

Reading ../data/raw/TX Austin - All Data.xlsx
Removing line endings
Writing to ../data/raw/texas_reviews.csv


In [7]:
process_reviews("../data/raw/CA San Francisco - All Data.xlsx", "../data/raw/california_reviews.csv")

Reading ../data/raw/CA San Francisco - All Data.xlsx
Removing line endings
Writing to ../data/raw/california_reviews.csv


In [8]:
process_reviews("../data/raw/FL Fort Lauderdale - All Data.xlsx", "../data/raw/florida_reviews.csv", target_worksheet="reviews")

Reading ../data/raw/FL Fort Lauderdale - All Data.xlsx
Removing line endings
Writing to ../data/raw/florida_reviews.csv


Let's check that the new CSVs are valid by loading them into Pandas and check the runtimes.

In [9]:
now = time.time()

print("Loading reviews...")
print("Texas...")
texas_reviews = pd.read_csv("../data/raw/texas_reviews.csv")
print("Elapsed time: ", time.time() - now)

print("Florida...")
florida_reviews = pd.read_csv("../data/raw/florida_reviews.csv")
print("Elapsed time: ", time.time() - now)

print("California...")
california_reviews = pd.read_csv("../data/raw/california_reviews.csv")
print("Elapsed time: ", time.time() - now)

print("\nShapes of data:")
print("Texas: ", texas_reviews.shape)
print("Florida: ", florida_reviews.shape)
print("California: ", california_reviews.shape)

Loading reviews...
Texas...
Elapsed time:  2.7001705169677734
Florida...
Elapsed time:  4.2742979526519775
California...
Elapsed time:  7.649448394775391

Shapes of data:
Texas:  (332098, 6)
Florida:  (195857, 6)
California:  (366643, 6)


In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [22]:
texas_reviews['sentiment'] = 0
texas_reviews.head()
print(texas_reviews

332098


In [32]:
texas_reviews[1:400].sentiment.describe()

count    399.000000
mean       4.661236
std        0.662075
min        1.000000
25%        4.416667
50%        5.000000
75%        5.000000
max        5.000000
Name: sentiment, dtype: float64

In [44]:
def get_sentiment(row):
    comment = row['comments']
    print(row.name)
    if len(comment) > 512:
        segments = [comment[i:i + 512] for i in range(0,len(comment),512)]
    else:
        segments = [comment]
    preds = []
    for segment in segments:
        tokens = tokenizer.encode(segment, return_tensors='pt')
        result = model(tokens)
        pred = int(torch.argmax(result.logits))+1
        preds.append(pred)
    
    return sum(preds)/len(preds)



In [45]:
texas_reviews = texas_reviews.apply(get_sentiment,axis = 1)
texas_reviews.to_csv("texas_w_sentiment.csv",index = False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79


KeyboardInterrupt: 

In [40]:
print(texas_reviews.head())

   listing_id       id        date  reviewer_id reviewer_name  \
0        2265      963  2009-03-17         7538         Niall   
1        2265     1057  2009-03-22        10029       Michael   
2        2265   200418  2011-03-16        61677        Gustaf   
3        2265  1001630  2012-03-15      1523753          Noah   
4        2265  1016390  2012-03-19      1547660       Melissa   

                                            comments  sentiment  
0  I stayed here during SXSW and had a really ple...        5.0  
1  Great place, close enough to everything downto...        5.0  
2  We had a great time in Austin staying at Paddy...        5.0  
3  We had a great stay at Zen East for South By S...        5.0  
4  I arrived late in the evening so did not meet ...        5.0  
