In [1]:
import pickle
import os

import pandas as pd

#### Loading Businesses and Reviews

Loading businesses and reviews for Kosovo, Tirana, Sarande, Vlore, Lezhe, Shengjin Counties.

In [2]:
def load_data(path):
    """Load every pickled business and review file found directly in ``path``.

    A file whose stem ends in ``_data`` (for example ``tirana_data.pkl``) is
    treated as business records; every other regular file is treated as
    review records.

    Parameters
    ----------
    path : str
        Directory containing the pickled files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_unstructured, df_reviews)``, each with a fresh ``0..n-1`` index.
        Either frame is empty when no matching file exists.
    """
    business_frames = []
    review_frames = []
    # Sorted so the row order of the result does not depend on the OS listing order.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        # Skip sub-directories (such as an output folder created inside the
        # datasets directory): opening them would raise and break a re-run.
        if not os.path.isfile(file_path):
            continue
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(file_path, 'rb') as handle:
            data = pd.DataFrame(pickle.load(handle))
        if filename.split('.')[0].split('_')[-1] == 'data':
            business_frames.append(data)
        else:
            review_frames.append(data)

    # Concatenate once at the end instead of growing a frame inside the loop.
    df_unstructured = pd.concat(business_frames) if business_frames else pd.DataFrame()
    df_reviews = pd.concat(review_frames) if review_frames else pd.DataFrame()
    return df_unstructured.reset_index(drop=True), df_reviews.reset_index(drop=True)

In [3]:
# Load the business and review frames from the datasets directory
# (the path is relative to the notebook's working directory).
df_unstructured, df_reviews = load_data('../datasets')

In [4]:
df_unstructured.head()

Unnamed: 0,business_id,business_name,categories,city,full_address,display_phone,review_count,stars,price_tag,is_claimed,is_closed,coordinates,image,url
0,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,La Pizza Nostra,[Pizza],Tirana County,"Rruga Perlat Rexhepi, Tirana Albania",+355 69 309 9999,15,5.0,$,True,False,"[41.318115, 19.817093]","data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...
1,360fcd61-8ed6-4e98-bd4d-9ab50fa8c290,Cioccolatitaliani Kalaja,[],Tirana County,"Rruga Murat Toptani, Tirana 1001 Albania",,12,3.5,$$$$,True,False,"[41.325882, 19.8225]","data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...
2,f9912145-1a1a-41fe-89c6-243e37825eb6,Baza Bar,[Fast Food],Tirana County,"Rruga Gjin Bue Shpata 10, Tirana 1001, Tirana ...",+355 69 725 4485,0,0.0,No price tag,False,True,"[41.320923, 19.811533]",,https://www.tripadvisor.com/Restaurant_Review-...
3,55a1094a-84fe-42de-b3a2-e5aed8a543ac,Bar Restorant Piceri Colombo,"[Italian, Mediterranean, European, Greek]",Tirana County,"Rr. Reshit Petrela, Ish Stacioni Trenit, 100m ...",+355 69 693 7666,5,3.0,No price tag,False,False,"[41.335743, 19.81529]","data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...
4,021324f7-7875-430f-9026-18ebaece43e0,Casa della Pasta,"[Italian, European, Albanian]",Tirana County,"Rruga Halim Xhelo 11, Tirana 1023 Albania",+355 68 908 3863,3,4.0,$,True,False,"[41.32788, 19.806349]","data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...


In [5]:
df_reviews.head()

Unnamed: 0,user_id,business_id,review_id,review_date,review_title,review_text,rating,votes
0,UID_64C07AC09E188BCE7A081939BFDAD66B-SRC_77836...,43cae344-9533-484c-bc4e-c99095cd2099,778365530,"December 3, 2020",,More,50,0
1,UID_0FE0E6284F6C99666368DE11341334C1-SRC_50595...,43cae344-9533-484c-bc4e-c99095cd2099,505951496,"July 26, 2017",,More,40,0
2,UID_42249550DB6692C78F9823BBE41F7D29-SRC_49966...,43cae344-9533-484c-bc4e-c99095cd2099,499660609,"July 8, 2017",,More,30,0
3,UID_53D877480BD8A0719BC4FEC97040DD59-SRC_79131...,8eeec4a0-4825-40ce-8169-f0a20c2d3410,791318013,"June 5, 2021",Excellent food at affordable price,The best restaurant in Shengjin. Fresh fish d...,50,1
4,UID_88FF89326964E5FA9090E8BD9D292620-SRC_15419...,8eeec4a0-4825-40ce-8169-f0a20c2d3410,154191986,"March 10, 2013",Place for fresh seafood,If you really want to have FRESH seafood this ...,30,1


In [6]:
df_unstructured.columns

Index(['business_id', 'business_name', 'categories', 'city', 'full_address',
       'display_phone', 'review_count', 'stars', 'price_tag', 'is_claimed',
       'is_closed', 'coordinates', 'image', 'url'],
      dtype='object')

Each encrypted business_id from the df_unstructured dataframe has corresponding entries in the df_reviews dataframe with the same business_id.

In [7]:
# Reviews of the first entry of the df unstructured dataframe

In [8]:
df_reviews[df_reviews['business_id'] == df_unstructured['business_id'][0]]

Unnamed: 0,user_id,business_id,review_id,review_date,review_title,review_text,rating,votes
215,UID_F1FBBF099CB5E257C5FFBFB2EFA69112-SRC_86958...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,869583586,"November 24, 2022",It’s wonderful!,This place is probably the best pizza place in...,50,
216,UID_9F59B845FA08C2835A51FB99F2174815-SRC_84672...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,846724478,"July 7, 2022",Recommend strong!!,The food was really good and with normal price...,50,
217,UID_300C91DAF0132F4BD631CB38AFED6A1A-SRC_84392...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,843928628,"June 21, 2022",A great place to share with family,"The manager and the personal speak albanian, I...",50,
218,UID_EAE1D79426F499C5098C91A21F8DAB9B-SRC_84188...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,841889382,"June 7, 2022",Best pizza in Tirana,John and his team were the most welcoming… we ...,50,
219,UID_8AF59B67BF1FC0017CDD1D76EDE70DDC-SRC_84188...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,841888398,"June 7, 2022",La pizza Nostra,"Our first time in Albania, John and his team w...",50,
220,UID_7AAA8B789D3FCE0C0CE4D94B8617833C-SRC_84071...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,840711157,"May 30, 2022",One of the best pizzas you’ll ever eat,The quality of the pizza…It was nothing short ...,50,
221,UID_D0BE0FBC7BC03C9E0E03DC1DDA88D16C-SRC_83571...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,835712730,"April 23, 2022",Simply the Best Pizza in Tirana!!,"First, La Pizza Nostra Restaurant is very clea...",50,
222,UID_B1CC77610B15AD8AA861D3AF815D8F1A-SRC_83465...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,834651099,"April 15, 2022",Very nice restaurant. The service was great.,Very tasty pizza. This is one of the best pizz...,50,
223,UID_7A0487278527BC71B4B9DDDB1D963802-SRC_83385...,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,833851385,"April 8, 2022",Best Pizza in Tirana,This is literally the best Pizzeria in Tirana....,50,1.0


In [9]:
df_unstructured['business_name'][0]

'La Pizza Nostra'

#### Cleaning Unstructured DataFrame

In [10]:
df_unstructured.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632 entries, 0 to 1631
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   business_id    1632 non-null   object
 1   business_name  1627 non-null   object
 2   categories     1632 non-null   object
 3   city           1632 non-null   object
 4   full_address   1627 non-null   object
 5   display_phone  1425 non-null   object
 6   review_count   1632 non-null   object
 7   stars          1632 non-null   object
 8   price_tag      1632 non-null   object
 9   is_claimed     1632 non-null   bool  
 10  is_closed      1632 non-null   bool  
 11  coordinates    1632 non-null   object
 12  image          1369 non-null   object
 13  url            1632 non-null   object
dtypes: bool(2), object(12)
memory usage: 156.3+ KB


The review_count column has object dtype and may contain numbers formatted as strings with thousands separators (e.g. '1,666'), so we need to convert it to float.

In [11]:
def replace_comma(x):
    """Convert a scraped review count such as ``'1,666'`` to a float.

    Parameters
    ----------
    x : str, int, float or None
        Raw review count. Strings may contain thousands separators.

    Returns
    -------
    float
        The numeric count; ``0.0`` for missing values (``None``, NaN or a
        blank string).

    Raises
    ------
    ValueError
        If a non-blank string cannot be parsed as a number.
    """
    # Missing values count as zero reviews (NaN is the only float unequal to itself).
    if x is None or (isinstance(x, float) and x != x):
        return 0.0
    # Already-numeric input has no separators to strip; `',' in x` would raise on it.
    if not isinstance(x, str):
        return float(x)
    cleaned = x.replace(',', '').strip()
    return float(cleaned) if cleaned else 0.0
    
# Pass the converter directly; wrapping it in a lambda adds nothing.
df_unstructured['review_count'] = df_unstructured['review_count'].apply(replace_comma)

In [12]:
# The stars column is loaded with object dtype; cast every entry to float.
df_unstructured['stars'] = df_unstructured['stars'].apply(float)

The price_tag column is an object column made of '$' symbols, or 'No price tag' when no price is given. We need to map it to numeric values.

In [13]:
df_unstructured['price_tag'].value_counts()

No price tag    703
$$ - $$$        501
$               366
$$$$             62
Name: price_tag, dtype: int64

In [14]:
def process_price_tag(x):
    """Map a scraped price tag to a numeric price level.

    Parameters
    ----------
    x : str or None
        Raw price tag: ``'$'``, ``'$$ - $$$'``, ``'$$$$'`` or ``'No price tag'``.

    Returns
    -------
    float
        ``1.0`` for ``'$'``, ``'No price tag'`` and ``None``; ``2.5`` for
        ``'$$ - $$$'``; ``4.0`` for ``'$$$$'``; NaN for any unrecognised value.
    """
    price_levels = {
        '$': 1.0,
        # A missing price tag is treated as the cheapest level.
        'No price tag': 1.0,
        '$$ - $$$': 2.5,
        '$$$$': 4.0,
    }
    if x is None:
        return 1.0
    if isinstance(x, str):
        # Always return a float: NaN marks an unknown tag explicitly instead of
        # silently falling off the end of the function with None.
        return price_levels.get(x.strip(), float('nan'))
    return float('nan')
    
# Pass the mapper directly; wrapping it in a lambda adds nothing.
df_unstructured['price_tag'] = df_unstructured['price_tag'].apply(process_price_tag)

The coordinates column appears to hold a list of [latitude, longitude]. We need to split it into two separate columns.

In [15]:
# Split each coordinates entry into one column per component:
# element 0 becomes latitude, element 1 becomes longitude.
for position, axis_name in enumerate(('latitude', 'longitude')):
    df_unstructured[axis_name] = df_unstructured['coordinates'].apply(lambda pair: pair[position])

In [16]:
# The list column is redundant once latitude/longitude exist; remove it in place.
df_unstructured.drop(columns=['coordinates'], inplace=True)

In [17]:
df_unstructured.isnull().sum()

business_id        0
business_name      5
categories         0
city               0
full_address       5
display_phone    207
review_count       0
stars              0
price_tag          0
is_claimed         0
is_closed          0
image            263
url                0
latitude           0
longitude          0
dtype: int64

In [18]:
def check_null(df):
    """Report the percentage of missing values per column, smallest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with columns ``column_name`` and
        ``percent_missing``, sorted ascending by ``percent_missing``.
    """
    null_counts = df.isnull().sum()
    row_total = len(df)
    summary = pd.DataFrame({
        'column_name': df.columns,
        'percent_missing': null_counts * 100 / row_total,
    })
    return summary.sort_values('percent_missing')

In [19]:
check_null(df_unstructured)

Unnamed: 0,column_name,percent_missing
business_id,business_id,0.0
categories,categories,0.0
city,city,0.0
review_count,review_count,0.0
stars,stars,0.0
price_tag,price_tag,0.0
is_claimed,is_claimed,0.0
is_closed,is_closed,0.0
url,url,0.0
latitude,latitude,0.0


We can see that the columns with the most missing values are image and display_phone. However, we don't need to drop these rows, since image and display_phone are not important columns.

Moreover, a small number of businesses are missing business_name and full_address. We will drop these rows.

In [20]:
# Remove, in place, every row lacking a business name or a full address.
df_unstructured.dropna(axis=0, how='any', subset=['business_name', 'full_address'], inplace=True)

In [21]:
df_unstructured.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1627 entries, 0 to 1631
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   business_id    1627 non-null   object 
 1   business_name  1627 non-null   object 
 2   categories     1627 non-null   object 
 3   city           1627 non-null   object 
 4   full_address   1627 non-null   object 
 5   display_phone  1425 non-null   object 
 6   review_count   1627 non-null   float64
 7   stars          1627 non-null   float64
 8   price_tag      1627 non-null   float64
 9   is_claimed     1627 non-null   bool   
 10  is_closed      1627 non-null   bool   
 11  image          1365 non-null   object 
 12  url            1627 non-null   object 
 13  latitude       1627 non-null   float64
 14  longitude      1627 non-null   float64
dtypes: bool(2), float64(5), object(8)
memory usage: 181.1+ KB


In [22]:
# NOTE: plain assignment binds a second name to the same DataFrame object (no copy
# is made), so in-place changes made through either name are visible through both.
df_cleaned = df_unstructured

#### Cleaning Reviews DataFrame

In [23]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8403 entries, 0 to 8402
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       8401 non-null   object
 1   business_id   8403 non-null   object
 2   review_id     8403 non-null   object
 3   review_date   8403 non-null   object
 4   review_title  8403 non-null   object
 5   review_text   8403 non-null   object
 6   rating        8403 non-null   object
 7   votes         8403 non-null   object
dtypes: object(8)
memory usage: 525.3+ KB


In [24]:
# First, parse the review_date column with pd.to_datetime and store the result back.
df_reviews['review_date'] = pd.to_datetime(df_reviews['review_date'])

In [25]:
# Let's check for null values
check_null(df_reviews)

Unnamed: 0,column_name,percent_missing
business_id,business_id,0.0
review_id,review_id,0.0
review_date,review_date,0.0
review_title,review_title,0.0
review_text,review_text,0.0
rating,rating,0.0
votes,votes,0.0
user_id,user_id,0.023801


We can see that only a very small portion of user_id values are null, so we do not need to drop them.

Let's strip surrounding whitespace from the review text, review title, and votes.

In [26]:
# Trim leading/trailing whitespace from each of the three columns in turn.
for column in ('review_text', 'review_title', 'votes'):
    df_reviews[column] = df_reviews[column].apply(lambda value: value.strip())

We also need to normalize ratings to the 1-5 scale (the raw values are stored multiplied by 10, e.g. 50 becomes 5.0).

In [27]:
# Cast the raw ratings to float and divide by ten in a single assignment.
df_reviews['rating'] = df_reviews['rating'].apply(float) / 10

Let's turn empty vote strings into 0.

In [28]:
df_reviews['votes'].value_counts()

      6073
1     1495
2      501
3      140
0      113
4       50
5       17
6        4
20       2
12       2
7        2
11       1
16       1
10       1
15       1
Name: votes, dtype: int64

In [29]:
def process_votes(x):
    """Convert a raw vote count to float, treating any falsy value as zero.

    Parameters
    ----------
    x : object
        Raw vote value; an empty string, ``None`` or ``0`` yields ``0.0``.

    Returns
    -------
    float
        ``float(x)`` for truthy input, otherwise ``0.0``.
    """
    return float(x) if x else 0.0
    
# Pass the converter directly; wrapping it in a lambda adds nothing.
df_reviews['votes'] = df_reviews['votes'].apply(process_votes)

In [30]:
df_reviews['votes'].value_counts()

0.0     6186
1.0     1495
2.0      501
3.0      140
4.0       50
5.0       17
6.0        4
20.0       2
12.0       2
7.0        2
11.0       1
16.0       1
10.0       1
15.0       1
Name: votes, dtype: int64

In [31]:
# Substitute readable placeholders: empty titles become 'No title'; empty texts
# and the literal 'More' become 'No text'.
df_reviews['review_title'] = df_reviews['review_title'].map(
    lambda title: 'No title' if title == '' else title
)
df_reviews['review_text'] = df_reviews['review_text'].map(
    lambda body: 'No text' if body in ('', 'More') else body
)

In [32]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8403 entries, 0 to 8402
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   user_id       8401 non-null   object        
 1   business_id   8403 non-null   object        
 2   review_id     8403 non-null   object        
 3   review_date   8403 non-null   datetime64[ns]
 4   review_title  8403 non-null   object        
 5   review_text   8403 non-null   object        
 6   rating        8403 non-null   float64       
 7   votes         8403 non-null   float64       
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 525.3+ KB


#### Saving cleaned data

Saving the cleaned business data and the cleaned reviews to
```
datasets\unprocessed_clean
```

In [33]:
df_cleaned['categories'].value_counts()

[European, Albanian]                               168
[]                                                 142
[Italian, European, Albanian]                       52
[Italian, Seafood, Mediterranean]                   44
[Italian]                                           42
                                                  ... 
[Italian, French, European, Turkish]                 1
[Italian, French, International, Mediterranean]      1
[Pizza, European, Grill]                             1
[Fast Food, Middle Eastern]                          1
[Seafood, Mediterranean, European, Pub]              1
Name: categories, Length: 575, dtype: int64

In [36]:
def save(path, businesses=None, reviews=None):
    """Pickle the cleaned business and review frames into ``path``.

    Parameters
    ----------
    path : str
        Output directory; created (including parents) if it does not exist.
    businesses : pandas.DataFrame, optional
        Frame written to ``businesses_data_cleaned.pkl``. Defaults to the
        notebook-level ``df_cleaned``.
    reviews : pandas.DataFrame, optional
        Frame written to ``businesses_reviews_cleaned.pkl``. Defaults to the
        notebook-level ``df_reviews``.
    """
    # Fall back to the notebook globals so the original `save(path)` call still works.
    if businesses is None:
        businesses = df_cleaned
    if reviews is None:
        reviews = df_reviews
    os.makedirs(path, exist_ok=True)
    businesses.to_pickle(os.path.join(path, "businesses_data_cleaned.pkl"))
    reviews.to_pickle(os.path.join(path, "businesses_reviews_cleaned.pkl"))
    
# Write both cleaned frames to the output directory (created if it is missing).
save('../datasets/unprocessed_clean')