# Loading data

In [110]:
import pandas as pd

df = pd.read_csv('../data/raw/reviews.csv')

In [111]:
df

Unnamed: 0,listing_id,id,date,reviewer_id,comments,numerical_review
0,456333,3101885,2012-12-17,3854237,"Excelente atención, limpia y cómoda habitación...",2
1,490517,683851145362096337,2022-08-01,328362123,Very good place! The host is very friendly and...,5
2,534550,87124297,2016-07-18,12613393,This apartment was in ideal location just a fe...,5
3,534550,466483244276311483,2021-10-05,420785919,I was very surprised at the level of customer ...,5
4,534550,635337027310848865,2022-05-26,8179050,Exceptional reception! Renting this place will...,4
...,...,...,...,...,...,...
805,1286950150256074014,1344908890699583927,2025-01-29,103327179,A estadia foi agradável e dentro do esperado. ...,2
806,1286950150256074014,1368045535342014121,2025-03-02,90926799,"Great location and amenities, very nice host, ...",5
807,1299188100593583919,1368809053051282347,2025-03-03,398440658,"Logement au top et très bien situé, proche des...",4
808,1295823924796594577,1326079104224383525,2025-01-03,656893331,"Świetna lokalizacja, dobry kontakt z gospodarz...",2


# Removing unnecessary columns

In [112]:
columns_to_drop = [
    'listing_id',
    'id',
    'date',
    'reviewer_id'
]

df = df.drop(labels=columns_to_drop, axis=1)

In [113]:
df

Unnamed: 0,comments,numerical_review
0,"Excelente atención, limpia y cómoda habitación...",2
1,Very good place! The host is very friendly and...,5
2,This apartment was in ideal location just a fe...,5
3,I was very surprised at the level of customer ...,5
4,Exceptional reception! Renting this place will...,4
...,...,...
805,A estadia foi agradável e dentro do esperado. ...,2
806,"Great location and amenities, very nice host, ...",5
807,"Logement au top et très bien situé, proche des...",4
808,"Świetna lokalizacja, dobry kontakt z gospodarz...",2


# Translating reviews to english

In [114]:
df['comments_en'] = df['comments'].apply(lambda x: GoogleTranslator(source='auto', target='en').translate(x))
df['comments_en'] = df['comments_en'].fillna(df['comments'])

In [115]:
df

Unnamed: 0,comments,numerical_review,comments_en
0,"Excelente atención, limpia y cómoda habitación...",2,"Excellent attention, clean and comfortable roo..."
1,Very good place! The host is very friendly and...,5,Very good place! The host is very friendly and...
2,This apartment was in ideal location just a fe...,5,This apartment was in ideal location just a fe...
3,I was very surprised at the level of customer ...,5,I was very surprised at the level of customer ...
4,Exceptional reception! Renting this place will...,4,Exceptional reception! Renting this place will...
...,...,...,...
805,A estadia foi agradável e dentro do esperado. ...,2,The stay was pleasant and inside the expected....
806,"Great location and amenities, very nice host, ...",5,"Great location and amenities, very nice host, ..."
807,"Logement au top et très bien situé, proche des...",4,"Housing at the top and very well located, clos..."
808,"Świetna lokalizacja, dobry kontakt z gospodarz...",2,"Great location, good contact with the host, on..."


# Cleaning comments columns

## Getting all unique characters from the comments column

In [116]:
def get_unique_chars(df, column):
    all_comments = ' '.join(df[column].astype(str))
    unique_chars = set(all_comments)
    return sorted(unique_chars)

In [117]:
print(get_unique_chars(df, 'comments_en'))

['\r', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '²', 'à', 'á', 'ç', 'è', 'é', 'í', 'ñ', 'ò', 'ó', 'ô', 'ü', 'Š', 'А', 'Б', 'В', 'Д', 'К', 'М', 'О', 'Р', 'С', 'Э', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ы', 'ь', 'э', 'ю', 'я', '\u200b', '–', '—', '’', '“', '”', '…', '€', '☀', '☺', '❤', 'ㅇ', 'ㅠ', '️', '👌', '👍', '😊', '😍']


## Cleaning

In [118]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""

    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'\\[rn]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'☀', '', text)
    text = re.sub(r'☺', '', text)
    text = re.sub(r'❤', '', text)
    text = re.sub(r'ㅇ', '', text)
    text = re.sub(r'ㅠ', '', text)
    text = re.sub(r'👌', '', text)
    text = re.sub(r'👍', '', text)
    text = re.sub(r'😊', '', text)
    text = re.sub(r'😍', '', text)

    return text.strip()

In [119]:
df['comments_clean'] = df['comments_en'].apply(clean_text)

In [120]:
print(get_unique_chars(df, 'comments_clean'))

[' ', '!', '"', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '²', 'à', 'á', 'ç', 'è', 'é', 'í', 'ñ', 'ò', 'ó', 'ô', 'ü', 'Š', 'А', 'Б', 'В', 'Д', 'К', 'М', 'О', 'Р', 'С', 'Э', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ы', 'ь', 'э', 'ю', 'я', '\u200b', '–', '—', '’', '“', '”', '…', '€', '️']


In [121]:
df

Unnamed: 0,comments,numerical_review,comments_en,comments_clean
0,"Excelente atención, limpia y cómoda habitación...",2,"Excellent attention, clean and comfortable roo...","Excellent attention, clean and comfortable roo..."
1,Very good place! The host is very friendly and...,5,Very good place! The host is very friendly and...,Very good place! The host is very friendly and...
2,This apartment was in ideal location just a fe...,5,This apartment was in ideal location just a fe...,This apartment was in ideal location just a fe...
3,I was very surprised at the level of customer ...,5,I was very surprised at the level of customer ...,I was very surprised at the level of customer ...
4,Exceptional reception! Renting this place will...,4,Exceptional reception! Renting this place will...,Exceptional reception! Renting this place will...
...,...,...,...,...
805,A estadia foi agradável e dentro do esperado. ...,2,The stay was pleasant and inside the expected....,The stay was pleasant and inside the expected....
806,"Great location and amenities, very nice host, ...",5,"Great location and amenities, very nice host, ...","Great location and amenities, very nice host, ..."
807,"Logement au top et très bien situé, proche des...",4,"Housing at the top and very well located, clos...","Housing at the top and very well located, clos..."
808,"Świetna lokalizacja, dobry kontakt z gospodarz...",2,"Great location, good contact with the host, on...","Great location, good contact with the host, on..."


# Adding sentiment column

In [122]:
from textblob import TextBlob

df['sentiment'] = df['comments_en'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [123]:
df

Unnamed: 0,comments,numerical_review,comments_en,comments_clean,sentiment
0,"Excelente atención, limpia y cómoda habitación...",2,"Excellent attention, clean and comfortable roo...","Excellent attention, clean and comfortable roo...",0.481667
1,Very good place! The host is very friendly and...,5,Very good place! The host is very friendly and...,Very good place! The host is very friendly and...,0.579167
2,This apartment was in ideal location just a fe...,5,This apartment was in ideal location just a fe...,This apartment was in ideal location just a fe...,0.464048
3,I was very surprised at the level of customer ...,5,I was very surprised at the level of customer ...,I was very surprised at the level of customer ...,0.282308
4,Exceptional reception! Renting this place will...,4,Exceptional reception! Renting this place will...,Exceptional reception! Renting this place will...,0.611111
...,...,...,...,...,...
805,A estadia foi agradável e dentro do esperado. ...,2,The stay was pleasant and inside the expected....,The stay was pleasant and inside the expected....,0.316818
806,"Great location and amenities, very nice host, ...",5,"Great location and amenities, very nice host, ...","Great location and amenities, very nice host, ...",0.707500
807,"Logement au top et très bien situé, proche des...",4,"Housing at the top and very well located, clos...","Housing at the top and very well located, clos...",0.225000
808,"Świetna lokalizacja, dobry kontakt z gospodarz...",2,"Great location, good contact with the host, on...","Great location, good contact with the host, on...",-0.006250


# Adding column with length of review translated to english

In [124]:
df['review_length'] = df['comments_en'].str.len()

In [125]:
df

Unnamed: 0,comments,numerical_review,comments_en,comments_clean,sentiment,review_length
0,"Excelente atención, limpia y cómoda habitación...",2,"Excellent attention, clean and comfortable roo...","Excellent attention, clean and comfortable roo...",0.481667,91
1,Very good place! The host is very friendly and...,5,Very good place! The host is very friendly and...,Very good place! The host is very friendly and...,0.579167,82
2,This apartment was in ideal location just a fe...,5,This apartment was in ideal location just a fe...,This apartment was in ideal location just a fe...,0.464048,483
3,I was very surprised at the level of customer ...,5,I was very surprised at the level of customer ...,I was very surprised at the level of customer ...,0.282308,1396
4,Exceptional reception! Renting this place will...,4,Exceptional reception! Renting this place will...,Exceptional reception! Renting this place will...,0.611111,129
...,...,...,...,...,...,...
805,A estadia foi agradável e dentro do esperado. ...,2,The stay was pleasant and inside the expected....,The stay was pleasant and inside the expected....,0.316818,301
806,"Great location and amenities, very nice host, ...",5,"Great location and amenities, very nice host, ...","Great location and amenities, very nice host, ...",0.707500,134
807,"Logement au top et très bien situé, proche des...",4,"Housing at the top and very well located, clos...","Housing at the top and very well located, clos...",0.225000,133
808,"Świetna lokalizacja, dobry kontakt z gospodarz...",2,"Great location, good contact with the host, on...","Great location, good contact with the host, on...",-0.006250,126


# Dropping columns comments and comments_en, renaming comments_clean to comments

In [126]:
df = df.drop(['comments', 'comments_en'], axis=1).rename(columns={'comments_clean': 'comments'})

In [127]:
df

Unnamed: 0,numerical_review,comments,sentiment,review_length
0,2,"Excellent attention, clean and comfortable roo...",0.481667,91
1,5,Very good place! The host is very friendly and...,0.579167,82
2,5,This apartment was in ideal location just a fe...,0.464048,483
3,5,I was very surprised at the level of customer ...,0.282308,1396
4,4,Exceptional reception! Renting this place will...,0.611111,129
...,...,...,...,...
805,2,The stay was pleasant and inside the expected....,0.316818,301
806,5,"Great location and amenities, very nice host, ...",0.707500,134
807,4,"Housing at the top and very well located, clos...",0.225000,133
808,2,"Great location, good contact with the host, on...",-0.006250,126


# Assigning labels

In [128]:
def assign_label(row):
    return 1 if row['numerical_review'] <= 2 or row['sentiment'] < -0.3 else 0

In [129]:
df['label'] = df.apply(assign_label, axis=1)

In [130]:
df

Unnamed: 0,numerical_review,comments,sentiment,review_length,label
0,2,"Excellent attention, clean and comfortable roo...",0.481667,91,1
1,5,Very good place! The host is very friendly and...,0.579167,82,0
2,5,This apartment was in ideal location just a fe...,0.464048,483,0
3,5,I was very surprised at the level of customer ...,0.282308,1396,0
4,4,Exceptional reception! Renting this place will...,0.611111,129,0
...,...,...,...,...,...
805,2,The stay was pleasant and inside the expected....,0.316818,301,1
806,5,"Great location and amenities, very nice host, ...",0.707500,134,0
807,4,"Housing at the top and very well located, clos...",0.225000,133,0
808,2,"Great location, good contact with the host, on...",-0.006250,126,1


# Saving prapared data

In [131]:
df.to_csv('../data/processed/reviews.csv', index=False)