<a href="https://colab.research.google.com/github/Michwynn/London-Airbnb-Analysis---2/blob/Elias/Airbnb_Data_Base_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Configuration and library set-up**

In [1]:
# data manipulation
import pandas as pd
import numpy as np 
import re # regex
import random
from collections import Counter, defaultdict

# machine learning
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# progress bars
from tqdm import tqdm

# data visualisation
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Set up the working directory: mount Google Drive in the Colab runtime, then
# change into the shared project folder so that the relative paths used by
# later cells (e.g. 'Datasets/reviews.csv', 'lid.176.bin') resolve.
# The mount has to succeed before the %cd can find the folder.
import os
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Airbnb_Milestone2

# Suppress warnings.
# NOTE(review): this filter silences every warning category for the rest of
# the session (including pandas' SettingWithCopyWarning) — consider narrowing
# it to the specific categories that are known to be noise.
import warnings 
warnings.filterwarnings('ignore')

# Display all columns when a DataFrame is rendered (no column truncation)
pd.set_option('display.max_columns', None)

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1wUOfFY-ki2nFzneeaTtXLEeMjaSdKrrj/Airbnb_Milestone2


In [4]:
# Load the raw reviews export; the path is relative to the current working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index that
# was written out with the CSV — confirm before relying on or dropping it.
reviews_df = pd.read_csv('Datasets/reviews.csv')
# Normalise the header: trim leading/trailing whitespace from every column name
reviews_df = reviews_df.rename(columns=str.strip)
# Preview the loaded frame
display(reviews_df)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,52228441,623723762668719111,2022-05-10,37052865,Kimberly,"Great location, and the host was very responsi..."
1,52228441,505671819125096360,2021-11-28,70830110,Mahelet,Duccio is a lovely and friendly host. From arr...
2,52228441,466510411892882382,2021-10-05,83617224,Will,Duccio is a good communicator… he was very hel...
3,52228441,604109461995958546,2022-04-13,2152541,Francesco,Not entirely compliant to the pics.<br/>Good l...
4,605617198416835367,633128504578904919,2022-05-23,45418187,Waddah,Great place and great host
...,...,...,...,...,...,...
1216207,14832630,184884203,2017-08-20,54407484,丽云,干净舒适，适合家庭入住，地段优越，景点全部很多都可以步行到达，节省了交通费。<br/>房间设...
1216208,14832630,173531056,2017-07-23,130284505,Véronique,Excellente situation au plein centre de Londre...
1216209,14832630,153170957,2017-05-20,119296298,Nicole,"Todo perfecto !! Ubicación, comodidad y Rachel..."
1216210,14832630,145456398,2017-04-18,16394435,Christophe,Tout d'abord même si nous n'avons pas eu le pl...




In [12]:
# installing and Importing fasttext library for language identification
!pip install fasttext
import fasttext
# settting up the pretrained model
fasttxt_model = fasttext.load_model("lid.176.bin")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




In [15]:
# Sanity-check the model on a Korean sentence; the expected result is 'ko'.
# fastText returns labels of the form '__label__<code>'. Strip the prefix instead
# of slicing the last two characters, because lid.176 also emits three-letter
# codes (e.g. 'ceb', 'yue') that a [-2:] slice would truncate.
fasttxt_model.predict('이 집은 정말 좋아요! 맘에 쏙 드네요')[0][0].replace('__label__', '')

'ko'

In [14]:
def detect_language(text):
    """Return the language code fastText predicts for a single comment.

    Parameters
    ----------
    text : str
        One review comment.

    Returns
    -------
    str
        The top predicted label with its '__label__' prefix removed
        (e.g. 'en', 'fr', 'ceb').
    """
    # fastText's predict() processes one line at a time and raises ValueError
    # when the input contains '\n', so fold line breaks into spaces first.
    single_line = text.replace('\n', ' ')
    top_label = fasttxt_model.predict(single_line)[0][0]
    # Remove the label prefix rather than slicing the last two characters:
    # lid.176 includes three-letter codes that a [-2:] slice would truncate.
    return top_label.replace('__label__', '')


# Trim leading/trailing whitespace from each comment
reviews_df['cleaned_comments'] = reviews_df['comments'].str.strip()
# Keep only rows that have a comment. The explicit copy makes the result an
# independent frame, so adding the 'lang' column below is not a chained
# assignment on a slice of the original.
reviews_df = reviews_df[reviews_df['cleaned_comments'].notna()].copy()

# Identify the language of every remaining comment
reviews_df['lang'] = reviews_df['cleaned_comments'].apply(detect_language)
display(reviews_df)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,cleaned_comments,lang
0,52228441,623723762668719111,2022-05-10,37052865,Kimberly,"Great location, and the host was very responsi...","Great location, and the host was very responsi...",en
1,52228441,505671819125096360,2021-11-28,70830110,Mahelet,Duccio is a lovely and friendly host. From arr...,Duccio is a lovely and friendly host. From arr...,en
2,52228441,466510411892882382,2021-10-05,83617224,Will,Duccio is a good communicator… he was very hel...,Duccio is a good communicator… he was very hel...,en
3,52228441,604109461995958546,2022-04-13,2152541,Francesco,Not entirely compliant to the pics.<br/>Good l...,Not entirely compliant to the pics.<br/>Good l...,en
4,605617198416835367,633128504578904919,2022-05-23,45418187,Waddah,Great place and great host,Great place and great host,en
...,...,...,...,...,...,...,...,...
1216207,14832630,184884203,2017-08-20,54407484,丽云,干净舒适，适合家庭入住，地段优越，景点全部很多都可以步行到达，节省了交通费。<br/>房间设...,干净舒适，适合家庭入住，地段优越，景点全部很多都可以步行到达，节省了交通费。<br/>房间设...,en
1216208,14832630,173531056,2017-07-23,130284505,Véronique,Excellente situation au plein centre de Londre...,Excellente situation au plein centre de Londre...,fr
1216209,14832630,153170957,2017-05-20,119296298,Nicole,"Todo perfecto !! Ubicación, comodidad y Rachel...","Todo perfecto !! Ubicación, comodidad y Rachel...",es
1216210,14832630,145456398,2017-04-18,16394435,Christophe,Tout d'abord même si nous n'avons pas eu le pl...,Tout d'abord même si nous n'avons pas eu le pl...,fr
