In [23]:
import pandas as pd

df = pd.read_csv("scraping-project.csv")
df.head()

Unnamed: 0,platform,listing_id,title,city,bedrooms,bathrooms,price,rating,review_count,amenities_count,last_scraped
0,airbnb,1474375658014894640,Cabin in Mayrouba,Mayrouba,2,1,108.6,5.0,16,31,19/11/2025 11:35
1,airbnb,1542196639719322486,Villa in Faqra,Faqra,3,2,341.4,5.0,3,37,19/11/2025 11:35
2,airbnb,1526620901894322208,Apartment in Beirut,Beirut,1,1,43.2,5.0,3,22,19/11/2025 11:35
3,airbnb,1200706754310956282,Cabin in Chabtine,Chabtine,1,1,114.2,4.97,65,20,19/11/2025 11:35
4,airbnb,1393555511912612853,Apartment in Beirut,Beirut,1,1,63.6,4.97,32,37,19/11/2025 11:35


In [24]:
print(f"Shape of DataFrame before dropping duplicates: {df.shape}")

df = df.sort_values(by='review_count', ascending=False).drop_duplicates(subset=['title', 'price', 'amenities_count', 'bedrooms', 'bathrooms'], keep='first')

print(f"Shape of DataFrame after dropping duplicates: {df.shape}")
df.head()

Shape of DataFrame before dropping duplicates: (1863, 11)
Shape of DataFrame after dropping duplicates: (1391, 11)


Unnamed: 0,platform,listing_id,title,city,bedrooms,bathrooms,price,rating,review_count,amenities_count,last_scraped
99,airbnb,51726083,Apartment in Beirut,Beirut,,1,35,4.75,99,37,19/11/2025 11:39
1149,Stayinn,132,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$92 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-24T06:34:34.893310
848,Stayinn,131,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$91 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-22T06:30:40.175210
400,Stayinn,131,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$94 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-19T14:27:06.724146
700,Stayinn,133,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$93 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-21T06:34:34.844513


In [25]:
df.drop(columns=['listing_id'], inplace=True)
df.head()

Unnamed: 0,platform,title,city,bedrooms,bathrooms,price,rating,review_count,amenities_count,last_scraped
99,airbnb,Apartment in Beirut,Beirut,,1,35,4.75,99,37,19/11/2025 11:39
1149,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$92 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-24T06:34:34.893310
848,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$91 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-22T06:30:40.175210
400,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$94 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-19T14:27:06.724146
700,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1 bedroom,1 bathroom,$93 / night,5.0,98 reviews,Show All 43 Amenities,2025-11-21T06:34:34.844513


In [26]:
print(f"Shape of DataFrame before dropping rows with excessive missing values: {df.shape}")

missing_values_per_row = df.isnull().sum(axis=1)

df = df[missing_values_per_row <= 3]

print(f"Shape of DataFrame after dropping rows with excessive missing values: {df.shape}")

Shape of DataFrame before dropping rows with excessive missing values: (1391, 10)
Shape of DataFrame after dropping rows with excessive missing values: (1362, 10)


In [27]:
print("DataFrame info before type conversion and imputation:")
df.info()

# Columns to convert to numeric
numeric_cols_to_convert = ['bedrooms', 'bathrooms', 'price', 'review_count', 'amenities_count']

# String cleaning for specific columns before converting to numeric
# Ensure the column is treated as string, handle potential NaN values.

df['bedrooms'] = df['bedrooms'].astype(str).str.replace(r' bedroom(s)?', '', regex=True)
df['bathrooms'] = df['bathrooms'].astype(str).str.replace(r' bathroom(s)?', '', regex=True)
df['price'] = df['price'].astype(str).str.replace(r'[$,/night]', '', regex=True)
df['review_count'] = df['review_count'].astype(str).str.replace(r' reviews', '', regex=True)
df['amenities_count'] = df['amenities_count'].astype(str).str.replace(r'Show All | Amenities', '', regex=True)

for col in numeric_cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')

for col in numeric_cols_to_convert:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

if df['rating'].isnull().any():
    df['rating'] = df['rating'].fillna(df['rating'].mean())

int_cols_to_convert = ['bedrooms', 'bathrooms', 'review_count', 'amenities_count']
for col in int_cols_to_convert:
    df[col] = df[col].astype(int)

print("\nDataFrame info after type conversion and imputation:")
df.info()
df.head()

DataFrame info before type conversion and imputation:
<class 'pandas.core.frame.DataFrame'>
Index: 1362 entries, 99 to 266
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   platform         1362 non-null   object 
 1   title            1362 non-null   object 
 2   city             1357 non-null   object 
 3   bedrooms         1273 non-null   object 
 4   bathrooms        1168 non-null   object 
 5   price            1295 non-null   object 
 6   rating           685 non-null    float64
 7   review_count     1338 non-null   object 
 8   amenities_count  1362 non-null   object 
 9   last_scraped     1362 non-null   object 
dtypes: float64(1), object(9)
memory usage: 117.0+ KB

DataFrame info after type conversion and imputation:
<class 'pandas.core.frame.DataFrame'>
Index: 1362 entries, 99 to 266
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------

Unnamed: 0,platform,title,city,bedrooms,bathrooms,price,rating,review_count,amenities_count,last_scraped
99,airbnb,Apartment in Beirut,Beirut,1,1,35.0,4.75,99,37,19/11/2025 11:39
1149,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1,1,92.0,5.0,98,43,2025-11-24T06:34:34.893310
848,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1,1,91.0,5.0,98,43,2025-11-22T06:30:40.175210
400,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1,1,94.0,5.0,98,43,2025-11-19T14:27:06.724146
700,Stayinn,"Amine 601, 1-BR in Mar Mikhael","Apartment 50 sq. m in Armenia, Beirut, Beirut ...",1,1,93.0,5.0,98,43,2025-11-21T06:34:34.844513


In [28]:
print("Unique cities before standardization:")
print(df['city'].value_counts().head(20))

def standardize_city_names(city_name):
    if pd.isna(city_name):
        return city_name
    city = str(city_name).strip().lower()

    # Specific standardizations
    if 'batroun' in city:
        return 'Batroun'
    if 'beirut' in city:
        return 'Beirut'
    if 'achrafieh' in city or 'ashrafieh' in city:
        return 'Achrafieh'
    if 'mayrouba' in city:
        return 'Mayrouba'
    if 'faqra' in city:
        return 'Faqra'
    if 'chabtine' in city:
        return 'Chabtine'
    if 'jounieh' in city:
        return 'Jounieh'
    if 'byblos' in city or 'jbeil' in city:
        return 'Byblos'
    if 'harissa' in city:
        return 'Harissa'
    if 'zouk mosbeh' in city:
        return 'Zouk Mosbeh'
    if 'rabieh' in city:
        return 'Rabieh'
    if 'baabda' in city:
        return 'Baabda'
    if 'byblos' in city:
        return 'Byblos'
    if 'jeita' in city:
        return 'Jeita'
    if 'ghosta' in city:
        return 'Ghosta'
    if 'kesrouan' in city or 'kesrwane' in city:
        return 'Keserouan'
    if 'berbara' in city:
        return 'Berbara'

    return city_name.title()

df['city'] = df['city'].apply(standardize_city_names)

print("\nUnique cities after standardization:")
print(df['city'].value_counts().head(20))
print("\nDataFrame head after city standardization:")
df.head()

Unique cities before standardization:
city
El Batroun                                                                       207
Keserouan                                                                        176
Beirut                                                                           128
Jbeil                                                                             62
Apartment 200 sq. m in Mazraat Kfar Dibiane, Faqra, Mount Lebanon Governorate     39
Aley                                                                              38
Chouf                                                                             36
Bcharre                                                                           27
Apartment in Berbara, Keserwanâ€‘Jbeil Governorate                                  24
El Meten                                                                          20
Apartment 80 sq. m in Armenia, Beirut, Beirut Governorate                         16
Koura               

Unnamed: 0,platform,title,city,bedrooms,bathrooms,price,rating,review_count,amenities_count,last_scraped
99,airbnb,Apartment in Beirut,Beirut,1,1,35.0,4.75,99,37,19/11/2025 11:39
1149,Stayinn,"Amine 601, 1-BR in Mar Mikhael",Beirut,1,1,92.0,5.0,98,43,2025-11-24T06:34:34.893310
848,Stayinn,"Amine 601, 1-BR in Mar Mikhael",Beirut,1,1,91.0,5.0,98,43,2025-11-22T06:30:40.175210
400,Stayinn,"Amine 601, 1-BR in Mar Mikhael",Beirut,1,1,94.0,5.0,98,43,2025-11-19T14:27:06.724146
700,Stayinn,"Amine 601, 1-BR in Mar Mikhael",Beirut,1,1,93.0,5.0,98,43,2025-11-21T06:34:34.844513


In [29]:
df.to_csv('cleaned_scraping_project.csv', index=False)
print("DataFrame saved to 'cleaned_scraping_project.csv'")

DataFrame saved to 'cleaned_scraping_project.csv'
