In [1]:
%store -r df_reviews

In [2]:
df_reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,323255,992614,2012-03-12,1354573,Renato,"I had a relaxing time in Berlin, it looks so o..."
1,3176,4283,2009-06-20,21475,Milind,"excellent stay, i would highly recommend it. a..."
2,3176,134722,2010-11-07,263467,George,Britta's apartment in Berlin is in a great are...
3,3176,144064,2010-11-24,76726,Patricia,"Fantastic, large place in good location. Only ..."
4,3176,156702,2010-12-21,291657,Benedetta,L'appartamento di Britta è molto largo carino ...
...,...,...,...,...,...,...
588587,1363985791474119923,1371700285888448045,2025-03-07,246246815,Sean,Enjoyed staying at Laura's place while in Berl...
588588,1364272993672534452,1365911234905406942,2025-02-27,66290251,Felix,I stayed in Niklas’ penthouse for a few days i...
588589,1364272993672534452,1366622592968053368,2025-02-28,51293498,Matteo,"The place is really amazing, I loved the desig..."
588590,1364272993672534452,1367344527016221502,2025-03-01,3009394,David,My stay with Niklas in Berlin was just fantast...


In [3]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588592 entries, 0 to 588591
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     588592 non-null  int64 
 1   id             588592 non-null  int64 
 2   date           588592 non-null  object
 3   reviewer_id    588592 non-null  int64 
 4   reviewer_name  588591 non-null  object
 5   comments       588552 non-null  object
dtypes: int64(3), object(3)
memory usage: 26.9+ MB


In [4]:
df_reviews.shape

(588592, 6)

In [5]:
df_reviews.columns

Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')

In [6]:
import pandas as pd
import numpy as np

In [7]:
df_reviews['date'] = pd.to_datetime(df_reviews['date'], errors='coerce')

In [8]:
df_reviews_clean = df_reviews.dropna(subset=['listing_id', 'comments']).copy()

In [9]:
%store -r lst

In [10]:
df_reviews_clean = df_reviews_clean[df_reviews_clean['listing_id'].isin(lst['id'])]

In [11]:
print(f"After basic cleaning: {df_reviews_clean.shape}")
print(f"Unique listings with reviews: {df_reviews_clean['listing_id'].nunique()}")

After basic cleaning: (588303, 6)
Unique listings with reviews: 10618


In [12]:
import pandas as pd
import numpy as np
import re
from langdetect import detect, LangDetectException

def filter_english_reviews(df, comment_col='comments'):
    """
    Filter DataFrame to keep only English reviews with high confidence
    """
    print(f"Starting with {len(df):,} reviews")
    
    def is_english(text):
        """Determine if text is English using multiple checks"""
        if pd.isna(text) or not str(text).strip():
            return False
        
        text = str(text).strip()
        
        # Check 1: Minimum length (at least 3 words)
        if len(text.split()) < 3:
            return False
        
        # Check 2: Mostly Latin characters
        alpha_chars = [c for c in text if c.isalpha()]
        if alpha_chars:
            latin_ratio = sum(1 for c in alpha_chars if ord(c) < 256) / len(alpha_chars)
            if latin_ratio < 0.8:  # Less than 80% Latin characters
                return False
        
        # Check 3: Common English words (hospitality context)
        english_indicators = [
            'the', 'and', 'was', 'very', 'great', 'good', 'nice', 'place', 'stay',
            'location', 'clean', 'host', 'apartment', 'room', 'would', 'recommend',
            'amazing', 'perfect', 'beautiful', 'comfortable', 'friendly', 'helpful'
        ]
        
        text_lower = text.lower()
        english_word_count = sum(1 for word in english_indicators if f' {word} ' in f' {text_lower} ')
        
        if english_word_count < 1:
            return False
        
        # Check 4: Language detection
        try:
            detected = detect(text)
            if detected != 'en':
                return False
        except (LangDetectException, Exception):
            # If detection fails, need at least 2 English indicators
            if english_word_count < 2:
                return False
        
        # Check 5: Exclude obvious non-English patterns
        non_english_patterns = [
            r'[àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]',  # Accented characters
            r'\b(sehr|ist|das|die|der|und|mit|ich|war|haben|nicht)\b',  # German
            r'\b(très|est|dans|pour|avec|sur|une|nous|vous|tout)\b',  # French
            r'\b(muy|está|para|con|por|una|pero|todo|más|bien)\b',  # Spanish
            r'\b(molto|anche|sono|della|questa|tutto|proprio)\b',  # Italian
        ]
        
        for pattern in non_english_patterns:
            if re.search(pattern, text_lower):
                return False
        
        return True
    
    # Apply English filter
    english_mask = df[comment_col].apply(is_english)
    df_english = df[english_mask].copy()
    
    print(f"After English filtering: {len(df_english):,} reviews")
    print(f"Removed: {len(df) - len(df_english):,} reviews ({(len(df) - len(df_english))/len(df)*100:.1f}%)")
    
    # Clean the text
    def clean_text(text):
        """Clean and standardize text"""
        if pd.isna(text):
            return np.nan
        
        text = str(text).strip()
        
        # Remove excessive punctuation/symbols, keep basic punctuation
        text = re.sub(r'[^\w\s.,!?()-]', ' ', text)
        
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Final length check
        if len(text.split()) < 3:
            return np.nan
        
        return text
    
    # Apply cleaning
    df_english['comments_clean'] = df_english[comment_col].apply(clean_text)
    df_english = df_english.dropna(subset=['comments_clean'])
    
    print(f"After text cleaning: {len(df_english):,} reviews")
    print(f"Unique listings: {df_english['listing_id'].nunique():,}")
    
    return df_english

# Apply to your cleaned data
df_english_reviews = filter_english_reviews(df_reviews_clean)

# Show results
print(f"\n📊 Final Results:")
print(f"English reviews: {len(df_english_reviews):,}")
print(f"Average review length: {df_english_reviews['comments_clean'].str.len().mean():.0f} characters")
print(f"Reviews per listing (avg): {len(df_english_reviews) / df_english_reviews['listing_id'].nunique():.1f}")

# Sample reviews
print(f"\n📝 Sample English reviews:")
for i, comment in enumerate(df_english_reviews['comments_clean'].head(3)):
    print(f"{i+1}. {comment[:120]}...")

print(f"\n✅ Ready for your ML project!")

Starting with 588,303 reviews
After English filtering: 304,756 reviews
Removed: 283,547 reviews (48.2%)
After text cleaning: 304,691 reviews
Unique listings: 9,840

📊 Final Results:
English reviews: 304,691
Average review length: 235 characters
Reviews per listing (avg): 31.0

📝 Sample English reviews:
1. I had a relaxing time in Berlin, it looks so open to people with different background, cultures and tastes. I m surely c...
2. excellent stay, i would highly recommend it. a nice flat in a very nice area. Britta provided clear instructions in secu...
3. Britta s apartment in Berlin is in a great area. There are numerous fantastic Restaurants and Bars to suit every taste, ...

✅ Ready for your ML project!


In [18]:
df_english_reviews = df_english_reviews.drop('comments',axis=1)

In [20]:
%store df_english_reviews

Stored 'df_english_reviews' (DataFrame)


In [22]:
review_counts = df_english_reviews['listing_id'].value_counts().reset_index()
review_counts.columns = ['listing_id', 'review_count']

# Sort by listing_id for cleaner display
review_counts = review_counts.sort_values('listing_id').reset_index(drop=True)

print(f"Review count DataFrame created with {len(review_counts)} listings")
print(f"\nFirst 5 rows:")
print(review_counts.head())

print(f"\nSummary statistics:")
print(review_counts['review_count'].describe())


Review count DataFrame created with 9840 listings

First 5 rows:
   listing_id  review_count
0        3176           121
1        9991             6
2       14325            22
3       16644            36
4       17904           222

Summary statistics:
count    9840.000000
mean       30.964533
std        60.176782
min         1.000000
25%         3.000000
50%         9.000000
75%        30.000000
max      1541.000000
Name: review_count, dtype: float64


In [26]:
%store review_counts

Stored 'review_counts' (DataFrame)


In [27]:
review_count_per_listing = df_english_reviews['listing_id'].value_counts()

print(f"Total unique listings: {len(review_count_per_listing)}")
print(f"Average reviews per listing: {review_count_per_listing.mean():.2f}")

print("\nTop 10 listings by review count:")
print(review_count_per_listing.head(10))

Total unique listings: 9840
Average reviews per listing: 30.96

Top 10 listings by review count:
listing_id
34294913              1541
49070135              1127
45475252               814
46864858               784
43798011               766
49844243               709
721527807549293451     616
47215807               544
32539441               531
264459                 518
Name: count, dtype: int64
