In [None]:
import pandas as pd

# Example travel review data
data = {
    "Rating": [5, 2, 4, 5],
    "Price": [120, 95, 200, 150],
    "Review": [
        "Great location, very clean",
        "Not worth the price",
        "Will definitely stay again",
        "This hotel was the bomb!"
    ]
}

df = pd.DataFrame(data)
print(df)

   Rating  Price                      Review
0       5    120  Great location, very clean
1       2     95         Not worth the price
2       4    200  Will definitely stay again
3       5    150    This hotel was the bomb!


In [None]:
# Calculate the average hotel rating
average_rating = df['Rating'].mean()
print("Average rating:", average_rating)

# Calculate the average price
average_price = df['Price'].mean()
print("Average price:", average_price)

Average rating: 4.0
Average price: 141.25


In [None]:
# Try to calculate the 'average' review (doesn't work!)
try:
    print(df['Review'].mean())
except Exception as e:
    print("Error when trying to calculate mean of reviews:", e)

Error when trying to calculate mean of reviews: Could not convert string 'Great location, very cleanNot worth the priceWill definitely stay againThis hotel was the bomb!' to numeric


In [None]:
def travel_eliza_bot(user_input):
    user_input = user_input.lower()
    # Rule-based responses
    if "book" in user_input or "reservation" in user_input:
        return "I can help you book your trip! Where do you want to go?"
    elif "cancel" in user_input:
        return "I'm sorry to hear that you want to cancel. Which reservation should I look up?"
    elif "recommend" in user_input or "suggest" in user_input:
        return "Sure! What type of destination are you interested in—beaches, cities, or mountains?"
    elif "hello" in user_input or "hi" in user_input:
        return "Hello! How can I assist you with your travel plans today?"
    elif "thank" in user_input:
        return "You're welcome! Let me know if you have more travel questions."
    elif "price" in user_input or "cost" in user_input:
        return "I can check prices for you. Are you looking for flights, hotels, or both?"
    else:
        return "Can you please tell me more about your travel needs?"

# Try out the bot
print("TravelBot: Hello! How can I assist you with your travel plans today?")
while True:
    user = input("You: ")
    if user.lower() in ["exit", "quit", "bye"]:
        print("TravelBot: Safe travels! Goodbye.")
        break
    response = travel_eliza_bot(user)
    print("TravelBot:", response)

TravelBot: Hello! How can I assist you with your travel plans today?
You: Can you recommend a hotel in Paris?’
TravelBot: Sure! What type of destination are you interested in—beaches, cities, or mountains?


KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('punkt_tab')                  # ← ensure this is installed
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Sample travel review data
data = {
    "Rating": [5, 2, 4, 5],
    "Price": [120, 95, 200, 150],
    "Review": [
        "Great location, very clean!",
        "Not worth the price.",
        "Will definitely stay again.",
        "This hotel was the bomb!!! 😊"
    ]
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

# Step 1: Lowercase conversion (standardization)
df['clean_review'] = df['Review'].str.lower()

# Step 2: Remove punctuation and special characters (noise removal)
def remove_noise(text):
    text = re.sub(r'[^\w\s]', '', text)     # Remove punctuation
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII (e.g., emojis)
    return text

df['clean_review'] = df['clean_review'].apply(remove_noise)

# Step 3: Tokenization (split into words)
df['tokens'] = df['clean_review'].apply(nltk.word_tokenize)

# Step 4: Remove stopwords (common uninformative words)
stop_words = set(stopwords.words('english'))
df['tokens_no_stop'] = df['tokens'].apply(
    lambda tokens: [w for w in tokens if w not in stop_words]
)

# Step 5: Lemmatization (reduce words to base form)
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['tokens_no_stop'].apply(
    lambda tokens: [lemmatizer.lemmatize(t) for t in tokens]
)

# Display the processed DataFrame columns
print("\nProcessed Text Data:")
print(df[['Review', 'clean_review', 'tokens', 'tokens_no_stop', 'lemmatized']])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Original Data:
   Rating  Price                        Review
0       5    120   Great location, very clean!
1       2     95          Not worth the price.
2       4    200   Will definitely stay again.
3       5    150  This hotel was the bomb!!! 😊

Processed Text Data:
                         Review                clean_review  \
0   Great location, very clean!   great location very clean   
1          Not worth the price.         not worth the price   
2   Will definitely stay again.  will definitely stay again   
3  This hotel was the bomb!!! 😊    this hotel was the bomb    

                            tokens            tokens_no_stop  \
0   [great, location, very, clean]  [great, location, clean]   
1         [not, worth, the, price]            [worth, price]   
2  [will, definitely, stay, again]        [definitely, stay]   
3    [this, hotel, was, the, bomb]             [hotel, bomb]   

                 lemmatized  
0  [great, location, clean]  
1            [worth, price]  
2

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download NLTK resources (run once)
nltk.download('punkt')                      # Tokenizer
nltk.download('punkt_tab')                  # ← ensure this is installed
nltk.download('stopwords')                  # Stopwords list
nltk.download('wordnet')                    # Lemmatizer dictionary
nltk.download('averaged_perceptron_tagger')# POS tagger (optional)

# Sample travel review data
data = {
    "Rating": [5, 2, 4, 5],
    "Price": [120, 95, 200, 150],
    "Review": [
        "Great location, very clean!",
        "Not worth the price.",
        "Will definitely stay again.",
        "This hotel was the bomb!!! 😊"
    ]
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

# Step 1: Lowercase conversion (standardization)
df['clean_review'] = df['Review'].str.lower()

# Step 2: Remove punctuation and special characters using str.replace with regex=True
df['clean_review'] = df['clean_review'].str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation

# Step 3: Remove non-ASCII characters (like emojis) using str.replace with regex=True
df['clean_review'] = df['clean_review'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Step 4: Tokenization (split into words)
df['tokens'] = df['clean_review'].apply(nltk.word_tokenize)

# Step 5: Remove stopwords (common uninformative words)
stop_words = set(stopwords.words('english'))
df['tokens_no_stop'] = df['tokens'].apply(lambda tokens: [w for w in tokens if w not in stop_words])

# Step 6: Lemmatization (reduce words to base form)
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['tokens_no_stop'].apply(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens])

# Display the processed DataFrame columns
print("\nProcessed Text Data:")
print(df[['Review', 'clean_review', 'tokens', 'tokens_no_stop', 'lemmatized']])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Original Data:
   Rating  Price                        Review
0       5    120   Great location, very clean!
1       2     95          Not worth the price.
2       4    200   Will definitely stay again.
3       5    150  This hotel was the bomb!!! 😊

Processed Text Data:
                         Review                clean_review  \
0   Great location, very clean!   great location very clean   
1          Not worth the price.         not worth the price   
2   Will definitely stay again.  will definitely stay again   
3  This hotel was the bomb!!! 😊    this hotel was the bomb    

                            tokens            tokens_no_stop  \
0   [great, location, very, clean]  [great, location, clean]   
1         [not, worth, the, price]            [worth, price]   
2  [will, definitely, stay, again]        [definitely, stay]   
3    [this, hotel, was, the, bomb]             [hotel, bomb]   

                 lemmatized  
0  [great, location, clean]  
1            [worth, price]  
2

In [3]:
import pandas as pd
import re, unicodedata
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# -----------------------------
# NLTK resources (run once)
# -----------------------------
nltk.download("punkt")
nltk.download('punkt_tab')                  # ← ensure this is installed
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
nltk.download('averaged_perceptron_tagger_eng') # Download the English tagger

# -----------------------------
# Sample travel reviews
# -----------------------------
df = pd.DataFrame({
    "Rating": [5, 2, 4, 5],
    "Price":  [120, 95, 200, 150],
    "Review": [
        "Great location, very clean!",
        "Not worth the price.",
        "Will definitely stay again.",
        "This hotel was the bomb!!! 😊"
    ]
})

# -----------------------------
# Config toggles (set to True/False as needed)
# -----------------------------
REMOVE_DIGITS = True

# -----------------------------
# Helpers
# -----------------------------
# Full(er) Unicode normalization:
# - NFKD decompose, drop combining marks (accents)
# - NFKC fold (compatibility)
def normalize_unicode(txt: str) -> str:
    if not isinstance(txt, str):
        return ""
    nfkd = unicodedata.normalize("NFKD", txt)
    no_marks = "".join(ch for ch in nfkd if not unicodedata.combining(ch))
    return unicodedata.normalize("NFKC", no_marks)

# Contraction expansion (extend as needed)
_contractions = {
    r"\bcan't\b": "cannot",
    r"\bwon't\b": "will not",
    r"n't\b": " not",
    r"'re\b": " are",
    r"'s\b": " is",
    r"'d\b": " would",
    r"'ll\b": " will",
    r"'t\b": " not",
    r"'ve\b": " have",
    r"'m\b": " am",
    r"\bit’s\b": "it is",
    r"\bit's\b": "it is",
    r"\bi’m\b": "i am",
    r"\bi'm\b": "i am",
}
_contr_re = [(re.compile(p, flags=re.IGNORECASE), r) for p, r in _contractions.items()]

def expand_contractions(txt: str) -> str:
    out = txt
    for cre, repl in _contr_re:
        out = cre.sub(repl, out)
    return out

# POS mapping for WordNet lemmatizer
def wn_pos(tag: str):
    if tag.startswith("J"): return wordnet.ADJ
    if tag.startswith("V"): return wordnet.VERB
    if tag.startswith("N"): return wordnet.NOUN
    if tag.startswith("R"): return wordnet.ADV
    return wordnet.NOUN

# Tiny illustrative naive normalizer (optional, for demo)
def naive_normalize_token(t: str) -> str:
    for suf in ("ing","ed","es","s"):
        if len(t) > 4 and t.endswith(suf):
            return t[:-len(suf)]
    return t

# -----------------------------
# Pipeline
# -----------------------------
clean = df.copy()

# 1) Unicode normalize + lowercase
clean["clean_text"] = clean["Review"].astype(str).apply(normalize_unicode).str.lower()

# 2) Contraction expansion
clean["clean_text"] = clean["clean_text"].apply(expand_contractions)

# 3) Remove punctuation/symbols (vectorized, regex=True)
clean["clean_text"] = clean["clean_text"].str.replace(r"[^\w\s]", " ", regex=True)

# 4) Optional digit removal
if REMOVE_DIGITS:
    clean["clean_text"] = clean["clean_text"].str.replace(r"\d+", " ", regex=True)

# 5) Collapse multiple spaces and trim
clean["clean_text"] = clean["clean_text"].str.replace(r"\s+", " ", regex=True).str.strip()

# 6) Tokenization (NLTK)
clean["tokens"] = clean["clean_text"].apply(nltk.word_tokenize)

# 7) Stopword removal
stop_words = set(stopwords.words("english"))
clean["tokens_no_stop"] = clean["tokens"].apply(lambda toks: [w for w in toks if w not in stop_words])

# 8) POS tagging on the filtered tokens
clean["pos_tags"] = clean["tokens_no_stop"].apply(nltk.pos_tag)

# 9) Lemmatization with POS
lemmatizer = WordNetLemmatizer()
def lemmatize_with_pos(toks):
    tagged = nltk.pos_tag(toks)
    return [lemmatizer.lemmatize(w, wn_pos(tag)) for w, tag in tagged]

clean["lemmatized"] = clean["tokens_no_stop"].apply(lemmatize_with_pos)

# 10) Convenience columns (as in earlier version)
clean["clean_no_stop"] = clean["tokens_no_stop"].apply(lambda toks: " ".join(toks))
clean["tokens_normalized"] = clean["tokens_no_stop"].apply(lambda toks: [naive_normalize_token(t) for t in toks])

# Final preview
print(clean[[
    "Review",
    "clean_text",
    "tokens",
    "tokens_no_stop",
    "lemmatized",
    "pos_tags",
    "clean_no_stop",
    "tokens_normalized"
]])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


                         Review                  clean_text  \
0   Great location, very clean!   great location very clean   
1          Not worth the price.         not worth the price   
2   Will definitely stay again.  will definitely stay again   
3  This hotel was the bomb!!! 😊     this hotel was the bomb   

                            tokens            tokens_no_stop  \
0   [great, location, very, clean]  [great, location, clean]   
1         [not, worth, the, price]            [worth, price]   
2  [will, definitely, stay, again]        [definitely, stay]   
3    [this, hotel, was, the, bomb]             [hotel, bomb]   

                 lemmatized                                    pos_tags  \
0  [great, location, clean]  [(great, JJ), (location, NN), (clean, NN)]   
1            [worth, price]                  [(worth, JJ), (price, NN)]   
2        [definitely, stay]              [(definitely, RB), (stay, VB)]   
3             [hotel, bomb]                   [(hotel, NN), (bo