In [1]:
import os
import pandas as pd
import torch
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [2]:
def load_and_preprocess_data(file_path):
    df = pd.read_excel(file_path)
    df.dropna(subset=['Reviews', 'Rating'], inplace=True)
    df['Categories'] = df['Categories'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
    df['Activities'] = df['Activities'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
    return df

# Example usage
file_path = "/content/ML SCE.xlsx"  # Replace with your actual file path
df = load_and_preprocess_data(file_path)
print("Dataset loaded and preprocessed successfully!")


Dataset loaded and preprocessed successfully!


In [3]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, reviews, labels, tokenizer, max_length):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            review,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def fine_tune_sentiment_model(df, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    reviews = df['Reviews'].tolist()
    ratings = df['Rating'].tolist()
    labels = [1 if rating >= 4 else 0 for rating in ratings]

    train_reviews, val_reviews, train_labels, val_labels = train_test_split(reviews, labels, test_size=0.2, random_state=42)

    train_dataset = ReviewDataset(train_reviews, train_labels, tokenizer, max_length=128)
    val_dataset = ReviewDataset(val_reviews, val_labels, tokenizer, max_length=128)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return model, tokenizer

# Train the sentiment model
sentiment_model, tokenizer = fine_tune_sentiment_model(df)
print("Sentiment analysis model fine-tuned successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,0.5133
20,0.3533
30,0.2303
40,0.3231
50,0.3211


Sentiment analysis model fine-tuned successfully!


In [4]:
def train_multilabel_classifiers(df):
    mlb_categories = MultiLabelBinarizer()
    mlb_activities = MultiLabelBinarizer()

    y_categories = mlb_categories.fit_transform(df['Categories'])
    y_activities = mlb_activities.fit_transform(df['Activities'])

    vectorizer = TfidfVectorizer(max_features=5000)
    X_tfidf = vectorizer.fit_transform(df['Reviews'])

    classifier_categories = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    classifier_activities = OneVsRestClassifier(LogisticRegression(max_iter=1000))

    classifier_categories.fit(X_tfidf, y_categories)
    classifier_activities.fit(X_tfidf, y_activities)

    return vectorizer, classifier_categories, classifier_activities, mlb_categories, mlb_activities

# Train the multi-label classifiers
vectorizer, classifier_categories, classifier_activities, mlb_categories, mlb_activities = train_multilabel_classifiers(df)
print("Multi-label classifiers trained successfully!")


Multi-label classifiers trained successfully!


In [5]:
def save_models(sentiment_model, tokenizer, vectorizer, classifier_categories, classifier_activities, mlb_categories, mlb_activities, save_dir):
    os.makedirs(save_dir, exist_ok=True)

    sentiment_model.save_pretrained(os.path.join(save_dir, 'sentiment_model'))
    tokenizer.save_pretrained(os.path.join(save_dir, 'tokenizer'))

    joblib.dump(vectorizer, os.path.join(save_dir, 'vectorizer.joblib'))
    joblib.dump(classifier_categories, os.path.join(save_dir, 'classifier_categories.joblib'))
    joblib.dump(classifier_activities, os.path.join(save_dir, 'classifier_activities.joblib'))
    joblib.dump(mlb_categories, os.path.join(save_dir, 'mlb_categories.joblib'))
    joblib.dump(mlb_activities, os.path.join(save_dir, 'mlb_activities.joblib'))

def load_models(load_dir):
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(os.path.join(load_dir, 'sentiment_model'))
    tokenizer = AutoTokenizer.from_pretrained(os.path.join(load_dir, 'tokenizer'))

    vectorizer = joblib.load(os.path.join(load_dir, 'vectorizer.joblib'))
    classifier_categories = joblib.load(os.path.join(load_dir, 'classifier_categories.joblib'))
    classifier_activities = joblib.load(os.path.join(load_dir, 'classifier_activities.joblib'))
    mlb_categories = joblib.load(os.path.join(load_dir, 'mlb_categories.joblib'))
    mlb_activities = joblib.load(os.path.join(load_dir, 'mlb_activities.joblib'))

    return sentiment_model, tokenizer, vectorizer, classifier_categories, classifier_activities, mlb_categories, mlb_activities

# Save models
save_directory = "MyModel"
save_models(sentiment_model, tokenizer, vectorizer, classifier_categories, classifier_activities, mlb_categories, mlb_activities, save_directory)
print("Models saved successfully!")

# Load models
loaded_models = load_models(save_directory)
print("Models loaded successfully!")


Models saved successfully!
Models loaded successfully!


In [10]:
 def analyze_review(review, sentiment_model, tokenizer, vectorizer, classifier_categories, classifier_activities, mlb_categories, mlb_activities):
    # Sentiment Analysis
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = sentiment_model(**inputs)
    sentiment_score = torch.softmax(outputs.logits, dim=1)[0][1].item()
    sentiment = "Positive" if sentiment_score >= 0.5 else "Negative"

    # TF-IDF Vectorization
    X_tfidf = vectorizer.transform([review])

    # Predict Categories and Activities
    predicted_categories = classifier_categories.predict(X_tfidf)
    predicted_activities = classifier_activities.predict(X_tfidf)

    categories = mlb_categories.inverse_transform(predicted_categories)[0]
    activities = mlb_activities.inverse_transform(predicted_activities)[0]

    return {
        "sentiment": sentiment,
        "sentiment_score": sentiment_score,
        "categories": list(categories),
        "activities": list(activities)
    }

# Test the model with a review
review = '''review by navinya a rating ganpatipule temple is not just a religious site but also a beautiful destination for those seeking tranquility and connection with nature its a mustvisit for anyone traveling through maharashtra
  the ambiance is peaceful making it an ideal spot for meditation and reflection local devotees and tourists alike contribute to a warm welcoming atmosphere additionally the nearby beach provides a perfect opportunity for relaxation after visiting the temple review by sam panthaki rating this temple is located on ganpatipule beach
  next to temple we see the ganpatipule beach and it also has multiple water activities being held such as banana ride jet ski boat parasailing etc

  the place is beautiful and calm the temple gives quite positive and religious vibes and calms you

  a visit to ganpatipule temple offers not only a religious experience but also a chance to appreciate the natural beauty that complements its sacred ambiance review by soham alekari rating ganpatipule is a very nice place you can visit in any month ganpatipule is also famous for its beach the sea is very clean you can get mahaprasad at pm to pm the ladu prasad stall opens at am to pm review by prashant rane rating he temple houses a selfmanifested swayambhu idol of lord ganesha which is believed to have naturally emerged from the earth this makes the idol particularly sacred to devotees

  the deity worshiped here is known as paschim dwar devata meaning the western sentinel god as it faces west this is rare as most ganapati idols face the east

  ganapatipule is a significant pilgrimage site drawing thousands of devotees especially during festivals like ganesh chaturthi and magh chaturthi the temple is believed to fulfill the wishes of its devotees and many visit it to seek blessings for prosperity and wellbeing

  the temple structure is simple yet aesthetically pleasing blending traditional konkani architecture with a serene coastal vibe the main temple is made of stone and is adorned with intricate carvings and decorations

  one unique feature of the temple is the pradakshina circumambulation path that circles both the temple and the hill behind it devotees walk along this path offering prayers and taking in the breathtaking views of the sea and surrounding landscape

  ganapatipule is wellconnected by road and can be easily reached from nearby cities like ratnagiri which is about km away the temple is accessible by car or bus and there are plenty of accommodation options nearby for visitors review by anil chaudhary rating ganesh temple on shores nice clean beach we had a darshan in minutes maybe we were lucky i was told it could take hours on other days'''
result = analyze_review(review, *loaded_models)
print("Review Analysis Result:", result)


Review Analysis Result: {'sentiment': 'Positive', 'sentiment_score': 0.999724805355072, 'categories': [" 'cultural']", "['religious'"], 'activities': [" 'chilling'", " 'cultural experience'", "['relaxing'"]}
