<a href="https://colab.research.google.com/github/detkartik/upgrad_assignment/blob/master/sentiment_based_recommendation_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Import Libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split as surprise_train_test_split
from surprise.accuracy import rmse
from google.colab import drive
from surprise.model_selection import train_test_split

2. Load and Inspect Data

In [14]:
# Load your dataset
drive.mount('/content/drive')
data = pd.read_csv('/content/drive/MyDrive/sample30 (1).csv')

# Display first few rows and summary info
print(data.head())
print(data.info())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                     id            brand  \
0  AV13O1A8GV-KLJ3akUyj  Universal Music   
1  AV14LG0R-jtxr-f38QfS         Lundberg   
2  AV14LG0R-jtxr-f38QfS         Lundberg   
3  AV16khLE-jtxr-f38VFn              K-Y   
4  AV16khLE-jtxr-f38VFn              K-Y   

                                          categories  \
0  Movies, Music & Books,Music,R&b,Movies & TV,Mo...   
1  Food,Packaged Foods,Snacks,Crackers,Snacks, Co...   
2  Food,Packaged Foods,Snacks,Crackers,Snacks, Co...   
3  Personal Care,Medicine Cabinet,Lubricant/Sperm...   
4  Personal Care,Medicine Cabinet,Lubricant/Sperm...   

                         manufacturer  \
0  Universal Music Group / Cash Money   
1                            Lundberg   
2                            Lundberg   
3                                 K-Y   
4                                 K-Y   

                      

3. Data Cleaning and Preprocessing

In [15]:
data = data.dropna(subset=['reviews_text', 'user_sentiment'])
data['reviews_text'] = data['reviews_text'].str.lower().str.replace('[^\w\s]', '')
data['user_sentiment'] = data['user_sentiment'].map({'Positive': 1, 'Negative': 0})


4. Text Processing and Feature Extraction

In [16]:
# Vectorize text data for sentiment analysis
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['reviews_text'])
y = data['user_sentiment']

# Example: TF-IDF Vectorization for content-based filtering
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
content_vectors = tfidf_vectorizer.fit_transform(data['name'] + ' ' + data['categories'])

5. Machine Learning Models (Sentiment Analysis)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred = logreg_model.predict(X_test)
print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.9448333333333333
              precision    recall  f1-score   support

           0       0.79      0.67      0.73       653
           1       0.96      0.98      0.97      5347

    accuracy                           0.94      6000
   macro avg       0.88      0.83      0.85      6000
weighted avg       0.94      0.94      0.94      6000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


6. Recommendation System (Collaborative Filtering)

In [2]:
!pip install scikit-surprise



In [19]:


reader = Reader(rating_scale=(1, 5))

# Load the data from the DataFrame
data_surprise = Dataset.load_from_df(data[['reviews_username', 'name', 'reviews_rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data_surprise, test_size=0.25)

# Use the SVD algorithm
algo = SVD()

# Train the algorithm on the trainset
algo.fit(trainset)

# Test the algorithm on the testset
predictions = algo.test(testset)

# Compute and print the RMSE
print('RMSE:', rmse(predictions))

RMSE: 0.7678
RMSE: 0.7678475253054363


7. Content-Based Filtering

In [4]:


# Combine relevant features for content-based filtering
data['combined_features'] = data['name'] + " " + data['categories'] + " " + data['reviews_text']

# Vectorize the combined features
vectorizer = TfidfVectorizer(stop_words='english')
content_vectors = vectorizer.fit_transform(data['combined_features'])

# # Compute cosine similarity matrix
batch_size = 1000
num_rows = content_vectors.shape[0]
cosine_sim = np.zeros((num_rows, num_rows))  # Initialize cosine similarity matrix

# Compute cosine similarity in batches
for i in range(0, num_rows, batch_size):
    start_idx = i
    end_idx = min(i + batch_size, num_rows)
    batch_similarity = cosine_similarity(content_vectors[start_idx:end_idx], content_vectors)
    cosine_sim[start_idx:end_idx] = batch_similarity


In [5]:
def get_content_recommendations(product_id, cosine_sim=cosine_sim):
    try:
        idx = data.index[data['id'] == product_id][0]
    except IndexError:
        print(f"Product ID '{product_id}' not found in the dataset.")
        return None

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Get top 5 similar products
    product_indices = [i[0] for i in sim_scores]
    return data['name'].iloc[product_indices]

# Example usage
print(get_content_recommendations('1'))

Product ID '1' not found in the dataset.
None


In [6]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/recommend-products', methods=['POST'])
def recommend_products():
    data = request.json
    product_id = data['product_id']

    # Call the recommendation function
    recommendations = get_content_recommendations(product_id)

    if recommendations:
        return jsonify({'recommendations': recommendations})
    else:
        return jsonify({'error': f"Product ID '{product_id}' not found."}), 404

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
