In [None]:
# Sentiment Analysis with NLP
# TF-IDF + Logistic Regression Pipeline

# 1. Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import string
import joblib
import os
import nltk

# Fetch the NLTK data packages this notebook relies on (the stop-word corpus and
# WordNet data, plus omw-1.4). quiet=True keeps the downloader's output silent.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

print("All libraries imported successfully!")

In [None]:
# 2. Load Dataset
# Try to read the hosted tab-separated file into two columns (review, sentiment).
# If anything goes wrong (e.g. no network), report the error and fall back to a
# small in-memory sample so the remaining cells can still be executed.
print("Downloading dataset...")
dataset_url = "https://raw.githubusercontent.com/tristanga/sentiment-analysis/master/amazon_cells_labelled.txt"

try:
    df = pd.read_csv(dataset_url, sep='\t', header=None, names=['review', 'sentiment'])
    print(f"Dataset loaded with {len(df)} reviews\n")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Using backup dataset...")
    # The backup has to survive the stratified 80/20 split performed later in
    # this notebook: the test fold must hold at least one row per class, so we
    # need >= 10 rows with both labels well represented. Five rows (the previous
    # size) produced a one-row test fold and a ValueError from train_test_split.
    # Labels: 1 = positive, 0 = negative. Only clearly polarized reviews are
    # used so that no neutral text is mislabelled.
    data = {
        'review': [
            "This product is amazing! I love it",
            "Terrible experience, would not buy again",
            "Great sound and very easy to set up",
            "Worth every penny, excellent quality",
            "Broken on arrival, very disappointed",
            "Works perfectly and the battery lasts all day",
            "Stopped working after a week, waste of money",
            "Fantastic value, highly recommend it",
            "Poor build quality and awful customer service",
            "The charger never worked, do not buy"
        ],
        'sentiment': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0]
    }
    df = pd.DataFrame(data)
    print("Created backup dataset with sample reviews")

# Preview data
print("First 5 reviews:")
print(df.head())

In [None]:
# 3. Text Preprocessing
# Shared resources for clean_text(): a WordNet lemmatizer and NLTK's English
# stop-word list, stored as a set for membership tests.
print("\nCleaning text data...")
lemmatizer = nltk.stem.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

# Built once rather than on every call. Apostrophes are deleted so contractions
# stay a single token ("don't" -> "dont"); every other punctuation character is
# turned into a space so neighbouring words are not fused ("well-made" ->
# "well made" instead of "wellmade").
_PUNCT_TABLE = str.maketrans(
    {ch: (None if ch == "'" else ' ') for ch in string.punctuation}
)


def clean_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; remove URLs starting with ``http``; replace
    HTML-style ``<...>`` tags and punctuation with spaces (apostrophes are
    dropped); remove digit runs; collapse whitespace; drop tokens found in the
    module-level ``stop_words``; lemmatize the rest with the module-level
    ``lemmatizer``.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            NaN produced by a blank cell) is treated as an empty review.

    Returns:
        The cleaned text as a single string; ``''`` when nothing survives.
    """
    # Guard against missing values: calling .lower() on a float NaN would raise.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    # Tags become a space so "great<br>phone" does not collapse to "greatphone".
    text = re.sub(r'<.*?>', ' ', text)
    text = text.translate(_PUNCT_TABLE)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # NOTE(review): the stop-word list is applied as-is; confirm whether it
    # removes negation words that matter for sentiment.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Run the cleaner over every review and keep the result in its own column.
df['cleaned_review'] = df['review'].map(clean_text)

# Print up to three before/after pairs as a quick sanity check.
print("\nSample cleaned reviews:")
for original, cleaned in zip(df['review'].head(3), df['cleaned_review'].head(3)):
    print(f"Original: {original}")
    print(f"Cleaned: {cleaned}\n")

In [None]:
# 4. TF-IDF Vectorization
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2)
)
tfidf_features = tfidf.fit_transform(df['cleaned_review'])
print(f"Created {tfidf_features.shape[1]} features from reviews")

In [None]:
# 5. Train-Test Split
X = tfidf_features
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

In [None]:
# 6. Train Logistic Regression Model
# Hyperparameters are gathered in one dict and passed through unchanged.
print("\nTraining Logistic Regression model...")
logreg_params = {
    'max_iter': 1000,
    'solver': 'lbfgs',
    'random_state': 42,
}
model = LogisticRegression(**logreg_params).fit(X_train, y_train)
print("Model training completed!")

In [None]:
# 7. Model Evaluation
# Predict on the held-out split, then report accuracy and the per-class report.
y_pred = model.predict(X_test)

print("\nModel Evaluation:", "-----------------", sep="\n")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 8. Show Important Features
# Pair each TF-IDF term with its logistic-regression coefficient, then list the
# ten largest coefficients ("positive" words) and the ten smallest ("negative").
feature_names = tfidf.get_feature_names_out()
coef = model.coef_[0]
weighted_terms = list(zip(feature_names, coef))

positive_words = sorted(weighted_terms, key=lambda pair: pair[1], reverse=True)[:10]
negative_words = sorted(weighted_terms, key=lambda pair: pair[1])[:10]

for heading, ranked_terms in (
    ("\nTop Positive Words:", positive_words),
    ("\nTop Negative Words:", negative_words),
):
    print(heading)
    for word, score in ranked_terms:
        print(f"{word}: {score:.4f}")

In [None]:
# 9. Save Model and Create Folder Structure
print("\nSaving model artifacts...")
os.makedirs('models', exist_ok=True)

# Placeholder file so the folder structure is preserved in the repository.
with open('models/.gitkeep', 'w') as placeholder:
    placeholder.write('Preserving folder structure for GitHub')

# Persist the fitted classifier and the fitted vectorizer side by side.
for artifact, destination in (
    (model, 'models/sentiment_model.pkl'),
    (tfidf, 'models/tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, destination)
print("Model saved to 'models/' directory")

In [None]:
# 10. Generate Requirements
# requirements.txt is assembled by hand because !pip commands might not work in
# all environments. One pinned package per line, no trailing newline.
pinned_packages = [
    "pandas==2.2.1",
    "scikit-learn==1.4.2",
    "nltk==3.8.1",
    "joblib==1.3.2",
    "numpy==1.26.4",
    "matplotlib==3.8.3",
]
requirements = "\n".join(pinned_packages)

with open('requirements.txt', 'w') as req_file:
    req_file.write(requirements)

print("\nrequirements.txt generated")
print("\nAll tasks completed successfully!")