<a href="https://colab.research.google.com/github/cicada0521/Finance/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
import joblib
from google.colab import files
import io

# Download the NLTK stopword list that the text-preprocessing step reads.
nltk.download('stopwords')

# Ask the user for a CSV through the Colab upload widget. The result is used
# below as a mapping from uploaded file name to that file's raw bytes.
print("Please upload your sentiment data CSV file...")
uploaded = files.upload()

# A cancelled upload leaves nothing to iterate; fail with a clear message
# instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded; please upload a sentiment data CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
file_name = next(iter(uploaded))
print(f"Successfully uploaded: {file_name}")

# Parse the uploaded bytes as CSV via an in-memory buffer.
dataset = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Show a quick summary so the user can confirm the right file was loaded.
print("\nDataset Information:")
print(f"Shape: {dataset.shape}")
print("\nColumns:", dataset.columns.tolist())
print("\nFirst few rows:")
print(dataset.head())

# The sentiment/label column is taken to be the last column; the training
# step reads its targets from that same position (dataset.iloc[:, -1]).
sentiment_column = dataset.columns[-1]

# Identify the review column name: prefer an explicit 'Review' column,
# otherwise fall back to the first text (object-dtype) column.
review_column = None
if 'Review' in dataset.columns:
    review_column = 'Review'
else:
    # The label column is excluded from the candidates so a text-valued label
    # (or a single-column file) is never mistaken for the review text.
    text_cols = [
        col for col in dataset.columns[:-1]
        if dataset[col].dtype == 'object'
    ]
    if text_cols:
        review_column = text_cols[0]
        print(f"\nUsing '{review_column}' as the review column")

# Check if we have a valid review column
if review_column is None:
    print("Error: Could not identify a review column in your dataset.")
    print("Please make sure your dataset has a text column named 'Review' or similar.")
    raise ValueError("No valid review column found")

# Training on a column that is simultaneously the input text and the label
# would be meaningless, so stop early with an explanation.
if review_column == sentiment_column:
    raise ValueError(
        f"Column '{review_column}' is both the review column and the last "
        "column, which is used as the sentiment label; put the label in the "
        "last column of the CSV."
    )

print(f"Using '{sentiment_column}' as the sentiment column")

# Text preprocessing: turn every review into a string of stemmed,
# lower-cased, stopword-free tokens and collect them in `corpus`.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is kept as a token (presumably so negations such as "not good"
# remain visible to the model).
all_stopwords.remove('not')

# Loop invariants built once: a set gives O(1) membership tests, and the
# pattern matches every character that is not an ASCII letter.
stopword_set = frozenset(all_stopwords)
non_letter_pattern = re.compile(r'[^a-zA-Z]')

corpus = []
# Iterate the column's values directly so the result does not depend on the
# DataFrame having a default 0..n-1 index.
for raw_review in dataset[review_column]:
    # A missing review becomes an empty document rather than the token "nan".
    text = '' if pd.isna(raw_review) else str(raw_review)
    words = non_letter_pattern.sub(' ', text).lower().split()
    stemmed = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(stemmed))

# Bag-of-words features: the vectorizer is capped at 1500 vocabulary entries,
# and the sparse count matrix is expanded to a dense array for the classifier.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()
# Targets are read from the last column of the dataset.
y = dataset.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Fit a Gaussian Naive Bayes classifier on the training portion.
classifier = GaussianNB().fit(X_train, y_train)

# Persist the fitted model, announce it, and hand the file to the browser.
joblib.dump(classifier, 'sentiment_classifier_model.joblib')
print("\nModel trained and saved as 'sentiment_classifier_model.joblib'")
files.download('sentiment_classifier_model.joblib')

# Score the held-out rows and compute both metrics before reporting them.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:")
print(cm)

print(f"\nAccuracy: {accuracy:.4f}")

# Function to predict sentiment for new reviews
def predict_sentiment(new_review):
    """Classify a single piece of text as "Positive" or "Negative".

    The text is cleaned the same way as the training reviews (non-letters
    replaced by spaces, lower-cased, stopwords dropped, Porter-stemmed), then
    vectorized with the module-level ``cv`` and scored by the module-level
    ``classifier``. Also relies on the module-level ``ps`` and
    ``all_stopwords``.

    Args:
        new_review: Text to classify. Non-string values are converted with
            ``str()``, matching how the training reviews are handled.

    Returns:
        ``"Positive"`` when the classifier predicts the label ``1``,
        otherwise ``"Negative"``.
    """
    # Build the lookup set once per call instead of once per word.
    stopword_set = set(all_stopwords)

    cleaned = re.sub('[^a-zA-Z]', ' ', str(new_review)).lower()
    stemmed = [
        ps.stem(word) for word in cleaned.split()
        if word not in stopword_set
    ]

    # The vectorizer expects an iterable of documents, hence the one-item list.
    new_X = cv.transform([' '.join(stemmed)]).toarray()
    new_prediction = classifier.predict(new_X)
    return "Positive" if new_prediction[0] == 1 else "Negative"

# Example usage
print("\nExample predictions:")
print("'This product is amazing! I love it.' →", predict_sentiment("This product is amazing! I love it."))
print("'Terrible experience. Would not recommend.' →", predict_sentiment("Terrible experience. Would not recommend."))

# Persist the fitted CountVectorizer before the blocking prompt loop, so a
# session that is aborted while waiting for input still leaves the vectorizer
# on disk next to the model. The loop does not modify `cv`.
joblib.dump(cv, 'count_vectorizer.joblib')

# Interactive prediction
print("\n--- Sentiment Prediction Tool ---")
print("Type a review to analyze its sentiment (or 'exit' to quit):")

while True:
    try:
        user_input = input("\nEnter text: ")
    except EOFError:
        # No more input is available (e.g. a non-interactive run): stop
        # prompting instead of crashing before the download step.
        break
    entered_text = user_input.strip()
    # Surrounding whitespace is ignored so " exit " also quits.
    if entered_text.lower() == 'exit':
        break
    # Nothing to classify on a blank line; prompt again.
    if not entered_text:
        continue
    result = predict_sentiment(entered_text)
    print(f"Sentiment: {result}")

# Offer the saved CountVectorizer for download for future use
files.download('count_vectorizer.joblib')

print("\nCountVectorizer saved as 'count_vectorizer.joblib'")
print("\nYou can now use both files to make predictions on new data.")

Please upload your sentiment data CSV file...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving sentimentdata.csv to sentimentdata.csv
Successfully uploaded: sentimentdata.csv

Dataset Information:
Shape: (900, 2)

Columns: ['Review', 'Liked']

First few rows:
                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1
Using 'Liked' as the sentiment column

Model trained and saved as 'sentiment_classifier_model.joblib'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Confusion Matrix:
[[68 10]
 [50 52]]

Accuracy: 0.6667

Example predictions:
'This product is amazing! I love it.' → Negative
'Terrible experience. Would not recommend.' → Negative

--- Sentiment Prediction Tool ---
Type a review to analyze its sentiment (or 'exit' to quit):

Enter text: Not tasty and the texture was just nasty
Sentiment: Negative

Enter text: exit


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


CountVectorizer saved as 'count_vectorizer.joblib'

You can now use both files to make predictions on new data.
