In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from textblob import TextBlob

# Load the dataset into a pandas DataFrame
data = pd.read_csv('Amazonin1.csv')

# Remove unnecessary columns and missing values
data = data[['Tweet', 'Likes', 'Retweets', 'Language']]

# Preprocess the text data
nltk.download('stopwords')
stopwords = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.split()
    text = [word for word in text if word not in stopwords]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

data['Tweet'] = data['Tweet'].apply(preprocess_text)

# Define the function to get the sentiment of a tweet
def get_sentiment(tweet):
    analysis = TextBlob(tweet)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

data['Sentiment'] = data['Tweet'].apply(get_sentiment)


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Tweet'], data['Sentiment'], test_size=0.2, random_state=42)

# Convert the raw tweets to bag-of-words vectors
vectorizer = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training data
model = MultinomialNB()
model.fit(X_train_counts, y_train)

# Evaluate the model on the testing data
y_pred = model.predict(X_test_counts)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {recall:.4f}\n')




[nltk_data] Downloading package stopwords to C:\Users\Hp 840
[nltk_data]     G5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.5970 | Precision: 0.5895 | Recall: 0.5970

