In [40]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [41]:
df = pd.read_csv("/content/drive/MyDrive/Sentiment Analysis/twitter_training.csv", header=None)
df.columns = ["id", "topic", "sentiment", "text"]

# Use only sentiment + text
df = df[["sentiment", "text"]]

print("Sample Data:")
print(df.head())


Sample Data:
  sentiment                                               text
0  Positive  im getting on borderlands and i will murder yo...
1  Positive  I am coming to the borders and I will kill you...
2  Positive  im getting on borderlands and i will kill you ...
3  Positive  im coming on borderlands and i will murder you...
4  Positive  im getting on borderlands 2 and i will murder ...


In [42]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove links, mentions, hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|\#','', text)
    # Remove punctuation & numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize and remove stopwords + lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(tokens)

df["clean_text"] = df["text"].astype(str).apply(clean_text)

In [43]:
X = df["clean_text"]
y = df["sentiment"]

vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42, stratify=y)

In [44]:
# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

# SVM
svm = LinearSVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

In [45]:
print("\n=== Naive Bayes Report ===")
print(classification_report(y_test, nb_pred))
print("Accuracy:", accuracy_score(y_test, nb_pred))

print("\n=== SVM Report ===")
print(classification_report(y_test, svm_pred))
print("Accuracy:", accuracy_score(y_test, svm_pred))

print("\n=== KNN Report ===")
print(classification_report(y_test, knn_pred))
print("Accuracy:", accuracy_score(y_test, knn_pred))


=== Naive Bayes Report ===
              precision    recall  f1-score   support

  Irrelevant       0.73      0.35      0.47      2598
    Negative       0.63      0.80      0.70      4509
     Neutral       0.66      0.55      0.60      3664
    Positive       0.62      0.73      0.67      4166

    accuracy                           0.64     14937
   macro avg       0.66      0.61      0.61     14937
weighted avg       0.65      0.64      0.63     14937

Accuracy: 0.6408247974827609

=== SVM Report ===
              precision    recall  f1-score   support

  Irrelevant       0.71      0.57      0.63      2598
    Negative       0.72      0.80      0.76      4509
     Neutral       0.70      0.66      0.68      3664
    Positive       0.70      0.73      0.72      4166

    accuracy                           0.71     14937
   macro avg       0.71      0.69      0.70     14937
weighted avg       0.71      0.71      0.70     14937

Accuracy: 0.7073040101760728

=== KNN Report ===
    

In [48]:
def predict_sentiment(text):
    clean = clean_text(text)
    vec = vectorizer.transform([clean])
    pred = svm.predict(vec)[0]  # Using SVM as final model
    return pred

print("\nReal-time Test Predictions:")
print(predict_sentiment(input()))


Real-time Test Predictions:
this was the worst product i have bought
Negative


In [49]:
import joblib

# Save the trained model (using SVM here, but you can choose Naive Bayes or KNN too)
joblib.dump(svm, "sentiment_model.joblib")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "vectorizer.joblib")

print("✅ Model and vectorizer saved successfully!")


✅ Model and vectorizer saved successfully!
