Requirements

In [77]:
# %pip install gensim
# %pip install pandas==1.5.3
# %pip install scikit-learn==1.0.2
# %pip install gensim==4.1.2
# %pip install nltk==3.6.7
# %pip install numpy==1.21.5

Step 1: Import Required Libraries

In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split    # for 2 split - train test
from sklearn.feature_extraction.text import CountVectorizer  # For bag of words
from sklearn.naive_bayes import MultinomialNB  # Naive bayes models
from sklearn.model_selection import GridSearchCV        # For Hyperparameters(finding best params)
from sklearn.metrics import accuracy_score, classification_report # for evaluation
import re # regualr expression for removing punctuations and stuff
from nltk.corpus import stopwords       # for stopwords removal
from nltk.stem import WordNetLemmatizer # For lemmatization
# from sklearn.feature_extraction.text import TfidfVectorizer  # For TF-IDF vectoriser
# from nltk.stem import PorterStemmer     # For stemming
# from gensim.models import Word2Vec      # For Word2vec neural network based vectorizer

Step 2: Load the data (In my case CSV)

In [None]:
# Location of the raw sentiment dataset (a CSV with 'text' and 'label' columns)
csv_path = "/home/GitHub/text_summarisation_trad_NLP/sentiments.csv"
df = pd.read_csv(csv_path)

# Preview the first rows of the uncleaned data: should show the 'text' and 'label' columns
raw_preview = df.head()
print(raw_preview)

                                               text     label
0             Enjoying a beautiful day at the park!  positive
1                Just finished an amazing workout!   positive
2       Excited about the upcoming weekend getaway!  positive
3   Feeling grateful for the little things in life.  positive
4  Rainy days call for cozy blankets and hot cocoa.  positive


Step 3: Data Cleaning > Lowercasing, punctuation and number removal, stopwords removal, lemmatization

In [80]:
# Build the English stop-word collection once, as a set, so clean_text can filter
# out filler words (e.g. "the", "of") with a simple membership test
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [81]:
# Initialize the WordNet lemmatizer that clean_text uses to reduce each token
# to its base form (e.g. "things" -> "thing", "blankets" -> "blanket")
lemmatizer = WordNetLemmatizer()

In [82]:
# Function to clean text
def clean_text(text):
    """Normalize one raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase, remove every character that is not ``a-z`` or whitespace
    (punctuation, digits, non-ASCII characters), split on whitespace, drop the
    tokens found in the module-level ``stop_words`` set and lemmatize the rest
    with the module-level ``lemmatizer``.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            what pandas yields for an empty CSV cell) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercase the text
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize and remove stop words
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [83]:
# Run every raw text through clean_text and store the result in a new column,
# leaving the original 'text' column untouched for comparison
df['cleaned_text'] = df['text'].map(clean_text)

Print the cleaned data for us to compare

In [84]:
# Preview the first cleaned rows to compare them with the raw text printed earlier
cleaned_preview = df['cleaned_text'].head()
print(cleaned_preview)

0              enjoying beautiful day park
1                 finished amazing workout
2         excited upcoming weekend getaway
3       feeling grateful little thing life
4    rainy day call cozy blanket hot cocoa
Name: cleaned_text, dtype: object


Step 4: Vectorize the cleaned text using Bag of Words (CountVectorizer)

In [85]:
# Bag-of-words vectorizer: converts each cleaned text into a row of token counts.
# NOTE: despite its name, `tfidf` is a CountVectorizer (raw counts), not a TF-IDF vectorizer.
# It additionally drops scikit-learn's built-in English stop words and keeps at most 5000 terms.
tfidf = CountVectorizer(stop_words='english', max_features=5000)

# NOTE(review): the vocabulary is learned from the full dataset here, i.e. before the
# train/test split in the next step — consider fitting on the training portion only.
X = tfidf.fit_transform(df['cleaned_text'])   # Features: document-term count matrix
y = df['label']                       # Target: the sentiment label of each row

Step 5: Train test split

In [86]:
# Hold out 30% of the rows for testing (70% train / 30% test); the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)

Step 6 : Train the Naive Bayes Classifier

In [87]:
# Define the parameter grid > For MultinomialNB
# - alpha: additive smoothing strength
# - class_prior: explicit class priors; the 3-element lists assume exactly three classes,
#   in scikit-learn's sorted label order (here: negative, neutral, positive)
# - fit_prior: only has an effect when class_prior is None, so the combinations with an
#   explicit class_prior are evaluated twice with identical results
param_grid = {
    'class_prior': [None, [0.3, 0.4, 0.3], [0.2, 0.5, 0.3]],
    'alpha': [1.0, 0.5, 0.1],
    'fit_prior': [True, False]
}

# Create a GridSearchCV object: 5-fold cross-validation over the training split,
# using Multinomial Naive Bayes (a model designed for count features) as the estimator
grid_search = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, cv=5)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# GridSearchCV refits the best parameter combination on the whole training split by
# default (refit=True), so reuse that fitted estimator instead of training it again.
model = grid_search.best_estimator_

Best parameters found:  {'alpha': 0.1, 'class_prior': [0.3, 0.4, 0.3], 'fit_prior': True}


Step 7: Evaluate the model

In [88]:
# Score the tuned model on the held-out test split
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", per_class_report)

Accuracy: 0.85
Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.85      0.84       112
     neutral       0.81      0.78      0.80       111
    positive       0.89      0.91      0.90       137

    accuracy                           0.85       360
   macro avg       0.85      0.85      0.85       360
weighted avg       0.85      0.85      0.85       360



Some sample texts I have written to test the model

In [89]:
# Single hand-written example; expected sentiment: positive.
# Kept as a one-element list because the vectorizer is given a list of documents.
sample_text = [
    "Wow this spa really turned out to real good",
]

More focused sentiments: mixed sentiments, sarcasm, Gen Z slang, extreme sentiments

In [90]:
# Hand-written sentences used to spot-check the trained model.
# The group comments give the sentiment the author intended for each group; they are
# not used by the code.
sample_texts = [
    # Positive
    "This coffee shop has the most amazing atmosphere and perfect latte art!",
    "I'm absolutely loving how productive I've been this week - everything's falling into place!",
    "The sunset tonight was breathtaking, all pink and gold over the water.",

    # Neutral
    "The meeting has been rescheduled to Thursday at 2pm in Conference Room B.",
    "My grocery list contains milk, eggs, bread, and vegetables.",
    "The train to Chicago departs at 3:15pm from Platform 4.",

    # Negative
    # (the last two appear to be sarcastic: positive-sounding words, negative intent)
    "My flight got delayed 6 hours and now I'm stuck sleeping in this awful airport chair.",
    "The so-called 'fresh' fish I ordered tasted like it was caught last month.",
    "Nothing puts me in a great mood like getting 50 spam emails before breakfast.",

    # Mixed tone (positive start, negative ending)
    "The concert started great until someone spilled beer all over my new jacket.",
    "Loved the hotel room, but the construction noise started at 6am sharp.",

    # Long/complex examples
    # NOTE(review): the quoted phrase in the first string below has no closing quote
    "After waiting 45 minutes past my appointment time in a crowded waiting room with screaming children, the dentist told me they'd have to reschedule because they 'ran out of time",
    "The package arrived right on schedule and in perfect condition, with everything exactly as described - a rare and wonderful online shopping experience!",
    "According to the weather report, there's a 60% chance of rain this afternoon, with temperatures peaking around 72°F before dropping in the evening."
]

Predict for a batch of samples

In [91]:
# Clean every sample with the same clean_text() preprocessing that was applied to the
# training data; the vectorizer's vocabulary was built from cleaned (lemmatized,
# punctuation-free) text, so raw sentences would not line up with it.
cleaned_samples = [clean_text(text) for text in sample_texts]

# Vectorize with the fitted bag-of-words vectorizer and classify the whole batch at once
sample_tfidf = tfidf.transform(cleaned_samples)
batch_predictions = model.predict(sample_tfidf)

# Print each original (uncleaned) sentence next to its predicted label
for text, sample_prediction in zip(sample_texts, batch_predictions):
    print(f"Text: '{text}' => Prediction: '{sample_prediction}'")

Text: 'This coffee shop has the most amazing atmosphere and perfect latte art!' => Prediction: 'positive'
Text: 'I'm absolutely loving how productive I've been this week - everything's falling into place!' => Prediction: 'positive'
Text: 'The sunset tonight was breathtaking, all pink and gold over the water.' => Prediction: 'positive'
Text: 'The meeting has been rescheduled to Thursday at 2pm in Conference Room B.' => Prediction: 'neutral'
Text: 'My grocery list contains milk, eggs, bread, and vegetables.' => Prediction: 'neutral'
Text: 'The train to Chicago departs at 3:15pm from Platform 4.' => Prediction: 'neutral'
Text: 'My flight got delayed 6 hours and now I'm stuck sleeping in this awful airport chair.' => Prediction: 'negative'
Text: 'The so-called 'fresh' fish I ordered tasted like it was caught last month.' => Prediction: 'neutral'
Text: 'Nothing puts me in a great mood like getting 50 spam emails before breakfast.' => Prediction: 'positive'
Text: 'The concert started great u

Transform the single sample text with the same fitted bag-of-words vectorizer (CountVectorizer) that was used for training

In [92]:
# Apply the same clean_text() preprocessing used on the training data before vectorizing:
# the vectorizer's vocabulary was built from cleaned (lemmatized) text, so raw text
# would miss tokens such as plural forms.
sample_tfidf = tfidf.transform([clean_text(text) for text in sample_text])

Make predictions using the trained model

In [93]:
# Predict the sentiment label for the vectorized sample (one prediction per input row)
sample_predictions = model.predict(sample_tfidf)

Print the prediction

In [94]:
# Show the original sample next to the label the model assigned to it
original_text, predicted_label = sample_text[0], sample_predictions[0]
print(f"Text: '{original_text}'")
print(f"Predicted Sentiment: '{predicted_label}'")

Text: 'Wow this spa really turned out to real good'
Predicted Sentiment: 'positive'


In [None]:
import pickle


def save_pickle(obj, path):
    """Serialize ``obj`` to the file at ``path`` with pickle (opened in 'wb' mode)."""
    with open(path, 'wb') as file:
        pickle.dump(obj, file)


# Save the trained classifier
model_path = '/home/GitHub/text_summarisation_trad_NLP/sentiment_model.pkl'
save_pickle(model, model_path)
print(f"Model saved as {model_path}")

# Save the fitted vectorizer as well: new text has to be transformed with the same
# vocabulary before the saved model can score it
vectorizer_path = '/home/GitHub/text_summarisation_trad_NLP/vectorizer.pkl'
save_pickle(tfidf, vectorizer_path)
print(f"Vectorizer saved as {vectorizer_path}")

Model saved as /home/sakhaglobal/Documents/GitHub/text_summarisation_trad_NLP/sentiment_model.pkl
Vectorizer saved as /home/sakhaglobal/Documents/GitHub/text_summarisation_trad_NLP/vectorizer.pkl
