In [1]:
import pandas as pd
from textblob import TextBlob

In [2]:
# Show full cell contents instead of truncating long text columns in DataFrame previews
pd.options.display.max_colwidth = None

# Data

In [3]:
# Load the tweets dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/Tweets.csv')
# Preview 10 random rows (no random_state is passed, so the rows shown differ between runs).
df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment
462,7e66ca1a42,but you always have lee. Let`s go to Paris,but you always have lee. Let`s go to Paris,neutral
15775,fd987ed435,Surgery.,Surgery.,neutral
24340,f6f69e5073,RIP Michael,RIP Michael,negative
2475,5a345a2602,"Adem in, adem uit","Adem in, adem uit",neutral
21430,a92db139c6,one of them is,one of them is,neutral
16634,64c8534abc,Feel sorry for Adam Cook. Be strong for David and family,sorry,negative
18921,173b6b6ee3,at the store! Lol I don`t have any liquour here,at the store! Lol I don`t have any liquour here,neutral
17996,32f4183741,Horrid dream. I suspect I will have to cancel my plans tonight...,Horrid dream.,negative
4930,b924039252,my sister is a douchebag,douchebag,negative
5560,9dd013a94b,will have to wait on the recipe at Simply Recipes. Sorries!,. Sorrie,negative


# Data Cleaning

In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Lazily-filled cache: the lemmatizer and stop-word set are built once on first
# use instead of once per cleaned tweet.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _CLEANING_RESOURCES:
        # Build both before storing so a failure cannot leave a half-filled cache.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        _CLEANING_RESOURCES.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _CLEANING_RESOURCES['lemmatizer'], _CLEANING_RESOURCES['stop_words']


def clean_text(text):
    """Normalize a raw tweet into a space-separated string of distinct tokens.

    Steps, in order: lowercase; strip URLs, ``@user`` mentions and ``#hashtag``
    tokens; replace every non-letter with a space; lemmatize each token; keep
    only lemmas longer than 3 characters that are not English stop words; drop
    repeated lemmas while preserving first-occurrence order.

    Note that the length filter also removes short words such as "not" or
    "bad", so negation and some sentiment-bearing words do not survive.

    Parameters
    ----------
    text : Any
        The raw text. Anything that is not a ``str`` (``None``, NaN, numbers)
        is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove usernames starting with '@'
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags starting with '#'
    text = re.sub(r'#\w+', '', text)
    # Replace non-alphabetic characters with spaces
    text = re.sub('[^a-zA-Z]', ' ', text)

    lemmatizer, stop_words = _get_cleaning_resources()

    # Single pass: lemmatize, filter by length and stop words, and de-duplicate.
    # A dict is used as an ordered set so first-occurrence order is kept.
    kept = {}
    for word in text.split():
        lemma = lemmatizer.lemmatize(word)
        if len(lemma) > 3 and lemma not in stop_words:
            kept.setdefault(lemma, None)

    return ' '.join(kept)

In [5]:
# Apply clean_text to the raw 'text' column and store the result in a new 'clean_text' column
df['clean_text'] = df['text'].apply(clean_text)
# Show raw and cleaned text side by side for 10 random rows
df[['text', 'clean_text']].sample(10)

Unnamed: 0,text,clean_text
9908,having my hair dyed today ugh im bored. still tired from friday lol. swear down bossman ;),hair dyed today bored still tired friday swear bossman
10936,Going to workout + swin... fun,going workout swin
17969,Burp the Frog http://is.gd/rae9,burp frog
22393,I do the same thing to anybody covering dave... haha... we don`t like people messing with perfect music.,thing anybody covering dave haha like people messing perfect music
509,Nyappy mother`s day to your mom`s.,nyappy mother
12332,"on myyearbook, myspace, here and messenger",myyearbook myspace messenger
19573,"No, it`s people you are recommending that others follow, like followfriday, only with pics!",people recommending others follow like followfriday
26796,horseback riding,horseback riding
22060,jerk josh! didn`t even come meet me - im thinking of a number guess ?,jerk josh even come meet thinking number guess
19976,Loads of Beard papas have disappeared in the UK too,load beard papa disappeared


# Sentiment Analysis

In [6]:
def get_sentiment(text):
    """Return the TextBlob sentiment polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score each cleaned tweet with get_sentiment and round the polarity to 2 decimal places.
# NOTE(review): scoring runs on 'clean_text', from which clean_text has already removed
# stop words and every word of 3 characters or fewer (e.g. "not", "bad"); consider
# scoring the raw 'text' column instead if negation should influence the label.
df['textblob_polarity'] = df['clean_text'].apply(get_sentiment).round(2)

def categorize_sentiment(score):
    """Map a polarity score to one of five sentiment categories.

    Thresholds (checked from most positive to most negative):
      score >= 0.5           -> 'Positive'
      0.05 <= score < 0.5    -> 'Moderately Positive'
      -0.05 < score < 0.05   -> 'Neutral'
      -0.5 < score <= -0.05  -> 'Moderately Negative'
      anything else          -> 'Negative' (this includes NaN, which fails every comparison)
    """
    # Each guard only needs its lower bound: the upper bound is already
    # implied by the preceding guards having failed.
    if score >= 0.5:
        return 'Positive'
    if score >= 0.05:
        return 'Moderately Positive'
    if score > -0.05:
        return 'Neutral'
    if score > -0.5:
        return 'Moderately Negative'
    return 'Negative'

# Map each rounded polarity score to its category label via categorize_sentiment
df['sentiment_textblob'] = df['textblob_polarity'].apply(categorize_sentiment)

# Preview 10 random rows of the cleaned text with its score and category
df[['clean_text', 'textblob_polarity', 'sentiment_textblob']].sample(10)

Unnamed: 0,clean_text,textblob_polarity,sentiment_textblob
15569,thought wallace gromit team behind monkey island could combined disastrously,-0.38,Moderately Negative
5078,wish school year would faster move life,0.0,Neutral
1527,always pretty athletic especially love ball anyway yeah,0.62,Positive
12820,suck went jail couldnt tweet anymore,-0.1,Moderately Negative
4552,ummm didnt work guess stuck uglyonee,0.0,Neutral
9902,updating live benihana tokyo waikiki happy birthday mark,0.47,Moderately Positive
413,sure hope becomes afternoon,0.5,Positive
2040,finally shifted twhirl tweetdeck filter close friend update happy monday peep,0.4,Moderately Positive
14458,girl aidan,0.0,Neutral
16903,reading book sunshine goona good,0.7,Positive


In [7]:
# Drop rows with missing values, modifying df in place (runs after the derived columns were added)
df.dropna(inplace=True)

In [8]:
# Summarize the columns, non-null counts and dtypes of df after the dropna step
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   textID              27480 non-null  object 
 1   text                27480 non-null  object 
 2   selected_text       27480 non-null  object 
 3   sentiment           27480 non-null  object 
 4   clean_text          27480 non-null  object 
 5   textblob_polarity   27480 non-null  float64
 6   sentiment_textblob  27480 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.7+ MB


In [9]:
# Count how many rows fall into each TextBlob-derived sentiment category
sentiment_counts = df['sentiment_textblob'].value_counts()
sentiment_counts

Neutral                12071
Moderately Positive     6594
Positive                4433
Moderately Negative     2923
Negative                1459
Name: sentiment_textblob, dtype: int64

# Balancing Sentiment Data

In [10]:
from imblearn.over_sampling import RandomOverSampler

# Extract the sentiment labels and the cleaned text (the only feature) from df
labels = df['sentiment_textblob']
features = df['clean_text']

# Create the oversampler; random_state is fixed so the resampling is repeatable
oversampler = RandomOverSampler(random_state=42)

# Oversample so every class ends up with the same number of rows.
# fit_resample is given a 2-D input, hence the reshape of the text column to a single-column array.
# NOTE(review): this runs before the train_test_split performed later in the notebook, so
# rows duplicated here can land in both the training and the test split; the reported
# test metrics are then likely optimistic. Consider splitting first and oversampling
# only the training portion.
features_balanced, labels_balanced = oversampler.fit_resample(features.values.reshape(-1, 1), labels)

# Flatten the single-column array back to 1-D and rebuild a DataFrame with the balanced data
balanced_df = pd.DataFrame({'clean_text': features_balanced.flatten(), 'sentiment_textblob': labels_balanced})

# Display the balanced sentiment counts
balanced_sentiment_counts = balanced_df['sentiment_textblob'].value_counts()
print(balanced_sentiment_counts)

Neutral                12071
Positive               12071
Moderately Positive    12071
Negative               12071
Moderately Negative    12071
Name: sentiment_textblob, dtype: int64


# Feature Extraction

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix


# Take the text and labels from the oversampled frame 'balanced_df'
# (this rebinds the names 'features' and 'labels' used by the oversampling cell)
features = balanced_df['clean_text']
labels = balanced_df['sentiment_textblob']

# TF-IDF vectorizer over unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vocabulary/IDF weights and transform the text in one step.
# NOTE(review): the vectorizer is fitted on every balanced row, including rows that the
# later train_test_split assigns to the test set.
features_transformed = vectorizer.fit_transform(features)

# Wrap the result as a CSR matrix
# NOTE(review): fit_transform presumably already returns a CSR matrix, making this a no-op — confirm
features_sparse = csr_matrix(features_transformed)

# Build a sparse-backed DataFrame with one column per vocabulary term
features_df = pd.DataFrame.sparse.from_spmatrix(features_sparse, columns=vectorizer.get_feature_names_out())

# Append the label column; concat with axis=1 aligns the two objects on their index
data = pd.concat([features_df, labels], axis=1)

# Display the first rows of the combined data
print(data.head())


   aaaa  aaaa cant  aaaa need  aaaaaaaaaaa  aaaaaaaaaaa mcfly  \
0   0.0        0.0        0.0          0.0                0.0   
1   0.0        0.0        0.0          0.0                0.0   
2   0.0        0.0        0.0          0.0                0.0   
3   0.0        0.0        0.0          0.0                0.0   
4   0.0        0.0        0.0          0.0                0.0   

   aaaaaaaaaahhhhhhhh  aaaaaaaaaahhhhhhhh gonna  aaaaaaaaaamazing  \
0                 0.0                       0.0               0.0   
1                 0.0                       0.0               0.0   
2                 0.0                       0.0               0.0   
3                 0.0                       0.0               0.0   
4                 0.0                       0.0               0.0   

   aaaaaaaaaamazing trip  aaaaaaaafternoon  ...  zyrtec sleep  zzzz  \
0                    0.0               0.0  ...           0.0   0.0   
1                    0.0               0.0  ...     

In [12]:
import pickle

# Save the fitted TF-IDF vectorizer (not the classifier) so the same vocabulary can be reused at inference time
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

# Splitting the Data

In [13]:
from sklearn.model_selection import train_test_split

# Separate the combined 'data' frame back into the TF-IDF feature columns and the label column
features = data.drop('sentiment_textblob', axis=1)  # every column except the sentiment label
labels = data['sentiment_textblob']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): 'data' was oversampled before this split, so the test set may contain
# copies of training rows.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

# Print the shapes of the split datasets
print("Train Features Shape:", train_features.shape)
print("Train Labels Shape:", train_labels.shape)
print("Test Features Shape:", test_features.shape)
print("Test Labels Shape:", test_labels.shape)


Train Features Shape: (48284, 116930)
Train Labels Shape: (48284,)
Test Features Shape: (12071, 116930)
Test Labels Shape: (12071,)


# Model Selection

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Create a Multinomial Naive Bayes classifier with default hyperparameters
nb_classifier = MultinomialNB()

# Fit it on the TF-IDF training features and their sentiment labels
nb_classifier.fit(train_features, train_labels)

# Predict labels for the held-out test features
predictions = nb_classifier.predict(test_features)

# Print per-class precision, recall and F1 on the test split
print(classification_report(test_labels, predictions))

                     precision    recall  f1-score   support

Moderately Negative       0.86      0.98      0.92      2493
Moderately Positive       0.87      0.88      0.87      2436
           Negative       0.85      1.00      0.92      2356
            Neutral       0.96      0.52      0.67      2368
           Positive       0.87      0.97      0.91      2418

           accuracy                           0.87     12071
          macro avg       0.88      0.87      0.86     12071
       weighted avg       0.88      0.87      0.86     12071



# Model Training

In [15]:
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes classifier with default hyperparameters.
# NOTE(review): this re-creates and re-fits the same classifier on the same data as the
# "Model Selection" cell above; one of the two cells is redundant.
nb_classifier = MultinomialNB()

# Fit it on the TF-IDF training features and their sentiment labels
nb_classifier.fit(train_features, train_labels)


# Model Evaluation

In [16]:
from sklearn.metrics import classification_report

# Predict labels for the held-out test features
# (the same predict/report steps already appear in the "Model Selection" cell)
predictions = nb_classifier.predict(test_features)

# Print per-class precision, recall and F1 on the test split
print(classification_report(test_labels, predictions))


                     precision    recall  f1-score   support

Moderately Negative       0.86      0.98      0.92      2493
Moderately Positive       0.87      0.88      0.87      2436
           Negative       0.85      1.00      0.92      2356
            Neutral       0.96      0.52      0.67      2368
           Positive       0.87      0.97      0.91      2418

           accuracy                           0.87     12071
          macro avg       0.88      0.87      0.86     12071
       weighted avg       0.88      0.87      0.86     12071



# Fine-tuning and Optimization

1. Hyperparameter Tuning
2. Feature Engineering
3. Model Selection
4. Data Augmentation

# Prediction

In [17]:
from sklearn.metrics import classification_report

# Run the new sentence through clean_text before vectorizing: the vectorizer was
# fitted on the 'clean_text' column, so raw text would be tokenized differently
# from the training data.
new_data = "this is my first time to make sentiment analyzer"
new_data_features = vectorizer.transform([clean_text(new_data)])

# Predict the sentiment category with the trained Naive Bayes classifier
predictions = nb_classifier.predict(new_data_features)



In [18]:
# Display the predicted label array for the sentence above
predictions

array(['Moderately Positive'], dtype='<U19')

# Save the Model

In [19]:
import pickle

# Save the trained Naive Bayes classifier to disk in the current working directory
with open('naive_bayes_model.pkl', 'wb') as file:
    pickle.dump(nb_classifier, file)

# Load the Model

In [20]:
import pickle

# Load the saved classifier together with the TF-IDF vectorizer that was pickled
# earlier in this notebook, so inference does not depend on in-memory training state.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open('naive_bayes_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

# Get user input for the text
new_sentence = input("Enter the text for sentiment prediction: ")

# Apply the same cleaning that was applied to the training text
cleaned_sentence = clean_text(new_sentence)

# Vectorize the cleaned sentence with the reloaded vectorizer
new_sentence_features = loaded_vectorizer.transform([cleaned_sentence])

# Make predictions with the reloaded classifier
predictions = loaded_model.predict(new_sentence_features)

# Print the user input and predicted sentiment
print("Your sentence:", new_sentence)
print("Predicted sentiment:", predictions)


Enter the text for sentiment prediction: hello
Your sentence: hello
Predicted sentiment: ['Neutral']
