In [1]:
import pandas as pd
from textblob import TextBlob

In [2]:
# Lift the column-width limit so long tweet text is displayed in full rather than truncated
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the tweets dataset from CSV into `df` and preview its last 10 rows
df = pd.read_csv('../data/Tweets.csv')
df.tail(10)

Unnamed: 0,textID,text,selected_text,sentiment
27471,15bb120f57,"i`m defying gravity. and nobody in alll of oz, no wizard that there is or was, is ever gonna bring me down","i`m defying gravity. and nobody in alll of oz, no wizard that there is or was, is ever gonna bring me down",neutral
27472,8f5adc47ec,http://twitpic.com/663vr - Wanted to visit the animals but we were too late,were too late,negative
27473,a208770a32,in spoke to you yesterday and u didnt respond girl wassup though!,in spoke to you yesterday and u didnt respond girl wassup though!,neutral
27474,8f14bb2715,So I get up early and I feel good about the day. I walk to work and I`m feeling alright. But guess what... I don`t work today.,I feel good ab,positive
27475,b78ec00df5,enjoy ur night,enjoy,positive
27476,4eac33d1c0,wish we could come see u on Denver husband lost his job and can`t afford it,d lost,negative
27477,4f4c4fc327,"I`ve wondered about rake to. The client has made it clear .NET only, don`t force devs to learn a new lang #agile #ccnet",", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - you probably need it after such hectic weekend Take care hun xxxx,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive
27480,6f7127d9d7,All this flirting going on - The ATG smiles. Yay. ((hugs)),All this flirting going on - The ATG smiles. Yay. ((hugs),neutral


In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NLTK resources shared by every clean_text call; filled on first use.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (lemmatizer, stop-word set) pair, building it only once."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _CLEAN_TEXT_RESOURCES['lemmatizer'], _CLEAN_TEXT_RESOURCES['stop_words']


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of unique content words.

    Steps, in order: lowercase, strip URLs, replace every non-letter with a
    space, lemmatize each word, drop words of 3 characters or fewer, drop
    English stop words, and drop repeated words (first occurrence wins).

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a ``str`` (``None``, NaN, numbers)
        is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` when `text` is not a string or no word
        survives the filters.
    """
    if not isinstance(text, str):
        return ''

    # The stop-word list and lemmatizer do not change between calls, so they are
    # built once instead of once per row.
    lemmatizer, stop_words = _get_clean_text_resources()

    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Apostrophes/backticks become spaces too, so contractions split into pieces
    # that the length filter below then discards.
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Dict keys keep insertion order, which de-duplicates while preserving
    # the position of each word's first occurrence.
    kept = {}
    for word in text.split():
        lemma = lemmatizer.lemmatize(word)
        # Length and stop-word checks apply to the lemma, not the raw token.
        if len(lemma) > 3 and lemma not in stop_words:
            kept.setdefault(lemma, None)

    return ' '.join(kept)


In [5]:
# Apply clean_text to the 'text' column (non-string values become '') and
# preview 10 random rows of the raw text next to its cleaned version
df['clean_text'] = df['text'].apply(clean_text)
df[['text', 'clean_text']].sample(10)

Unnamed: 0,text,clean_text
13592,Poor Johnny is sick Looks like he`ll be going to the vet tomorrow,poor johnny sick look like going tomorrow
2431,the trailer has been removed i have a look and see if i can find it somewhere,trailer removed look find somewhere
9388,Phew long day and i havent gotten to work yet,phew long havent gotten work
1590,Just left you a message! Hope you get it,left message hope
2548,I`m sure all you people with hangovers will be glad to know I`m hangover free hope you`re not too fragile this morning!,sure people hangover glad know free hope fragile morning
20458,"yeah man, they brought it back on the market a couple years ago...it scarce through",yeah brought back market couple year scarce
20012,"Yeah, I wanted to have some I was bottling, so it`s sanitized and all good. It tastes like those choc. oranges.",yeah wanted bottling sanitized good taste like choc orange
21227,There isn`t any right now. They need to make more. Sorry.,right need make sorry
19393,Congrats ...so proud of you girl,congrats proud girl
13120,"We`re English (well in parts), it`s Weather, it IS an obsession",english well part weather obsession


In [6]:
# Helper that scores a single piece of text with TextBlob
def get_sentiment(text):
    """Return the polarity score TextBlob assigns to `text`."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every cleaned tweet with get_sentiment and round to 2 decimal places.
# The rounded value is what gets categorised afterwards, so the category
# boundaries are evaluated on the rounded score.
df['textblob_polarity'] = df['clean_text'].apply(get_sentiment).round(2)

# Map a sentiment polarity score onto one of 5 categories
def categorize_sentiment(score):
    """Return the sentiment category for a polarity score.

    Bands, checked from the top down:
        score >= 0.5           -> 'Positive'
        0.05 <= score < 0.5    -> 'Partially Positive'
        -0.05 < score < 0.05   -> 'Neutral'
        -0.5 < score <= -0.05  -> 'Partially Negative'
        anything else          -> 'Negative'
    """
    # Each check only runs when the previous ones failed, so the upper bound
    # of every band is implied by the band above it.
    if score >= 0.5:
        return 'Positive'
    if score >= 0.05:
        return 'Partially Positive'
    if score > -0.05:
        return 'Neutral'
    if score > -0.5:
        return 'Partially Negative'
    return 'Negative'

# Label each (rounded) polarity score with its sentiment category
df['sentiment_textblob'] = df['textblob_polarity'].apply(categorize_sentiment)

# Preview 10 random rows: cleaned text alongside its score and category
df[['clean_text', 'textblob_polarity', 'sentiment_textblob']].sample(10)

Unnamed: 0,clean_text,textblob_polarity,sentiment_textblob
4522,send thru anurag,0.0,Neutral
1593,sleeeep good nice night comfy,0.65,Positive
24582,mean leaving,-0.31,Partially Negative
2972,someone would give speech loved hospital stupid comm class,-0.05,Partially Negative
6032,truong sunbae scratch said talking nonsense enjoy perfs,0.4,Partially Positive
9220,looked time hibernate hour,0.0,Neutral
12651,power seeeeeeee love storm,0.5,Positive
1928,hyper jumping everyhere friday sweeney todd cinco mayo party,0.0,Neutral
14353,extremely excited behind,-0.01,Neutral
19722,hard core dont know,-0.29,Partially Negative


In [7]:
# Drop every row that has a missing value in any column.
# Rebinding `df` instead of using inplace=True yields the same frame for the
# cells below without mutating the existing object.
df = df.dropna()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   textID              27480 non-null  object 
 1   text                27480 non-null  object 
 2   selected_text       27480 non-null  object 
 3   sentiment           27480 non-null  object 
 4   clean_text          27480 non-null  object 
 5   textblob_polarity   27480 non-null  float64
 6   sentiment_textblob  27480 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.7+ MB


In [9]:
sentiment_counts = df['sentiment_textblob'].value_counts()
sentiment_counts

Neutral               12058
Partially Positive     6598
Positive               4434
Partially Negative     2930
Negative               1460
Name: sentiment_textblob, dtype: int64

In [10]:
from imblearn.over_sampling import RandomOverSampler

# Use the TextBlob-derived category as the label and the cleaned text as the single feature
labels = df['sentiment_textblob']
features = df['clean_text']

# Random oversampler with a fixed seed so the resampling is reproducible
oversampler = RandomOverSampler(random_state=42)

# fit_resample is given the text reshaped into a single-column 2-D array.
# NOTE(review): this resampling runs on the full dataset, before the train/test
# split done in a later cell, so rows duplicated here can end up in both the
# training and the test set and inflate test scores — consider splitting first
# and oversampling only the training portion.
features_balanced, labels_balanced = oversampler.fit_resample(features.values.reshape(-1, 1), labels)

# Flatten the single-column result back to 1-D and rebuild a two-column DataFrame
balanced_df = pd.DataFrame({'clean_text': features_balanced.flatten(), 'sentiment_textblob': labels_balanced})

# Show the per-class counts after balancing
balanced_sentiment_counts = balanced_df['sentiment_textblob'].value_counts()
print(balanced_sentiment_counts)

Neutral               12058
Positive              12058
Partially Positive    12058
Negative              12058
Partially Negative    12058
Name: sentiment_textblob, dtype: int64


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Take the text and labels from the oversampled frame `balanced_df`
# (this rebinds the `features` / `labels` names used in the previous cell)
features = balanced_df['clean_text']
labels = balanced_df['sentiment_textblob']

# TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights from, and transform, all rows at once.
# NOTE(review): the vectorizer is fit before the train/test split done in a
# later cell, so test rows contribute to the learned vocabulary and weights.
features_transformed = vectorizer.fit_transform(features)

# Expand the transformed matrix to a dense array with one column per vocabulary term.
# NOTE(review): toarray() materialises every cell of the matrix; with the row and
# column counts reported by the split cell's output this is very large in memory —
# consider keeping the matrix sparse if the downstream steps allow it.
features_df = pd.DataFrame(features_transformed.toarray(), columns=vectorizer.get_feature_names_out())

# Append the label column to the right of the term columns
data = pd.concat([features_df, labels], axis=1)

# Print the first five rows as plain text
print(data.head())

   aaaa  aaaaaaaaaaa  aaaaaaaaaahhhhhhhh  aaaaaaaaaamazing  aaaaaaaafternoon  \
0   0.0          0.0                 0.0               0.0               0.0   
1   0.0          0.0                 0.0               0.0               0.0   
2   0.0          0.0                 0.0               0.0               0.0   
3   0.0          0.0                 0.0               0.0               0.0   
4   0.0          0.0                 0.0               0.0               0.0   

   aaaaaaaahhhhhhhh  aaaaaah  aaaaaahhhhhhhh  aaaaaawwwesome  aaaaahhhh  ...  \
0               0.0      0.0             0.0             0.0        0.0  ...   
1               0.0      0.0             0.0             0.0        0.0  ...   
2               0.0      0.0             0.0             0.0        0.0  ...   
3               0.0      0.0             0.0             0.0        0.0  ...   
4               0.0      0.0             0.0             0.0        0.0  ...   

   zumba  zune  zwarte  zwitschert  zy

In [12]:
from sklearn.model_selection import train_test_split

# Separate the label column from the remaining (feature) columns of `data`
labels = data['sentiment_textblob']
features = data.drop(columns='sentiment_textblob')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
(
    train_features,
    test_features,
    train_labels,
    test_labels,
) = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each piece of the split
print("Train Features Shape:", train_features.shape)
print("Train Labels Shape:", train_labels.shape)
print("Test Features Shape:", test_features.shape)
print("Test Labels Shape:", test_labels.shape)


Train Features Shape: (48232, 19573)
Train Labels Shape: (48232,)
Test Features Shape: (12058, 19573)
Test Labels Shape: (12058,)


Model Training

In [13]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with default settings
nb_classifier = MultinomialNB()

# Fit on the training split; as the cell's last expression, the value returned
# by fit() is also what the notebook displays as this cell's output
nb_classifier.fit(train_features, train_labels)
