In [5]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score

In [6]:
df = pd.read_csv('Tweets.csv')
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [8]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [13]:
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [14]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(filtered_tokens)

In [16]:
df['processed_text'] = df['selected_text'].apply(preprocess_text)
df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_text'] = df['selected_text'].apply(preprocess_text)


Unnamed: 0,textID,text,selected_text,sentiment,processed_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,responded going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,http shameless plugging best rangers forum earth
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,fun
7,50e14c0bb8,Soooo high,Soooo high,neutral,soooo high
8,e050245fbd,Both of you,Both of you,neutral,
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,wow u became cooler


In [17]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['processed_text'])
y = df['sentiment']
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 103462 stored elements and shape (27480, 15790)>
  Coords	Values
  (0, 11405)	0.9082144248358296
  (0, 5705)	0.41850514754316126
  (1, 12713)	0.807442868345684
  (1, 11725)	0.5899457723873394
  (2, 1908)	1.0
  (3, 7812)	0.6744410329932271
  (3, 378)	0.738328716098073
  (4, 12705)	1.0
  (5, 6632)	0.22633409245348549
  (5, 12128)	0.4440532037490442
  (5, 10439)	0.42695153965725474
  (5, 1307)	0.24466194445067554
  (5, 11050)	0.42695153965725474
  (5, 5224)	0.42695153965725474
  (5, 4219)	0.3806143876996512
  (6, 5380)	1.0
  (7, 12714)	0.6949296148487082
  (7, 6398)	0.7190777638101642
  (9, 15471)	0.4288998103261959
  (9, 1221)	0.6326211471962405
  (9, 3004)	0.644853035057032
  (10, 9054)	0.2095445984185566
  (10, 8164)	0.1743590392547024
  (10, 6549)	0.36543286815571785
  (10, 11158)	0.3786893110094886
  :	:
  (27470, 3578)	0.38592154098367465
  (27470, 355)	0.38592154098367465
  (27471, 7734)	1.0
  (27472, 13833)	0.2842764220

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

model = SVC(kernel='linear')

model.fit(X_train,y_train)

In [19]:
y_pred = model.predict(X_test)

print("Accuracy:",accuracy_score(y_test,y_pred))
print("classification_report:\n",classification_report(y_test,y_pred))

Accuracy: 0.8025836972343523
classification_report:
               precision    recall  f1-score   support

    negative       0.77      0.75      0.76      1572
     neutral       0.78      0.86      0.81      2236
    positive       0.88      0.79      0.83      1688

    accuracy                           0.80      5496
   macro avg       0.81      0.80      0.80      5496
weighted avg       0.81      0.80      0.80      5496



In [20]:
import pickle

# Example: Save a trained model and vectorizer

# Save the model
with open('svm_sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


In [21]:
# Loading
# Load the model
with open('svm_sentiment_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Load the vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [42]:
new_message = ["Luffy is still joyboy"]

new_message_processed = [preprocess_text(msg) for msg in new_message]
new_message_vectorized = vectorizer.transform(new_message_processed)

predictions = model.predict(new_message_vectorized)
print(predictions)

['neutral']
