In [99]:
import pandas as pd
df = pd.read_csv('twitter_training.csv')
df.columns = ['id','topic','sentiment','text']
df.head()

Unnamed: 0,id,topic,sentiment,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74681 non-null  int64 
 1   topic      74681 non-null  object
 2   sentiment  74681 non-null  object
 3   text       73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [101]:
df['sentiment'].value_counts()

sentiment
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [102]:
df.isnull().sum()

id             0
topic          0
sentiment      0
text         686
dtype: int64

In [103]:
df.duplicated().sum()

2700

In [104]:
df = df.dropna(subset=['text'])


In [105]:
df = df.drop_duplicates()

In [106]:
df['sentiment'] = df['sentiment'].replace('Irrelevant','Neutral')


In [107]:
import re
def clean_text(text):
    if isinstance(text,str):  # Make sure input is a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions (@username)
        text = re.sub(r'#\w+', '', text)  # Remove hashtags (#tag)
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
        return text
    else: 
        return ""


In [108]:
df['clean_text'] = df['text'].apply(clean_text)

In [109]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['sentiment']
X_text_train,X_text_test,y_train,y_test = train_test_split(
    X, y, test_size=0.2,random_state=42
)

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)



In [111]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [112]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train , y_train)


In [113]:
y_pred = model.predict(X_test)
print(y_pred)
print(y_test)

[1 2 1 ... 1 1 1]
[1 1 2 ... 1 2 1]


In [114]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy", accuracy_score(y_test,y_pred))
print("\nClassification Report:\n",classification_report(y_test,y_pred,target_names=le.classes_))
print("\nConfusin Matrix:\n",confusion_matrix(y_test,y_pred))

Accuracy 0.7251413020724304

Classification Report:
               precision    recall  f1-score   support

    Negative       0.77      0.73      0.75      4433
     Neutral       0.70      0.77      0.74      5987
    Positive       0.72      0.65      0.68      3911

    accuracy                           0.73     14331
   macro avg       0.73      0.72      0.72     14331
weighted avg       0.73      0.73      0.72     14331


Confusin Matrix:
 [[3236  906  291]
 [ 648 4626  713]
 [ 327 1054 2530]]


In [115]:
import joblib
# ---------------- Save Model & Vectorizer ----------------
joblib.dump(model, 'logreg_sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(le, 'label_encoder.pkl')

print("Model, vectorizer, and label encoder saved successfully!")

Model, vectorizer, and label encoder saved successfully!


In [116]:
import joblib

# ---------------- Load Model ----------------
model = joblib.load('logreg_sentiment_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
le = joblib.load('label_encoder.pkl')

In [117]:
def predict_sentiment(text):
    # Preprocess
    text_clean = re.sub(r'http\S+|www\S+', '', text.lower())
    text_clean = re.sub(r'@\w+', '', text_clean)
    text_clean = re.sub(r'#\w+', '', text_clean)
    text_clean = re.sub(r'[^\w\s]', '', text_clean)
    text_clean = re.sub(r'\d+', '', text_clean)
    text_clean = re.sub(r'\s+', ' ', text_clean).strip()
    
    # Vectorize
    vect = vectorizer.transform([text_clean])
    pred_enc = model.predict(vect)[0]
    pred_label = le.inverse_transform([pred_enc])[0]
    pred_proba = model.predict_proba(vect)[0].max()
    
    return {"text": text, "sentiment": pred_label, "accuracy": round(pred_proba, 3)}


In [131]:
example_text = "you look good"
result = predict_sentiment(example_text)
print(result)


{'text': 'you look good', 'sentiment': 'Neutral', 'accuracy': 0.649}


In [119]:
results = pd.DataFrame({
    'Tweet': X_text_test.values,
    'True Sentiment': le.inverse_transform(y_test),
    'Predicted sentiment': le.inverse_transform(y_pred)
})
results.sample(10)

Unnamed: 0,Tweet,True Sentiment,Predicted sentiment
3902,secretly praying that every part where its big...,Negative,Neutral
11453,well the fact bytedance is selling it us s ent...,Neutral,Neutral
4541,so another short,Neutral,Negative
2845,entrepreneur learning this microsoft excel kit...,Neutral,Positive
3965,lets not lie the fifa combo and ps are very go...,Positive,Positive
7247,amazing things are possible when we work toget...,Neutral,Neutral
8320,playing a head deer oh shit a kid starts runni...,Neutral,Neutral
2954,yooooooooooooooooooooooooooooooooooooooooooooo...,Neutral,Neutral
6354,lets stop showing good popular singers in shit...,Neutral,Positive
12335,i want to play cyberpunk right now so bad aaaa...,Positive,Positive
