In [15]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('twitter_sentiment_data.csv')

# Sample 10,000 tweets
sample_data = data.sample(n=10000, random_state=42)

# Data preprocessing
sample_data['message'] = sample_data['message'].str.replace('[^a-zA-Z\s]', '', regex=True)
sample_data['message'] = sample_data['message'].str.lower()

# Split data into train and test sets
X = sample_data['message']
y = sample_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Scale the data
scaler = StandardScaler(with_mean=False)  # Set with_mean=False for sparse matrices
X_train_vectorized = scaler.fit_transform(X_train_vectorized)
X_test_vectorized = scaler.transform(X_test_vectorized)

# Train logistic regression model
model = LogisticRegression(max_iter=1000)  # Increase max_iter
model.fit(X_train_vectorized, y_train)

# Evaluate model
y_pred = model.predict(X_test_vectorized)

# Classification report
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df = report_df.rename(index={-1: 'Negative', 0: 'Neutral', 1: 'Positive'})
print("Classification Report:")
print(report_df)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_df = pd.DataFrame(conf_matrix, index=['Negative', 'Neutral', 'Positive', 'Unknown'], columns=['Negative', 'Neutral', 'Positive', 'Unknown'])
print("\nConfusion Matrix:")
print(conf_df)

# Analyze misclassified samples
misclassified = X_test[y_pred != y_test]
print("\nMisclassified samples:")
print(misclassified.head(10))  # Print the first 10 misclassified samples

Classification Report:
              precision    recall  f1-score    support
-1             0.635294  0.315789  0.421875   171.0000
0              0.513228  0.277937  0.360595   349.0000
1              0.667384  0.879131  0.758761  1059.0000
2              0.679758  0.534442  0.598404   421.0000
accuracy       0.653500  0.653500  0.653500     0.6535
macro avg      0.623916  0.501825  0.534909  2000.0000
weighted avg   0.640345  0.653500  0.626722  2000.0000

Confusion Matrix:
          Negative  Neutral  Positive  Unknown
Negative        54       20        86       11
Neutral         18       97       205       29
Positive         8       54       931       66
Unknown          5       18       173      225

Misclassified samples:
27365    nro jimmykimmel theodorekupfer why dont you le...
20440         garrettlove damn global warming causing snow
22968    rt ketanj a simple  year old explanation of cl...
2935     a punishment of jail time amp forcefeeding mea...
13591    explain that m

In [17]:
report_df

Unnamed: 0,precision,recall,f1-score,support
-1,0.635294,0.315789,0.421875,171.0
0,0.513228,0.277937,0.360595,349.0
1,0.667384,0.879131,0.758761,1059.0
2,0.679758,0.534442,0.598404,421.0
accuracy,0.6535,0.6535,0.6535,0.6535
macro avg,0.623916,0.501825,0.534909,2000.0
weighted avg,0.640345,0.6535,0.626722,2000.0


In [18]:
conf_df

Unnamed: 0,Negative,Neutral,Positive,Unknown
Negative,54,20,86,11
Neutral,18,97,205,29
Positive,8,54,931,66
Unknown,5,18,173,225


In [19]:
print("\nMisclassified samples:")
misclassified


Misclassified samples:


27365    nro jimmykimmel theodorekupfer why dont you le...
20440         garrettlove damn global warming causing snow
22968    rt ketanj a simple  year old explanation of cl...
2935     a punishment of jail time amp forcefeeding mea...
13591    explain that my global warming advocates https...
                               ...                        
13277    rt unep south sudan launches united nations cl...
35648    fuck global warming brexit or north korea this...
32763    animal agriculture is the no  cause for global...
29031    rt pauledawson us government report finds stea...
34837    update  cape town held up as example of climat...
Name: message, Length: 693, dtype: object