## Data Collection

In [30]:
# Importing the necessary libraries
import pandas as pd
import re

In [31]:
# Step 1: Load Data
# Read the tweet dataset from the working directory into a DataFrame.
# Later cells read its 'text' and 'sentiment' columns.
data = pd.read_csv(filepath_or_buffer='tweets.csv')

In [32]:
# Step 2: Data Preprocessing
def preprocess_text(text):
    """Normalize a raw tweet into lowercase, space-separated word tokens.

    The cleaning steps run in a fixed order: URLs, @mentions, #hashtags and
    digit runs are deleted, the text is lowercased, every non-word character
    is turned into a space, and whitespace is collapsed and trimmed.

    Parameters
    ----------
    text : str or None or float
        The raw tweet. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty tweet; any other non-string
        value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing is left.
    """
    # Missing values: `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Order matters: URLs, mentions and hashtags must be removed while their
    # punctuation ('://', '@', '#') is still present to anchor the patterns.
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags (the tag word is dropped too)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Replace non-word characters (underscore is kept)
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return text.strip()  # Remove leading/trailing whitespace

# Make every entry of the 'text' column a string first: missing values
# become '' so preprocess_text only ever receives str input.
data['text'] = data['text'].fillna('').astype(str)

# Clean each tweet and keep the result alongside the original text.
data['cleaned_text'] = data['text'].map(preprocess_text)

# Show the first rows to confirm the 'cleaned_text' column was added.
data.head()


Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,i d have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldn t they put them on the rele...


## Feature Extraction (TF-IDF, without stop-word removal)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit the vocabulary to 5000 terms. No stop-word list is passed, so common
# words stay in the feature set.
tfidf = TfidfVectorizer(max_features=5000)

# Keep the TF-IDF matrix sparse. Calling .toarray() here would allocate a
# dense (n_tweets x 5000) float array that is almost entirely zeros, and the
# train_test_split / LogisticRegression steps that follow accept sparse input.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the vocabulary and IDF weights see the test rows;
# consider fitting on the training split only.
X = tfidf.fit_transform(data['cleaned_text'])
y = data['sentiment']

## Train/Test Split

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Sentiment Classification

In [35]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

## Evaluation

In [36]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out split and compare them with the true ones.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

# Overall accuracy first, then per-class precision / recall / F1.
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.6956521739130435
Classification Report:
               precision    recall  f1-score   support

    negative       0.73      0.61      0.66      1562
     neutral       0.63      0.76      0.69      2230
    positive       0.78      0.70      0.74      1705

    accuracy                           0.70      5497
   macro avg       0.71      0.69      0.70      5497
weighted avg       0.71      0.70      0.70      5497



## Predict on new data

In [37]:
# Classify an unseen tweet. It must go through the same cleaning function and
# the already-fitted vectorizer (transform, not fit_transform) so its features
# line up with the columns the model was trained on.
new_tweet = "I love this product! It's amazing."
new_tweet_cleaned = preprocess_text(new_tweet)
new_tweet_features = tfidf.transform(raw_documents=[new_tweet_cleaned]).toarray()
sentiment = model.predict(X=new_tweet_features)
print(f"Predicted sentiment: {sentiment[0]}")

Predicted sentiment: positive
