# 🐦 Twitter Sentiment Analysis

This notebook performs **Sentiment Analysis on Twitter data** using **Text Mining** and **Machine Learning** techniques. Tweets are classified into **Positive**, **Negative**, **Neutral**, or **Irrelevant** categories.

In [None]:
# Import Libraries
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from wordcloud import WordCloud

# Download NLTK data
nltk.download('stopwords')  # English stop-word list used by the 1-gram vectorizer
nltk.download('punkt')      # Punkt models read by word_tokenize on older NLTK releases
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' models; without this resource word_tokenize raises LookupError.
nltk.download('punkt_tab')

In [None]:
# Load Data
# NOTE(review): read_csv treats the first line of each file as a header, and the
# assignment below then replaces those names. If the CSVs have no header row,
# the first record of each file is silently lost - confirm against the files.
train = pd.read_csv('/content/twitter_training.csv')
val = pd.read_csv('/content/twitter_validation.csv')

# Both frames are given the same four column names.
train.columns = val.columns = ['id', 'information', 'type', 'text']

# Work on copies so the frames as loaded stay untouched by later cells.
train_data, val_data = train.copy(), val.copy()
train_data.head()

In [None]:
# Text Preprocessing
def preprocess_text(df):
    """Add a cleaned, lower-cased version of ``df['text']`` as ``df['lower']``.

    Missing values become empty strings (not the literal token "nan"),
    non-string values are stringified before lower-casing, and every run of
    characters other than ASCII letters, digits or spaces is replaced by a
    single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``lower`` column added or overwritten.
    """
    # Fill and stringify BEFORE lower-casing: .str.lower() yields NaN for
    # missing or non-string entries, which str() would turn into "nan".
    cleaned = df['text'].fillna('').astype(str).str.lower()
    df['lower'] = cleaned.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x))
    return df

# Add the cleaned 'lower' column to both the training and validation frames.
train_data, val_data = (preprocess_text(frame) for frame in (train_data, val_data))

In [None]:
# WordCloud Visualization
def plot_wordcloud(df, sentiment):
    """Draw a word cloud of the cleaned tweets labelled ``sentiment``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``type`` column (sentiment label) and a ``lower`` column
        (cleaned tweet text).
    sentiment : str
        Label to select from ``df['type']``; also shown in the figure title.
    """
    tweets = df.loc[df['type'] == sentiment, 'lower'].dropna().astype(str)
    # Join with a space: an empty separator fuses the last word of each tweet
    # with the first word of the next one, producing words that never occurred.
    text = ' '.join(tweets)
    wordcloud = WordCloud(
        max_font_size=100,
        max_words=100,
        background_color='black',
        scale=10,
        width=800,
        height=800
    ).generate(text)
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'{sentiment} Tweets WordCloud')
    plt.show()

# One word cloud per sentiment label, drawn in this fixed order.
for sentiment in ('Negative', 'Positive', 'Irrelevant', 'Neutral'):
    plot_wordcloud(train_data, sentiment)

In [None]:
# Data Distribution Plot
# Count rows per (information, type) pair; after reset_index the 'id' column
# holds the number of non-null ids in each group, which is what gets plotted.
plot1 = (
    train
    .groupby(['information', 'type'])
    .count()
    .reset_index()
)

plt.figure(figsize=(20, 6))
sns.barplot(x='information', y='id', hue='type', data=plot1)
plt.title('Distribution of Tweets per Brand and Type')
plt.xlabel('Brand')
plt.ylabel('Number of Tweets')
plt.xticks(rotation=90)  # rotate the x tick labels to vertical
plt.grid(True)
plt.show()

In [None]:
# Tokenization & Stopwords
# Tokenize every cleaned tweet, giving one list of tokens per tweet.
tokens_text = list(map(word_tokenize, map(str, train_data['lower'])))

# Flatten the per-tweet lists into one long list of tokens.
tokens_counter = [token for tweet_tokens in tokens_text for token in tweet_tokens]
print("Number of unique tokens:", len(set(tokens_counter)))

# English stop-word list, passed to the 1-gram CountVectorizer.
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
# Bag of Words (1-gram) + Logistic Regression
# token_pattern=None: the regex pattern is unused once a custom tokenizer is
# supplied; passing None avoids scikit-learn's warning about the ignored parameter.
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    token_pattern=None,
    stop_words=stop_words,
    ngram_range=(1, 1),
)
# Hold out 20% of the training frame as an internal test split (fixed seed).
reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, random_state=0)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_bow = bow_counts.fit_transform(reviews_train['lower'])
X_test_bow = bow_counts.transform(reviews_test['lower'])
y_train_bow = reviews_train['type']
y_test_bow = reviews_test['type']

# liblinear is a binary solver; scikit-learn has deprecated passing it a
# multiclass target directly, so the one-vs-rest scheme is made explicit here.
model1 = OneVsRestClassifier(LogisticRegression(C=1, solver='liblinear', max_iter=200))
model1.fit(X_train_bow, y_train_bow)

test_pred = model1.predict(X_test_bow)
print("1-gram Logistic Regression Accuracy (Test):", accuracy_score(y_test_bow, test_pred) * 100)

# Score the separate validation file using the vocabulary fitted above.
X_val_bow = bow_counts.transform(val_data['lower'])
y_val_bow = val_data['type']
val_pred = model1.predict(X_val_bow)
print("1-gram Logistic Regression Accuracy (Validation):", accuracy_score(y_val_bow, val_pred) * 100)

In [None]:
# Bag of Words (1-4 gram) + Logistic Regression
# No stop-word list is passed here, so n-grams keep their function words.
# token_pattern=None: the regex pattern is unused once a custom tokenizer is
# supplied; passing None avoids scikit-learn's warning about the ignored parameter.
bow_counts_ngram = CountVectorizer(
    tokenizer=word_tokenize,
    token_pattern=None,
    ngram_range=(1, 4),
)
# Reuses the reviews_train / reviews_test split and the label series
# (y_train_bow, y_test_bow, y_val_bow) created in the 1-gram cell.
X_train_bow_ng = bow_counts_ngram.fit_transform(reviews_train['lower'])
X_test_bow_ng = bow_counts_ngram.transform(reviews_test['lower'])
X_val_bow_ng = bow_counts_ngram.transform(val_data['lower'])

# liblinear is a binary solver; scikit-learn has deprecated passing it a
# multiclass target directly, so the one-vs-rest scheme is made explicit here.
model2 = OneVsRestClassifier(LogisticRegression(C=0.9, solver='liblinear', max_iter=1500))
model2.fit(X_train_bow_ng, y_train_bow)

test_pred_ng = model2.predict(X_test_bow_ng)
print("1-4 gram Logistic Regression Accuracy (Test):", accuracy_score(y_test_bow, test_pred_ng) * 100)

val_pred_ng = model2.predict(X_val_bow_ng)
print("1-4 gram Logistic Regression Accuracy (Validation):", accuracy_score(y_val_bow, val_pred_ng) * 100)