In [None]:
import pandas as pd

# Path of the training CSV that the rest of the notebook works from.
file_path = "/content/train.csv"

# Parse the CSV into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

# Quick sanity check: print the first five rows as plain text.
print(df.head(n=5))


: 

In [None]:
import re

import nltk  # the package itself must be bound for the nltk.download() calls below
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data used by the preprocessing function: the English
# stop-word list and the tokenizer models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' instead of
# 'punkt'; fetching both keeps this cell working on either version.
nltk.download('punkt_tab')

# Regexes are compiled once here rather than on every call. Raw strings keep
# "\s" / "\S" as regex escapes instead of (invalid) Python string escapes.
_URL_PATTERN = re.compile(r'http\S+')
_MENTION_PATTERN = re.compile(r'@[^\s]+')
_HASH_PATTERN = re.compile(r'#')

# Filled on first use by _language_tools(), so the stop-word set and the
# stemmer are built once instead of once per tweet.
_LANGUAGE_TOOLS = {}


def _language_tools():
    """Return the cached ``(stop_words, stemmer)`` pair, creating it on first call."""
    if not _LANGUAGE_TOOLS:
        _LANGUAGE_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _LANGUAGE_TOOLS['stemmer'] = PorterStemmer()
    return _LANGUAGE_TOOLS['stop_words'], _LANGUAGE_TOOLS['stemmer']


def preprocess_text(text):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps, in order:
      1. Strip links (``http...``), user mentions (``@name``) and ``#`` symbols
         (the hashtag word itself is kept).
      2. Tokenize with ``word_tokenize`` and lowercase every token.
      3. Drop tokens that are not purely alphanumeric or that are English
         stop words.
      4. Stem the remaining tokens with the Porter stemmer.

    Args:
        text: The raw tweet. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering or the input is missing.
    """
    # Missing values have no text to clean; NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove links, user mentions and the '#' character.
    text = _URL_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = _HASH_PATTERN.sub('', text)

    stop_words, stemmer = _language_tools()

    # Tokenize, then lowercase before the stop-word check so capitalised stop
    # words are also removed.
    tokens = (token.lower() for token in word_tokenize(text))

    # isalnum() discards punctuation-only tokens and anything containing
    # special characters.
    return ' '.join(
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

# Run every raw tweet through preprocess_text and store the result in a new
# 'cleaned_tweet' column next to the original text.
df['cleaned_tweet'] = df['tweet'].apply(func=preprocess_text)

# Write the frame, including the new column, to disk without the row index.
df.to_csv(path_or_buf='preprocessed_dataset.csv', index=False)


: 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words: per-tweet term counts over a vocabulary capped at 5000 terms.
vectorizer_bow = CountVectorizer(max_features=5000)
X_bow = vectorizer_bow.fit_transform(df['cleaned_tweet'])

# Write the counts out as a dense table with one column per vocabulary term.
pd.DataFrame(
    data=X_bow.toarray(),
    columns=vectorizer_bow.get_feature_names_out(),
).to_csv(path_or_buf='bow_features.csv', index=False)


: 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: per-tweet weighted term scores over a vocabulary capped at 5000 terms.
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(df['cleaned_tweet'])

# Write the scores out as a dense table with one column per vocabulary term.
pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
).to_csv(path_or_buf='tfidf_features.csv', index=False)


: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hold out 20% of the Bag-of-Words rows for testing; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit a default Logistic Regression on the training portion.
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions. Each scorer takes (true labels, predicted labels);
# the results keep their individual names because later cells read them.
accuracy, precision, recall, f1 = [
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
]

# Report the four metrics to four decimal places.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


: 

In [None]:

# Create a TF-IDF model.
# max_features=5000 matches both the Bag-of-Words vectorizer and the TF-IDF
# matrix saved to tfidf_features.csv. Without the cap this cell would rebind
# vectorizer_tfidf / X_tfidf to an unlimited vocabulary, so the model would
# train on different features than the ones saved, and the comparison below
# would mix a 5000-term BoW model with a full-vocabulary TF-IDF model.
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(df['cleaned_tweet'])

# Split the dataset for TF-IDF (same test size and seed as the BoW split, so
# both models are evaluated on the same rows).
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, df['label'], test_size=0.2, random_state=42)

# Train the Logistic Regression model with TF-IDF features
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)

# Make predictions on the held-out rows
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Evaluate the model with TF-IDF features
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)
precision_tfidf = precision_score(y_test_tfidf, y_pred_tfidf)
recall_tfidf = recall_score(y_test_tfidf, y_pred_tfidf)
f1_tfidf = f1_score(y_test_tfidf, y_pred_tfidf)

# Print evaluation metrics for TF-IDF
print("\nTF-IDF Feature Extraction:")
print(f"Accuracy: {accuracy_tfidf:.4f}")
print(f"Precision: {precision_tfidf:.4f}")
print(f"Recall: {recall_tfidf:.4f}")
print(f"F1 Score: {f1_tfidf:.4f}")

# Comparative Analysis: the BoW numbers (accuracy, precision, recall, f1) come
# from the Bag-of-Words evaluation cell, which must have been run first.
print("\nComparative Analysis:")
print("BoW                    vs           TF-IDF")
print(f"Accuracy -> BoW: {accuracy:.4f}       TF-IDF: {accuracy_tfidf:.4f}")
print(f"Precision -> BoW: {precision:.4f}     TF-IDF: {precision_tfidf:.4f}")
print(f"Recall -> BoW: {recall:.4f}          TF-IDF: {recall_tfidf:.4f}")
print(f"F1 Score -> BoW: {f1:.4f}           TF-IDF: {f1_tfidf:.4f}")


: 