In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def prepare_datasets(corpus, labels, test_data_proportion=0.3, random_state=42):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion,
                                                        random_state=random_state)
    return train_X, test_X, train_Y, test_Y
    
def tfidf_extractor(corpus, ngram_range=(1,2)):
    vectorizer = TfidfVectorizer(min_df=1,
                                norm='l2',
                                smooth_idf=True,
                                use_idf=True,
                                ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
    
df = pd.read_csv('sentiment_6000.csv')
df.head(10)

# Split data
x_train, x_test, y_train, y_test = prepare_datasets(df['content'], df['sentimen'], test_data_proportion=0.3)


# Convert text labels to numerical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Ensure the lengths match after splitting
y_train_encoded = y_train_encoded[:len(x_train)]
y_test_encoded = y_test_encoded[:len(x_test)]


# TF-IDF Vectorization
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(x_train)
tfidf_test_features = tfidf_vectorizer.transform(x_test)

In [2]:
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack

# Train Naive Bayes model using TF-IDF features
nb_classifier_tfidf = MultinomialNB(alpha= 0.1,fit_prior=False)
nb_classifier_tfidf.fit(tfidf_train_features, y_train_encoded)

# Predictions on test set using TF-IDF features
tfidf_test_predictions = nb_classifier_tfidf.predict(tfidf_test_features)

# Evaluate accuracy using TF-IDF features
tfidf_accuracy = (tfidf_test_predictions == y_test_encoded).mean()
print("Accuracy using TF-IDF features:", tfidf_accuracy)

# Bag of Words (BoW) features
count_vectorizer, count_train_features = tfidf_extractor(x_train, ngram_range=(1, 1))  # Unigram features
count_test_features = count_vectorizer.transform(x_test)

# Combine TF-IDF and BoW features
combined_train_features = hstack([tfidf_train_features, count_train_features])
combined_test_features = hstack([tfidf_test_features, count_test_features])

# Train Naive Bayes model using combined features
nb_classifier_combined = MultinomialNB(alpha= 0.1,fit_prior=False)
nb_classifier_combined.fit(combined_train_features, y_train_encoded)

# Predictions on test set using combined features
combined_test_predictions = nb_classifier_combined.predict(combined_test_features)

# Evaluate accuracy using combined features
combined_accuracy = (combined_test_predictions == y_test_encoded).mean()
print("Accuracy using combined TF-IDF and Bag of Words features:", combined_accuracy)


Accuracy using TF-IDF features: 0.7366666666666667
Accuracy using combined TF-IDF and Bag of Words features: 0.74


In [3]:
from sklearn.metrics import f1_score

In [4]:
# Compute F1 score for predictions using TF-IDF features
tfidf_f1_score = f1_score(y_test_encoded, tfidf_test_predictions, average='weighted')
print("F1 score using TF-IDF features:", tfidf_f1_score)

# Compute F1 score for predictions using combined features
combined_f1_score = f1_score(y_test_encoded, combined_test_predictions, average='weighted')
print("F1 score using combined TF-IDF and Bag of Words features:", combined_f1_score)

F1 score using TF-IDF features: 0.7324275661448926
F1 score using combined TF-IDF and Bag of Words features: 0.7359718028448735


In [5]:
from sklearn.svm import SVC

# Train SVM model using TF-IDF features
svm_classifier_tfidf = SVC(C=100, gamma=0.1, kernel='sigmoid', random_state=42)
svm_classifier_tfidf.fit(tfidf_train_features, y_train_encoded)

# Predictions on test set using TF-IDF features
svm_tfidf_test_predictions = svm_classifier_tfidf.predict(tfidf_test_features)

# Evaluate accuracy using TF-IDF features
svm_tfidf_accuracy = (svm_tfidf_test_predictions == y_test_encoded).mean()
print("Accuracy using TF-IDF features with SVM:", svm_tfidf_accuracy)

# Combine TF-IDF and BoW features for SVM
combined_train_features_svm = hstack([tfidf_train_features, count_train_features])
combined_test_features_svm = hstack([tfidf_test_features, count_test_features])

# Train SVM model using combined features
svm_classifier_combined = SVC(C=100, gamma=0.1, kernel='sigmoid', random_state=42)
svm_classifier_combined.fit(combined_train_features_svm, y_train_encoded)

# Predictions on test set using combined features
svm_combined_test_predictions = svm_classifier_combined.predict(combined_test_features_svm)

# Evaluate accuracy using combined features
svm_combined_accuracy = (svm_combined_test_predictions == y_test_encoded).mean()
print("Accuracy using combined TF-IDF and Bag of Words features with SVM:", svm_combined_accuracy)


Accuracy using TF-IDF features with SVM: 0.7783333333333333
Accuracy using combined TF-IDF and Bag of Words features with SVM: 0.7716666666666666


In [6]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression model using TF-IDF features
logreg_classifier_tfidf = LogisticRegression(C=10,max_iter=1000)
logreg_classifier_tfidf.fit(tfidf_train_features, y_train_encoded)

# Predictions on test set using TF-IDF features
logreg_tfidf_test_predictions = logreg_classifier_tfidf.predict(tfidf_test_features)

# Evaluate accuracy using TF-IDF features
logreg_tfidf_accuracy = (logreg_tfidf_test_predictions == y_test_encoded).mean()
print("Accuracy using TF-IDF features with Logistic Regression:", logreg_tfidf_accuracy)

# Combine TF-IDF and BoW features for Logistic Regression
combined_train_features_logreg = hstack([tfidf_train_features, count_train_features])
combined_test_features_logreg = hstack([tfidf_test_features, count_test_features])

# Train Logistic Regression model using combined features
logreg_classifier_combined = LogisticRegression(C=10,max_iter=1000)
logreg_classifier_combined.fit(combined_train_features_logreg, y_train_encoded)

# Predictions on test set using combined features
logreg_combined_test_predictions = logreg_classifier_combined.predict(combined_test_features_logreg)

# Evaluate accuracy using combined features
logreg_combined_accuracy = (logreg_combined_test_predictions == y_test_encoded).mean()
print("Accuracy using combined TF-IDF and Bag of Words features with Logistic Regression:", logreg_combined_accuracy)


Accuracy using TF-IDF features with Logistic Regression: 0.775
Accuracy using combined TF-IDF and Bag of Words features with Logistic Regression: 0.7822222222222223


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest model using TF-IDF features
rf_classifier_tfidf = RandomForestClassifier(min_samples_split=10,min_samples_leaf=2,n_estimators=150, random_state=42)
rf_classifier_tfidf.fit(tfidf_train_features, y_train_encoded)

# Predictions on test set using TF-IDF features
rf_tfidf_test_predictions = rf_classifier_tfidf.predict(tfidf_test_features)

# Evaluate accuracy using TF-IDF features
rf_tfidf_accuracy = (rf_tfidf_test_predictions == y_test_encoded).mean()
print("Accuracy using TF-IDF features with Random Forest:", rf_tfidf_accuracy)

# Combine TF-IDF and BoW features for Random Forest
combined_train_features_rf = hstack([tfidf_train_features, count_train_features])
combined_test_features_rf = hstack([tfidf_test_features, count_test_features])

# Train Random Forest model using combined features
rf_classifier_combined = RandomForestClassifier(min_samples_split=10,min_samples_leaf=2,n_estimators=150, random_state=42)
rf_classifier_combined.fit(combined_train_features_rf, y_train_encoded)

# Predictions on test set using combined features
rf_combined_test_predictions = rf_classifier_combined.predict(combined_test_features_rf)

# Evaluate accuracy using combined features
rf_combined_accuracy = (rf_combined_test_predictions == y_test_encoded).mean()
print("Accuracy using combined TF-IDF and Bag of Words features with Random Forest:", rf_combined_accuracy)


Accuracy using TF-IDF features with Random Forest: 0.7566666666666667
Accuracy using combined TF-IDF and Bag of Words features with Random Forest: 0.7622222222222222
