# Supervised Model Training

In [20]:
#[Source 1] Scikit-learn: Machine Learning in Python, Pedregosa et al., 2011. [Accessed 23/01/2024]
#[Source 2] Palivela, A.K. (2021). Sentiment Analysis with Machine Learning. [Online] Kaggle. Available at: https://www.kaggle.com/code/ashokkumarpalivela/sentiment-analysis-with-machine-learning [Accessed 23/03/2024]
#[Source 3] Appiah, E.K., 2024. Sentiment analysis using SVM, Naive Bayes & RF. [online] Kaggle. Available at: https://www.kaggle.com/code/emmanuelkwasiappiah/sentiment-analysis-using-svm-naive-bayes-rf/notebook [Accessed 10/04/2024]
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load the pre-processed dataset and lower-case the labels so the class
# filters below match regardless of how the CSV capitalised them.
dataf = pd.read_csv("processed_data.csv")
dataf['sentiment'] = dataf['sentiment'].str.lower()
positive_samples = dataf[dataf['sentiment'] == 'positive']
negative_samples = dataf[dataf['sentiment'] == 'negative']
neutral_pool = dataf[dataf['sentiment'] == 'neutral']
# Undersample the neutral class for balance.
#  - n is capped at the rows available: DataFrame.sample raises ValueError when
#    asked for more rows than exist (sampling without replacement).
#  - random_state is fixed so the balanced dataset is the same on every run,
#    matching the seeded train/test split performed later.
neutral_samples = neutral_pool.sample(n=min(1900, len(neutral_pool)), random_state=42)
dataf = pd.concat([positive_samples, negative_samples, neutral_samples], ignore_index=True)

# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call to text_cleaning.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Lazily-filled cache for the NLTK English stop-word set (loaded on first use).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def text_cleaning(text):
    """Strip punctuation, digits and English stop words from ``text``.

    Args:
        text: The raw text to clean. Values that are not ``str`` (for example
            the NaN pandas produces for an empty CSV cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The remaining tokens joined by single spaces. Token casing is kept;
        only the stop-word comparison is case-insensitive.
    """
    if not isinstance(text, str):
        return ''

    text = text.translate(_PUNCT_TO_SPACE)  # punctuation -> spaces
    text = ''.join(ch for ch in text if not ch.isdigit())  # drop digits
    stop_words = _english_stop_words()
    kept = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    return ' '.join(kept).strip()

# Run every row of the text column through text_cleaning, overwriting it in place.
dataf['cleaned_text'] = dataf['cleaned_text'].apply(text_cleaning)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataf['cleaned_text'],
    dataf['sentiment'],
    random_state=42,
    test_size=0.2,
)
# Alias used by the grid-search calls further down.
y_train = train_labels

def preprocess_and_feature_select(train_texts, test_texts, train_labels):
    """Turn raw texts into TF-IDF features and pass them through a chi2 selector.

    Both the vectorizer and the selector are fitted on the training data only
    and then applied to the test data.

    Args:
        train_texts: Training documents.
        test_texts: Test documents.
        train_labels: Labels for ``train_texts``, used by the chi2 scoring.

    Returns:
        A tuple ``(train_features, test_features, vectorizer, selector)`` where
        the last two items are the fitted transformers.
    """
    # Unigrams + bigrams; terms present in more than 25% of documents are dropped.
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=0.25)
    tfidf_train = tfidf.fit_transform(train_texts)
    tfidf_test = tfidf.transform(test_texts)

    # k='all' keeps every feature; the selector still computes chi2 scores.
    chi2_selector = SelectKBest(score_func=chi2, k='all')
    selected_train = chi2_selector.fit(tfidf_train, train_labels).transform(tfidf_train)
    selected_test = chi2_selector.transform(tfidf_test)

    return selected_train, selected_test, tfidf, chi2_selector

# Build the feature matrices and keep the fitted vectorizer/selector for later inference.
(X_train_selected, X_test_selected, vect, fs_selc) = preprocess_and_feature_select(
    train_texts=train_texts,
    test_texts=test_texts,
    train_labels=train_labels,
)


def model(features_train, labels_train, classifier_type, config):
    """Tune one classifier with an exhaustive grid search.

    Args:
        features_train: Training feature matrix.
        labels_train: Training labels.
        classifier_type: Key naming the estimator in ``config`` and its
            parameter grid in ``config['params']``.
        config: Mapping holding the estimators plus a ``'params'`` sub-mapping
            of parameter grids.

    Returns:
        A tuple ``(best_estimator, best_params)`` taken from the fitted search.
    """
    estimator = config[classifier_type]
    param_grid = config['params'][classifier_type]
    # Default cross-validation and scoring; all CPU cores are used.
    searcher = GridSearchCV(estimator, param_grid, n_jobs=-1, verbose=1)
    searcher.fit(features_train, labels_train)
    return searcher.best_estimator_, searcher.best_params_
# Hyper-parameter grids searched for each classifier.
_param_grids = {
    'MultinomialNB': {'alpha': (0.1, 0.01, 0.001)},
    'SVC': {'C': (1, 10, 100), 'kernel': ('linear', 'poly')},
    'RandomForest': {'n_estimators': (50, 150, 250), 'max_depth': (None, 15, 25)},
}

# Estimators keyed by name, with their grids nested under 'params'.
config = {
    'MultinomialNB': MultinomialNB(),
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(),
    'params': _param_grids,
}

# Grid-search each classifier on the training features.
best_model_nb, best_params_nb = model(
    X_train_selected, y_train, classifier_type='MultinomialNB', config=config
)
best_model_svc, best_params_svc = model(
    X_train_selected, y_train, classifier_type='SVC', config=config
)
best_model_rf, best_params_rf = model(
    X_train_selected, y_train, classifier_type='RandomForest', config=config
)

#evaluate a fitted classifier and return its metrics
def evaluate_classifier(clf, test_texts, test_labels):
    """Score a fitted classifier against known labels.

    Args:
        clf: Fitted estimator exposing ``predict``.
        test_texts: Input passed straight to ``clf.predict``.
        test_labels: True labels to compare the predictions with.

    Returns:
        A dict with the keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and
        ``'F1 Score'`` (in that order), each mapped to a float. Precision,
        recall and F1 use weighted averaging across classes.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    y_pred = clf.predict(test_texts)

    accuracy = accuracy_score(test_labels, y_pred)
    precision = precision_score(test_labels, y_pred, average='weighted')
    recall = recall_score(test_labels, y_pred, average='weighted')
    f1 = f1_score(test_labels, y_pred, average='weighted')

    return {
        'Accuracy': float(accuracy),
        'Precision': float(precision),
        'Recall': float(recall),
        'F1 Score': float(f1),
    }
#evaluation
# evaluate_classifier returns a dict. Tuple-unpacking a dict yields its KEYS,
# which would bind accuracy_nb etc. to the strings 'Accuracy', 'Precision', ...
# rather than the scores, so the values are pulled out explicitly by key.
_METRIC_KEYS = ('Accuracy', 'Precision', 'Recall', 'F1 Score')

metrics_nb = evaluate_classifier(best_model_nb, X_test_selected, test_labels)
metrics_svc = evaluate_classifier(best_model_svc, X_test_selected, test_labels)
metrics_rf = evaluate_classifier(best_model_rf, X_test_selected, test_labels)

accuracy_nb, precision_nb, recall_nb, f1_nb = (metrics_nb[key] for key in _METRIC_KEYS)
accuracy_svc, precision_svc, recall_svc, f1_svc = (metrics_svc[key] for key in _METRIC_KEYS)
accuracy_rf, precision_rf, recall_rf, f1_rf = (metrics_rf[key] for key in _METRIC_KEYS)

def predict_with_model(input_text, model):
    """Predict the sentiment label of a single piece of text.

    The text is first passed through ``text_cleaning`` -- the same cleaning
    applied to the training data before vectorisation -- so that the unigram
    and bigram features produced here line up with the fitted vocabulary.
    It is then transformed with the module-level fitted vectorizer (``vect``)
    and feature selector (``fs_selc``).

    Args:
        input_text: The text to classify.
        model: Fitted classifier exposing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the single transformed sample.
    """
    cleaned_input = text_cleaning(input_text)
    transformed_text = fs_selc.transform(vect.transform([cleaned_input]))
    return model.predict(transformed_text)

# Smoke-test the pipeline end to end on one hand-written sentence.
example_text = "Obama is really good. He is for the people and made good decisions"
output_of_prediction = predict_with_model(
    input_text=example_text,
    model=best_model_nb,
)
print("Prediction using best_model_nb:", output_of_prediction)


[nltk_data] Downloading package punkt to /Users/zakya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/zakya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
