In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from google.colab import files

# Helper function to calculate metrics
def get_model_metrics(model_name, y_test, y_pred):
    """Summarise one model's test performance with macro-averaged scores.

    Args:
        model_name: Label stored under the 'Learning Model' key of the result.
        y_test: True labels of the evaluation set.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.

    Returns:
        dict with keys 'Learning Model', 'Precision', 'Recall' and 'F1-score';
        the three scores are rounded to two decimals.
    """
    # zero_division=0 yields the same score (0) that scikit-learn's default
    # uses for a class with no predicted/true samples, but without emitting
    # an UndefinedMetricWarning for every such model.
    scoring = {'average': 'macro', 'zero_division': 0}
    precision = precision_score(y_test, y_pred, **scoring)
    recall = recall_score(y_test, y_pred, **scoring)
    f1 = f1_score(y_test, y_pred, **scoring)
    return {
        'Learning Model': model_name,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1-score': round(f1, 2)
    }

# Function to apply different models
def apply_models(X_train_vec, X_test_vec, y_train, y_test):
    """Fit a fixed set of classifiers and score each on the test split.

    Args:
        X_train_vec: Vectorized training features.
        X_test_vec: Vectorized test features (same vocabulary as training).
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        list of metric dicts (one per model, as built by ``get_model_metrics``),
        in the order: Naïve Bayes, SVM, KNN, Logistic Regression,
        Random Forest, Decision Tree.
    """
    # (display name, estimator) pairs; list order is the order of the results.
    models = [
        ('Naïve Bayes', MultinomialNB()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        # The default iteration cap (100) is reached on bag-of-words features
        # and produces a ConvergenceWarning, so give the solver more room.
        ('Logistic Regression', LogisticRegression(max_iter=1000)),
        # Seed the tree-based models so repeated runs report the same scores.
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ]

    results = []
    for model_name, model in models:
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        results.append(get_model_metrics(model_name, y_test, y_pred))

    return results

# Upload the dataset through the Colab file picker
uploaded = files.upload()

# Load the first uploaded file (assumed to be an Excel workbook)
file_name = next(iter(uploaded))
data = pd.read_excel(file_name)

# Drop rows with a missing text or label; the sheet must have 'comments' and 'tag' columns
data = data.dropna(subset=['comments', 'tag'])
# Excel cells that contain only digits are read as numbers; CountVectorizer
# expects strings, so cast the text column explicitly.
X = data['comments'].astype(str)  # Features (comment text)
y = data['tag']  # Target labels (presumably positive / negative / neutral - confirm against the sheet)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to token counts; the vocabulary is learned from the training split only
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Apply the models and gather results
results = apply_models(X_train_vec, X_test_vec, y_train, y_test)

# Save the results to an Excel file
df_results = pd.DataFrame(results)
output_file = "model_comparison_results.xlsx"
df_results.to_excel(output_file, index=False)

# Download the output file
files.download(output_file)

print("Model comparison results have been saved and downloaded successfully.")


Saving 10%_data_file.xlsx to 10%_data_file.xlsx


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model comparison results have been saved and downloaded successfully.


**Updated version**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from google.colab import files

# Helper function to calculate metrics
def get_model_metrics(model_name, y_test, y_pred):
    """Summarise one model's test performance.

    Precision, recall and F1 are macro-averaged over the classes; accuracy is
    the plain fraction of correct predictions.

    Args:
        model_name: Label stored under the 'Learning Model' key of the result.
        y_test: True labels of the evaluation set.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.

    Returns:
        dict with keys 'Learning Model', 'Precision', 'Recall', 'F1-score' and
        'Accuracy'; the four scores are rounded to two decimals.
    """
    # zero_division=0 yields the same score (0) that scikit-learn's default
    # uses for a class with no predicted/true samples, but without emitting
    # an UndefinedMetricWarning for every such model.
    scoring = {'average': 'macro', 'zero_division': 0}
    precision = precision_score(y_test, y_pred, **scoring)
    recall = recall_score(y_test, y_pred, **scoring)
    f1 = f1_score(y_test, y_pred, **scoring)
    accuracy = accuracy_score(y_test, y_pred)
    return {
        'Learning Model': model_name,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1-score': round(f1, 2),
        'Accuracy': round(accuracy, 2)
    }

# Function to apply different models and evaluate performance
def apply_models(X_train_vec, X_test_vec, y_train, y_test):
    """Fit a fixed set of classifiers and score each on the test split.

    Args:
        X_train_vec: Vectorized training features.
        X_test_vec: Vectorized test features (same vocabulary as training).
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        list of metric dicts (one per model, as built by ``get_model_metrics``),
        in the order: Naïve Bayes, SVM, KNN, Logistic Regression,
        Random Forest, Decision Tree, Gradient Boosting.
    """
    # (display name, estimator) pairs; list order is the order of the results.
    # The tree-based estimators are seeded so repeated runs report the same scores.
    models = [
        ('Naïve Bayes', MultinomialNB()),
        ('SVM', SVC(kernel='linear')),
        ('KNN', KNeighborsClassifier()),
        ('Logistic Regression', LogisticRegression(max_iter=1000)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=10, random_state=42)),
        ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ]

    results = []
    for model_name, model in models:
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        results.append(get_model_metrics(model_name, y_test, y_pred))

    return results

# Upload the dataset through the Colab file picker
uploaded = files.upload()

# Load the first uploaded file (assumed to be an Excel workbook)
file_name = next(iter(uploaded))
data = pd.read_excel(file_name)

# Drop rows with a missing text or label; the sheet must have 'comments' and 'tag' columns
data = data.dropna(subset=['comments', 'tag'])
# Excel cells that contain only digits are read as numbers; TfidfVectorizer
# expects strings, so cast the text column explicitly.
X = data['comments'].astype(str)  # Features (comment text)
y = data['tag']  # Target labels (presumably positive / negative / neutral - confirm against the sheet)

# Split FIRST. Vectorizing and oversampling before the split leaks test
# information into training: IDF weights would be computed from test rows, and
# SMOTE would both synthesise training rows from test rows and place synthetic
# rows in the test set, inflating every reported score.
# stratify=y keeps the class proportions equal in both splits.
X_train_text, X_test_text, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the TF-IDF vocabulary and weights from the training split only
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Balance the TRAINING split only with SMOTE; the test split keeps the real class distribution.
# SMOTE needs more samples in a class than k_neighbors, so cap k by the smallest training class
# (the library default of 5 is used whenever every class is large enough).
smallest_class = int(y_train_raw.value_counts().min())
smote = SMOTE(
    sampling_strategy='auto',
    random_state=42,
    k_neighbors=max(1, min(5, smallest_class - 1)),
)
X_train, y_train = smote.fit_resample(X_train_vec, y_train_raw)

# Apply the models and gather results
results = apply_models(X_train, X_test, y_train, y_test)

# Save the results to an Excel file
df_results = pd.DataFrame(results)
output_file = "model_comparison_results_optimized.xlsx"
df_results.to_excel(output_file, index=False)

# Download the output file
files.download(output_file)

print("Model comparison results have been saved and downloaded successfully.")


Saving G1_Football & Other games & Wrosting video.xlsx to G1_Football & Other games & Wrosting video (1).xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model comparison results have been saved and downloaded successfully.
