# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

In [None]:
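# Uncomment the two lines below when running on Google Colab: the Drive must be
# mounted before the "/content/drive/..." paths used in this notebook resolve.
# from google.colab import drive
# drive.mount('/content/drive')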

# Data loading

In [None]:
# Location of the cleaned news-media-bias CSV on the mounted Google Drive.
read_path = "/content/drive/MyDrive/Colab Notebooks/M2/data/datasets/newsmediabias-clean.csv"

# Load the whole file into the frame that the evaluation cells read from.
df = pd.read_csv(filepath_or_buffer = read_path)

# Preview the first five rows (the cell's last expression is rendered as a table).
df.head(5)

Unnamed: 0,text,dimension,biased_words,aspect,label,sentiment,preprocessed_text
0,No picnic my phone smells like citrus.,Ageism and sentiment,[],Racial,Neutral,Positive,picnic phone smell like citrus
1,"@eloquentembrace You're going to kill me, but ...",Ageism and sentiment,[],Racial,Slightly Biased,Negative,going kill seen d waiting till one solid week ...
2,White trash vs us... We were outnumbered. htt...,Ageism and sentiment,['white trash'],Racial,Slightly Biased,Neutral,white trash v u outnumbered
3,poor mel. Feeling your pain.,Ageism and sentiment,[],Racial,Slightly Biased,Negative,poor mel feeling pain
4,I need some selsun blue...pretty sure i have a...,Ageism and sentiment,['rot'],Racial,Slightly Biased,Positive,need selsun blue pretty sure small spot haole rot


# Evaluating Support Vector Machine classifier

In [None]:
# Grid evaluation of an SVM text classifier.
#
# For every combination of sample size, vocabulary size, kernel and C, a
# bag-of-words -> TF-IDF -> SVC pipeline is scored with 5-fold cross-validation
# and the per-fold scores are written to one CSV file per combination.

# Rows that actually have preprocessed text; independent of the grid, so computed once.
boolean_indices = df['preprocessed_text'].notna()
label_to_id = {'Neutral': 0, 'Slightly Biased': 1, 'Highly Biased': 2}

for num_samples in [10000, 20000]:
  # The sample depends only on num_samples (the seed is fixed), so it is drawn
  # once per size instead of once per hyper-parameter combination.
  sampled_data = df[boolean_indices].sample(n = num_samples, random_state = 42)
  data = sampled_data['preprocessed_text']
  target = sampled_data['label'].map(label_to_id)

  # .map() turns any label missing from label_to_id into NaN; fail early with a
  # clear message instead of passing NaN targets on to scikit-learn.
  if target.isna().any():
    raise ValueError("Found labels outside {'Neutral', 'Slightly Biased', 'Highly Biased'}")

  for num_features in [1000, 2000]:
    for kernel in ['linear', 'rbf']:
      for C in [0.5, 1.0, 1.5]:
        print("Start: ", num_samples, num_features, kernel, C)

        # The vectorizer is a pipeline step so that its vocabulary (and the
        # max_features cut-off) is learned from the training folds only.
        # Fitting it on the full sample before cross-validation would let the
        # held-out folds influence feature selection.
        pipe = Pipeline([
          ('vectorizer', CountVectorizer(max_features = num_features)),
          ('transformer', TfidfTransformer()),
          ('svc', SVC(kernel = kernel, C = C, random_state = 42)),
        ])
        scores = cross_validate(pipe, data, target, cv = 5, scoring = ('accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
        results_df = pd.DataFrame(scores)

        # One CSV per combination; each row holds the timings and scores of one fold.
        results_path = f"/content/drive/MyDrive/Colab Notebooks/M2/data/results/results_svm/samples_{num_samples}&features_{num_features}&kernel_{kernel}&C_{C}.csv"
        results_df.to_csv(results_path, index = False)

        print("Finish: ", num_samples, num_features, kernel, C)

# Evaluating Naive Bayes classifier

In [1]:
# Grid evaluation of a Multinomial Naive Bayes text classifier.
#
# For every combination of sample size and vocabulary size, a
# bag-of-words -> TF-IDF -> MultinomialNB pipeline is scored with 5-fold
# cross-validation and the per-fold scores are written to one CSV file.

# Rows that actually have preprocessed text; independent of the grid, so computed once.
boolean_indices = df['preprocessed_text'].notna()
label_to_id = {'Neutral': 0, 'Slightly Biased': 1, 'Highly Biased': 2}

for num_samples in [10000, 20000, 39000]:
  # The sample depends only on num_samples (the seed is fixed), so it is drawn
  # once per size instead of once per vocabulary size.
  # NOTE(review): sample() raises if fewer than num_samples rows have text —
  # confirm the dataset has at least 39000 such rows.
  sampled_data = df[boolean_indices].sample(n = num_samples, random_state = 42)
  data = sampled_data['preprocessed_text']
  target = sampled_data['label'].map(label_to_id)

  # .map() turns any label missing from label_to_id into NaN; fail early with a
  # clear message instead of passing NaN targets on to scikit-learn.
  if target.isna().any():
    raise ValueError("Found labels outside {'Neutral', 'Slightly Biased', 'Highly Biased'}")

  for num_features in [1000, 2000, 5000, 10000]:
    print("Start: ", num_samples, num_features)

    # The vectorizer is a pipeline step so that its vocabulary (and the
    # max_features cut-off) is learned from the training folds only. Fitting it
    # on the full sample before cross-validation would let the held-out folds
    # influence feature selection.
    pipe = Pipeline([
      ('vectorizer', CountVectorizer(max_features = num_features)),
      ('transformer', TfidfTransformer()),
      ('nb', MultinomialNB()),
    ])
    scores = cross_validate(pipe, data, target, cv = 5, scoring = ('accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    results_df = pd.DataFrame(scores)

    # One CSV per combination; each row holds the timings and scores of one fold.
    results_path = f"/content/drive/MyDrive/Colab Notebooks/M2/data/results/results_naive_bayes/samples_{num_samples}&features_{num_features}.csv"
    results_df.to_csv(results_path, index = False)

    print("Finish: ", num_samples, num_features)