In [61]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [23]:
# The three CSV files are read without a header row; the column is named "text".
train = pd.read_csv(
    '../data/train.csv',
    names=["text"],
    header=None,
)
hidden = pd.read_csv(
    '../data/hidden.csv',
    names=["text"],
    header=None,
)
answers = pd.read_csv(
    '../data/answers.csv',
    names=["text"],
    header=None,
)

In [24]:
def separate_xy(df, prefix="train"):
    """Split every ``df["text"]`` value into a feature string and a label.

    The last character of each ``text`` value becomes the label and all
    characters before it become the feature string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``"text"``.
    prefix : str, default "train"
        Prefix of the two added columns, ``<prefix>_X`` (features) and
        ``<prefix>_y`` (label).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by ``<prefix>_X`` and
        ``<prefix>_y``. The frame passed in is not modified, so the cell that
        calls this can be re-run safely.
    """
    # Vectorized string slicing; missing or empty values yield NaN instead of raising.
    text = df["text"].str
    return df.assign(**{prefix + "_X": text[:-1], prefix + "_y": text[-1]})

# Add the <prefix>_X / <prefix>_y columns to each of the three frames.
train = separate_xy(train, prefix='train')
hidden = separate_xy(hidden, prefix='hidden')
answers = separate_xy(answers, prefix='answers')


In [25]:
# Sanity check: every hidden_X value equals the answers_X value in the same row.
# The element-wise comparison is reduced with the vectorized .all() instead of
# summing the boolean Series with Python's builtin sum.
(answers["answers_X"] == hidden["hidden_X"]).all()

True

In [39]:
# Character-level TF-IDF features; ngram_range=(1, 1) restricts them to single characters.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), analyzer='char')

# Fit the vectorizer on the training strings only, then reuse the fitted
# vectorizer to transform the evaluation strings.
train_X_tfidf = vectorizer.fit_transform(train["train_X"])
test_X_tfidf = vectorizer.transform(answers["answers_X"])

# Label columns that go with the two feature matrices above.
train_y = train["train_y"]
test_y = answers["answers_y"]

In [40]:
# Naive Bayes classifier fitted on the TF-IDF training matrix.
# MultinomialNB (like SVC) is imported in the notebook's top import cell.
nb_model = MultinomialNB()
nb_model.fit(train_X_tfidf, train_y)


In [41]:

# Baseline SVM classifier: no kernel or C is passed, so SVC uses its defaults.
# NOTE(review): per the scikit-learn docs, probability=True enables
# predict_proba at the cost of an extra internal cross-validation during fit;
# the cells shown here only call predict() -- confirm it is actually needed.
svm_model = SVC(probability=True)
svm_model.fit(train_X_tfidf, train_y)


In [43]:
# Predict evaluation-set labels with both fitted models (baseline SVM, Naive Bayes).
svm_predictions = svm_model.predict(test_X_tfidf)
nb_predictions = nb_model.predict(test_X_tfidf)

# Print the raw predicted label arrays, Naive Bayes first.
print("Naive Bayes Predictions:", nb_predictions)
print("SVM Predictions:", svm_predictions)


Naive Bayes Predictions: ['S' '1' '2' 'R' 'S' 'S' '1' 'S' 'R' 'G' 'S' '2' 'S' 'S' 'S' 'R' 'S' 'S'
 'S' 'P' '2' 'S' 'P' 'S' 'R' 'R' 'S' 'R' 'R' 'P' 'S' 'S' 'P' 'S' 'G' 'R'
 'G' 'S' 'P' '2' 'S' 'R' 'R' 'R' '1' 'S' 'S' 'S' 'P' 'S' 'K' 'R' 'G' 'K'
 'P' 'G' 'S' 'R' '2' 'S' '2' 'S' 'S' 'P' 'R' 'S' '1' 'R' 'S' 'S' 'R' '2'
 'R' 'K' 'P' 'S' '2' 'S' 'R' 'R' '2' 'R' 'S' 'R' 'S' 'S' 'P' 'S' 'K' 'S'
 'S' 'P' 'G' 'S' 'P' 'R' 'P' 'G' 'S' 'K' 'P' 'P' 'S' 'R' '2' '2' '2' 'P'
 'S' 'G' 'G' 'P' 'P' 'S' 'G' 'K' 'R' 'S' 'G' 'S' 'S' 'R' 'S' 'P' 'G' 'R'
 'S' 'P' 'R' '2' 'P' '2' 'S' 'P' 'G' 'K' 'P' 'S' 'R' 'S' 'S' '1' 'R' 'G'
 'S' '2' 'R' 'R' 'R' 'P' '2' 'G' 'R' 'G' '1' 'K' 'S' 'S' 'S' 'R' 'S' 'R'
 'P' 'S' 'W' 'K' 'R' 'P' 'S' '2' 'S' 'P' 'P' '2' '2' 'S' '1' 'S' 'R' 'P'
 'P' 'S' 'G' '1' '1' '2' 'P' 'P' 'S' 'G' 'K' 'S' '1' 'R' 'P' 'K' 'S' 'R'
 'S' 'S' 'G' '2' 'G' '2' 'P' '2' 'S' '1' 'G' 'P' 'S' 'S' 'S' 'R' 'R' '1'
 'K' 'S' 'K' 'S' 'R' 'P' 'S' 'S' 'G' 'S' 'S' 'S' 'S' '2' 'S' 'P' 'S' 'S'
 'S' 'S' 'R' 'P' 'R' 'S' '

In [55]:
# Naive Bayes accuracy: fraction of evaluation labels predicted correctly.
# np.mean over the boolean match array replaces builtin sum(...) / len(...).
np.mean(np.asarray(test_y) == nb_predictions)

0.5408970976253298

In [56]:
# Baseline SVM accuracy: fraction of evaluation labels predicted correctly.
# np.mean over the boolean match array replaces builtin sum(...) / len(...).
np.mean(np.asarray(test_y) == svm_predictions)

0.7255936675461742

In [None]:
# Great! We got ~72.6% accuracy with an out-of-the-box SVM classifier. Next, let's explore different kernels.

In [60]:
# SVM classifier with an explicit RBF kernel.
# 'rbf' is already SVC's default kernel, so this reproduces the baseline
# SVC(probability=True) model; the recorded accuracy matches it exactly.
svm_rbf = SVC(kernel='rbf', probability=True)
svm_rbf.fit(train_X_tfidf, train_y)

svm_rbf_predictions = svm_rbf.predict(test_X_tfidf)

# Accuracy: fraction of evaluation labels predicted correctly.
np.mean(np.asarray(test_y) == svm_rbf_predictions)

0.7255936675461742

In [59]:

# SVM classifier with a degree-3 polynomial kernel.
svm_poly = SVC(kernel='poly', degree=3, probability=True)
svm_poly.fit(train_X_tfidf, train_y)

svm_poly_predictions = svm_poly.predict(test_X_tfidf)

# Accuracy: fraction of evaluation labels predicted correctly.
np.mean(np.asarray(test_y) == svm_poly_predictions)

0.6912928759894459

In [75]:

# Uses train_X_tfidf / train_y / test_X_tfidf / test_y from the TF-IDF cell.

# Search space: only the regularization strength C varies; gamma and kernel
# each have a single candidate value.
param_grid = {
    'C': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],  # Regularization parameter
    'gamma': ['scale'],  # Kernel coefficient (single candidate)
    'kernel': ['rbf'],  # Only the RBF kernel is searched; add entries to compare kernels
}

# 5-fold cross-validated grid search; refit=True refits the best candidate
# on the whole training set once the search finishes.
# NOTE(review): in the saved output the CV fold scores (~0.3) are far below the
# evaluation-set accuracy (~0.73) -- confirm the training rows are shuffled and
# the folds are representative.
grid_search = GridSearchCV(SVC(probability=True), param_grid, refit=True, verbose=3, cv=5)

# Fit the search on the training data
grid_search.fit(train_X_tfidf, train_y)

# Print the best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Predict on the evaluation set using the refitted best model.
# NOTE: this rebinds svm_predictions, which the baseline SVM cell also assigns.
best_svm = grid_search.best_estimator_
svm_predictions = best_svm.predict(test_X_tfidf)

# Accuracy: fraction of evaluation labels predicted correctly
accuracy = np.mean(np.asarray(test_y) == svm_predictions)
print("Accuracy:", accuracy)

# Detailed per-label report. zero_division=0 reports the same values as the
# default ("warn") but without the UndefinedMetricWarning that is raised for
# labels that are never predicted.
print(classification_report(test_y, svm_predictions, zero_division=0))


Fitting 5 folds for each of 7 candidates, totalling 35 fits




[CV 1/5] END ....C=0.7, gamma=scale, kernel=rbf;, score=0.281 total time=   1.6s
[CV 2/5] END ....C=0.7, gamma=scale, kernel=rbf;, score=0.306 total time=   1.6s
[CV 3/5] END ....C=0.7, gamma=scale, kernel=rbf;, score=0.299 total time=   1.6s
[CV 4/5] END ....C=0.7, gamma=scale, kernel=rbf;, score=0.311 total time=   1.5s
[CV 5/5] END ....C=0.7, gamma=scale, kernel=rbf;, score=0.331 total time=   1.5s
[CV 1/5] END ...C=0.75, gamma=scale, kernel=rbf;, score=0.289 total time=   1.7s
[CV 2/5] END ...C=0.75, gamma=scale, kernel=rbf;, score=0.308 total time=   1.7s
[CV 3/5] END ...C=0.75, gamma=scale, kernel=rbf;, score=0.299 total time=   1.6s
[CV 4/5] END ...C=0.75, gamma=scale, kernel=rbf;, score=0.313 total time=   1.5s
[CV 5/5] END ...C=0.75, gamma=scale, kernel=rbf;, score=0.333 total time=   1.5s
[CV 1/5] END ....C=0.8, gamma=scale, kernel=rbf;, score=0.289 total time=   1.5s
[CV 2/5] END ....C=0.8, gamma=scale, kernel=rbf;, score=0.314 total time=   1.5s
[CV 3/5] END ....C=0.8, gamm

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [85]:

# Best SVM classifier, refitted with hard-coded hyperparameters.
# NOTE(review): C=0.80 is presumably the grid-search winner; consider reading
# it from grid_search.best_params_ instead of hard-coding it -- confirm.
svm_best = SVC(C=0.80, gamma='scale', kernel='rbf', probability=True)
svm_best.fit(train_X_tfidf, train_y)

svm_best_predictions = svm_best.predict(test_X_tfidf)

# Accuracy: fraction of evaluation labels predicted correctly.
np.mean(np.asarray(test_y) == svm_best_predictions)

0.7308707124010554