In [13]:
import pandas as pd

# Load the full task-2 table, then keep only the rows whose `source` column
# is exactly "train" for model fitting.
all_data = pd.read_csv("data/task2_data.csv")
train = all_data.loc[all_data["source"].eq("train")]

In [36]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Seed reused for the train/validation split here and by the model cell below.
random_seed = 1956

# TF-IDF over word 1- to 3-grams: accents folded to ASCII, English stop words
# removed, and terms appearing in fewer than 2 documents dropped.
# NOTE(review): the vocabulary is learned from all_data["context"] but applied
# to train["text"] below -- confirm that using two different columns is intended.
# NOTE(review): fitting on `all_data` means non-training rows also shape the
# vocabulary/IDF weights -- confirm this is acceptable for the evaluation.
tfidf = TfidfVectorizer(
    strip_accents="ascii",
    analyzer="word",
    stop_words="english",
    ngram_range=(1, 3),
    min_df=2,
).fit(all_data["context"])

# Map the label values seen in the training rows to integer codes.
le = LabelEncoder().fit(train["label"])

# Vectorize the training text, encode its labels, and hold out 15% of the
# rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    tfidf.transform(train["text"]),
    le.transform(train["label"]),
    test_size=.15,
    random_state=random_seed,
)

In [42]:
from sklearn import model_selection
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingCVClassifier

import numpy as np
import warnings

# Silences every warning category for the rest of the kernel session, so any
# warnings raised during the search below will not be shown.
warnings.simplefilter('ignore')

# Base learners and meta-learner for the stacked ensemble. Each estimator is
# seeded with the notebook-wide `random_seed`: the `random_state` passed to
# StackingCVClassifier does not seed these estimators, so without it repeated
# runs could select different "best" parameters and report different scores.
clf1 = LinearSVC(random_state=random_seed)
clf2 = AdaBoostClassifier(random_state=random_seed)
clf3 = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
lr = LogisticRegression(random_state=random_seed)

# The predictions of the three base classifiers become the input features of
# the logistic-regression meta-classifier.
sclf = StackingCVClassifier(classifiers = [clf1, clf2, clf3],
                            meta_classifier=lr,
                            random_state=random_seed)

# Hyper-parameter grid, keyed by the lower-cased class name of each base
# estimator. 4 * 3 * 3 * 4 = 144 candidates, each evaluated with 3-fold CV
# (432 fits of the full stack, plus one final refit) -- this cell is slow.
params = {'linearsvc__C': [1, 10, 100, 1000],
          'adaboostclassifier__n_estimators': [50, 100, 150],
          'randomforestclassifier__n_estimators': [100, 200, 500],
          'randomforestclassifier__max_depth': [2,4,8,10]}

# Exhaustive search; refit=True retrains the best candidate on all of
# X_train so that `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(estimator=sclf, 
                    param_grid=params, 
                    cv=3,
                    refit=True)
grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

In [38]:
# Number of distinct label values the encoder learned from the training rows.
le.classes_.shape[0]

14