In [1]:
import pandas as pd

# Load the full task-2 table and keep the rows tagged as the training split.
all_data = pd.read_csv("data/task2_data.csv")
train = all_data.loc[all_data["source"] == "train"]

In [28]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Word-level TF-IDF over 1- to 3-grams: English stop words removed, accents
# folded to ASCII, and a minimum document frequency of 2.
# NOTE(review): the vocabulary is fitted on the "context" column while the
# features are extracted from "text" -- confirm this mismatch is intentional.
tfidf = TfidfVectorizer(
    analyzer="word",
    strip_accents="ascii",
    stop_words="english",
    ngram_range=(1, 3),
    min_df=2,
)
tfidf.fit(all_data["context"])

# Integer-encode the string labels; `le` is reused to decode predictions.
le = LabelEncoder()
le.fit(train["label"])

random_seed = 1956

# Hold out 15% of the training rows for validation, seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    tfidf.transform(train["text"]),
    le.transform(train["label"]),
    test_size=0.15,
    random_state=random_seed,
)


In [29]:
# Vectorise the dev and test texts with the already-fitted TF-IDF model.
X_dev = tfidf.transform(all_data.loc[all_data["source"] == "dev", "text"])
X_test = tfidf.transform(all_data.loc[all_data["source"] == "test", "text"])

In [3]:
from sklearn import model_selection
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingCVClassifier

import numpy as np
import warnings

# Silence every warning raised from here on (applies to the whole kernel).
warnings.simplefilter(action="ignore")

# Base learners of the stack.
clf1 = LinearSVC(C=1000)
clf2 = AdaBoostClassifier()
clf3 = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    n_jobs=-1,
)

# Meta-learner that combines the base learners' outputs.
lr = LogisticRegression()

sclf = StackingCVClassifier(
    classifiers=[clf1, clf2, clf3],
    meta_classifier=lr,
    random_state=random_seed,
)

# Search space: only the number of AdaBoost estimators is tuned.
params = dict(adaboostclassifier__n_estimators=[50, 100, 150])

# The grid search itself is disabled in this notebook.
# NOTE(review): presumably the pickled grids loaded by the prediction cell
# were produced by a run of the code below -- TODO confirm.
# grid = GridSearchCV(estimator=sclf, 
#                     param_grid=params, 
#                     cv=3,
#                     refit=True)
# grid.fit(X_train, y_train)

In [42]:
# Generate predictions
from utils import generate_t2_sub
import pickle

def write_submission(path, frame, predictions):
    """Write one tab-separated prediction line per row of ``frame``.

    Each line has the form ``article_id<TAB>label<TAB>start<TAB>end``.

    Parameters
    ----------
    path : str
        Destination file; it is created or overwritten.
    frame : pandas.DataFrame
        Rows to report; the ``article_id``, ``start`` and ``end`` fields of
        every row are written.
    predictions : sequence of str
        Predicted label for each row of ``frame``, in row order.

    Raises
    ------
    ValueError
        If ``predictions`` and ``frame`` differ in length, so that a
        mismatch can never produce a silently truncated file.
    """
    if len(predictions) != len(frame):
        raise ValueError(
            "got %d predictions for %d rows" % (len(predictions), len(frame))
        )
    with open(path, "w") as f:
        # Pair each row with its prediction instead of keeping a manual counter.
        for (_, r), label in zip(frame.iterrows(), predictions):
            f.write(str(r.article_id) + "\t" + label + "\t" + str(r.start) + "\t" + str(r.end) + "\n")


# Pickled, already-fitted grid-search objects, one per stacking ensemble.
ensembles = ["stack_ensemble.pkl",
             "stack_ensemble2.pkl",
             "stack_ensemble3.pkl",
             "stack_ensemble4.pkl",
             "stack_ensemble5.pkl",
             "stack_ensemble6.pkl"]

dev = all_data[all_data["source"] == "dev"]
test = all_data[all_data["source"] == "test"]

X_dev = tfidf.transform(dev["text"])
X_test = tfidf.transform(test["text"])

for e in ensembles:
    # SECURITY: unpickling executes arbitrary code -- only load trusted files.
    # The `with` block closes the handle, which the bare open() call leaked.
    with open(e, "rb") as model_file:
        grid = pickle.load(model_file)
    clf = grid.best_estimator_

    # Decode the integer class ids back to the original label strings.
    dev_pred = le.inverse_transform(clf.predict(X_dev))
    test_pred = le.inverse_transform(clf.predict(X_test))

    # Name the output files after the pickle, without its extension.
    e_name = e.split(".pkl")[0]
    write_submission("submissions/task2/dev_" + e_name + ".txt", dev, dev_pred)
    write_submission("submissions/task2/test_" + e_name + ".txt", test, test_pred)
print("finished")

finished


In [43]:
# Train SVC

In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd

# NOTE(review): this cell rebinds `all_data`, `tfidf` and `le`, names that the
# ensemble prediction cell also uses; re-running that cell after this one
# would pick up this vectorizer instead of the n-gram one.
all_data = pd.read_csv("data/task2_data.csv")
train = all_data.loc[all_data["source"] == "train"]
dev = all_data.loc[all_data["source"] == "dev"]
test = all_data.loc[all_data["source"] == "test"]

# Unigram TF-IDF with the vocabulary fitted on the text of every split.
tfidf = TfidfVectorizer(min_df=2, stop_words="english")
tfidf.fit(all_data["text"])
X_train = tfidf.transform(train["text"])

# Integer-encode the training labels.
le = LabelEncoder()
le.fit(train["label"])
y_train = le.transform(train["label"])

# Linear-kernel SVC trained on the whole training split (no hold-out here).
clf = SVC(kernel="linear", C=1000)
clf.fit(X_train, y_train)

# Predict the dev split and decode the class ids back to label strings.
X_dev = tfidf.transform(dev["text"])
dev_pred = clf.predict(X_dev)
dev_preds = le.inverse_transform(dev_pred)

# One line per dev row: article_id, predicted label, start, end (tab-separated).
with open("submissions/task2/dev_tfidf_svc.txt", "w") as f:
    for (i, r), label in zip(dev.iterrows(), dev_preds):
        f.write("\t".join([str(r.article_id), label, str(r.start), str(r.end)]) + "\n")

In [18]:
# Sanity check: show how many dev predictions were produced.
len(dev_preds)

1063