In [None]:
import pandas as pd

# Relative path of the CSV file that is handed to get_dataset() to load the data.
DATASET_FILE = "../data/train.csv"

In [None]:
from disaster_tweets.sklearn_models import get_dataset

# Load the dataset and unpack it into the object used for fitting (train_data)
# and the one used for evaluation (test_data); both are indexed with "target"
# further down to obtain the labels.
# NOTE(review): how get_dataset derives two splits from one CSV is not visible
# here — confirm the split ratio and whether it is seeded.
train_data, test_data = get_dataset(DATASET_FILE)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# Fixed seed so the stochastic estimators produce the same scores on every
# Restart & Run All; change it here to probe how sensitive the results are.
RANDOM_STATE = 42

# Candidate classifiers to compare, keyed by the label used in the printed
# reports and as heatmap titles.
MODELS = {
    "sgd": SGDClassifier(random_state=RANDOM_STATE),
    "random forest": RandomForestClassifier(random_state=RANDOM_STATE),
    # SVC only consumes random_state when probability=True, so the default
    # configuration is already deterministic.
    "svc": SVC(),
}

In [None]:
from sklearn.metrics import classification_report

def train_and_test(model):
    """Fit ``model`` on the training split and score it on the test split.

    Both splits are read from the notebook-level ``train_data`` and
    ``test_data`` globals. The whole object is passed as the feature input and
    its ``"target"`` entry is used as the label.

    NOTE(review): the feature input still contains the ``"target"`` column, so
    this presumably relies on the pipeline selecting its own input column(s) —
    confirm against ``create_pipeline`` to rule out label leakage.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The ``classification_report`` result in dictionary form
        (``output_dict=True``).
    """
    y_train = train_data["target"]
    model.fit(train_data, y_train)
    y_pred = model.predict(test_data)
    y_test = test_data["target"]
    return classification_report(y_test, y_pred, output_dict=True)

In [None]:
from time import perf_counter
from disaster_tweets.sklearn_models import create_pipeline, classification_heatmap, report_to_df

# Benchmark every candidate classifier and keep its report, keyed by model name.
# The timer wraps pipeline creation, the train_and_test() call and the report
# conversion, so the printed "training time" is the whole cycle, not fit() alone.
reports = {}
for name, classifier in MODELS.items():
    start = perf_counter()
    model = create_pipeline(classifier)
    report = report_to_df(train_and_test(model))
    end = perf_counter()
    reports[name] = report
    # One call, newline-separated: blank line, summary line, then the report.
    print("", f"model: {name}, training time: {end - start}s", report, sep="\n")

In [None]:
# Call classification_heatmap once per collected report, labelled with the model name.
for model_name, model_report in reports.items():
    classification_heatmap(model_report, model_name)

In [None]:
# Baseline SGD pipeline that serves as the estimator for hyper-parameter search.
# Parameter reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

# random_state is pinned because SGD shuffles the training data between epochs;
# without a seed the search results differ from run to run.
sgd_model = create_pipeline(SGDClassifier(random_state=42))
# Display every parameter name the pipeline exposes (including nested
# "<step>__<param>" names) so that a search grid can reference them.
sgd_model.get_params()

In [None]:
import re

import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# scikit-learn 1.1 renamed the logistic loss from "log" to "log_loss" and 1.3
# removed the old spelling, so use whichever name the installed version accepts.
# With the wrong name those grid candidates fail to fit and score as NaN.
_sk_major, _sk_minor = map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
LOG_LOSS = "log_loss" if (_sk_major, _sk_minor) >= (1, 1) else "log"

# Hyper-parameter grid. The "classifier__" prefix addresses the nested estimator
# parameters listed by sgd_model.get_params().
# NOTE(review): presumably "classifier" is the step name chosen by create_pipeline — confirm.
params = {
    "classifier__loss": ["hinge", LOG_LOSS, "modified_huber", "squared_hinge"],
    "classifier__max_iter": [1000, 10_000, 100],
}

# Exhaustive search over the grid, ranking candidates by cross-validated F1.
grid_search = GridSearchCV(sgd_model, params, scoring=make_scorer(f1_score))
grid_search.fit(train_data, train_data["target"])
# Show the per-candidate results as a table, best rank first, instead of
# dumping the raw dict of arrays.
pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")

In [None]:
# Show the winning pipeline together with its best cross-validated score
# (F1, per the scorer given to GridSearchCV).
grid_search.best_estimator_, grid_search.best_score_

In [None]:
# Evaluate the grid-search winner on test_data and plot its classification report.
model = grid_search.best_estimator_
predictions = model.predict(test_data)
report = classification_report(
    y_true=test_data["target"],
    y_pred=predictions,
    output_dict=True,
)
classification_heatmap(report_to_df(report), "Best Estimator")

In [None]:
import re

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from disaster_tweets.tokenizer import tokenize

# scikit-learn 1.1 renamed the logistic loss from "log" to "log_loss" and 1.3
# removed the old spelling (fit() raises on it), so use whichever name the
# installed version accepts.
_sk_major, _sk_minor = map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
LOG_LOSS = "log_loss" if (_sk_major, _sk_minor) >= (1, 1) else "log"

# SGD pipeline with hand-picked hyper-parameters, a TF-IDF vectorizer and the
# project tokenizer. random_state is pinned so the reported scores are
# reproducible across runs.
sgd_model_tuned = create_pipeline(
    SGDClassifier(loss=LOG_LOSS, max_iter=10_000, random_state=42),
    vectorizer=TfidfVectorizer,
    tokenizer=tokenize,
)
start = perf_counter()
report = report_to_df(train_and_test(sgd_model_tuned))
end = perf_counter()
# Elapsed seconds for train_and_test() plus report conversion, shown as cell output.
end - start

In [None]:
# Pass the tuned SGD pipeline's report to classification_heatmap, labelled "sgd_model_tuned".
classification_heatmap(report, "sgd_model_tuned")

In [None]:
# Display the tuned pipeline's report (as returned by report_to_df) as the cell output.
report