# Code 1

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

# Turn the cleaned text into TF-IDF features over unigrams and bigrams.
# Float min_df/max_df are document-frequency proportions: terms occurring in
# fewer than 0.1% or more than 75% of the documents are dropped.
vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             min_df=0.001,
                             max_df=0.75,
                             stop_words='english')

# NOTE(review): the vectorizer is fit on the full data set before any
# cross-validation split, so the CV scores below see vocabulary/IDF statistics
# from the held-out folds. Wrap it in a Pipeline if a stricter estimate is needed.
X = vectorizer.fit_transform(data['clean_text'])
y = data['output']

print(X.shape, y.shape)

# get baseline performance: always predict the most frequent class
most_frequent = DummyClassifier(strategy='most_frequent')
print(cross_val_score(most_frequent, X, y=y, cv=5, n_jobs=-1, scoring="f1_micro").mean())

# fine-tune classifier
# NOTE(review): `base_estimator` was renamed to `estimator` in newer
# scikit-learn releases; adjust the keyword, the grid keys and the attribute
# access below together if the installed version no longer accepts it.
base_clf = CalibratedClassifierCV(cv=5,
    base_estimator=LogisticRegression(n_jobs=-1,
                                      solver='lbfgs'
                                     )
)
# class_weight accepts 'balanced', a dict, or None. The former 'auto' option
# is no longer a valid value and makes every candidate using it fail to fit,
# so the unweighted alternative is spelled as None.
param_grid = {'base_estimator__C': [50, 20, 10, 1.0, 0.5, 0.1, 0.05, 0.01],
              'base_estimator__class_weight': ['balanced', None]
             }
search = GridSearchCV(base_clf, param_grid, cv=5, scoring='f1_micro')
search.fit(X, y)

# use best classifier to get performance estimate
# `base_estimator` is the LogisticRegression template carrying the winning
# hyper-parameters; it is not fitted itself (cross_val_score fits clones of it).
clf = search.best_estimator_.base_estimator
print(cross_val_score(clf, X, y=y, cv=5, n_jobs=-1, scoring="f1_micro").mean())

# Code 2

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline

# set up the sequence: optional feature selection followed by the classifier
# tuned in the previous cell
pipe = Pipeline([
    ('reduce_dim', 'passthrough'),
    ('classifier', clf)
])

# specify selection range: candidate numbers of features to keep
N_FEATURES = [1800, 1500, 1000, 500, 300]
param_grid = [
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES
    },
]

# fit the model to different feature sets
grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid, cv=5, scoring='f1_micro')
grid.fit(X, y)

# save the best selector
# Take the step from the refit best pipeline: that object is fitted on all of
# X with the winning k. `grid.best_params_['reduce_dim']` is only the
# SelectKBest template from the parameter grid -- it is not the fitted step,
# and the chosen k lives in the separate 'reduce_dim__k' entry.
selector = grid.best_estimator_.named_steps['reduce_dim']
X_sel = selector.transform(X)

# refit classifier on entire, dimensionality-reduced data set
clf.fit(X_sel, y)

# NOTE: the selector has already seen every row of X, so this score is an
# optimistic estimate; `grid.best_score_` is the one where selection happened
# inside each fold.
cv_reg = cross_val_score(clf, X_sel, y=y, cv=5, n_jobs=-1, scoring="f1_micro")
print("5-CV on train: {}".format(cv_reg.mean()))


# Code 3

In [None]:
# `new_data` is expected to be loaded already and to carry a 'clean_text' column.
# Vectorize the unseen texts with the vectorizer that was fitted on the
# training data (transform only -- refitting here would produce a different
# feature space than the selector and classifier were trained on).
new_texts = new_data['clean_text']
Z = vectorizer.transform(new_texts)

# Keep only the feature columns chosen on the training data.
Z_sel = selector.transform(Z)

# Label the new documents with the trained classifier.
predictions = clf.predict(Z_sel)
