In [1]:
### Data reading
import pandas as pd

train = pd.read_csv('essays-train.txt', sep='\t', header=0, encoding='utf-8')
test = pd.read_csv('essays-test.txt', sep='\t', header=0, encoding='utf-8')
ext = pd.read_csv('ext_data.csv', sep='\t', header=0, encoding='utf-8')

In [2]:
### Data preprocessing
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def preprocess(data):
    # removing punctuation
    data['text'] = data['text'].apply(lambda x: x.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))))

    # removing stopwords
    sw = stopwords.words('english')
    data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

    # stemming
    st = PorterStemmer()
    data['text'] = data['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
    
    return data

# preprocess the text data of training data, test data and external data
train = preprocess(train)
test = preprocess(test)
external_data = preprocess(ext)

In [52]:
### Features selection and Modelling to build a pipeline
from sklearn.pipeline import Pipeline

# vectorizing - Countvectorizing, TF-IDF
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
# different models - Logistic Regression, Support Vector Machine and Multinomial Naive Bayes.
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
# from sklearn import svm


def pipeline(model,feat):
    if model == 'LR':
        # Logistic regression model using countvectors and TF-IDF
        if feat == 'count':
            return Pipeline([('vect', CountVectorizer()), ('clf', LogisticRegression())])
        if feat == 'tfidf':
            return Pipeline([('tfidf', TfidfVectorizer(min_df=6, max_features=None, strip_accents='unicode',
                                                       analyzer="word", token_pattern=r'\w{1,}', ngram_range=(1, 2),
                                                       use_idf=1, smooth_idf=1, sublinear_tf=1)), 
                             ('clf', LogisticRegression())])
    
    if model == 'NB':
        # Multinomial Naive Bayes model using countvectors and TF-IDF
        if feat == 'count':
            return Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
        if feat == 'tfidf':
            return Pipeline([('tfidf', TfidfVectorizer(min_df=6, max_features=None, strip_accents='unicode',
                                                       analyzer="word", token_pattern=r'\w{1,}', ngram_range=(1, 2),
                                                       use_idf=1, smooth_idf=1, sublinear_tf=1)),
                             ('clf', MultinomialNB())])    
    if model == 'SVM':
        # SVM model using countvectors and TF-IDF
        if feat == 'count':
            return Pipeline([('vect', CountVectorizer()), ('clf', SGDClassifier())])
        if feat == 'tfidf':
            return Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier())])

In [53]:
### train the data and make a prediction of the test data

def fit_predict(model, feat, x, y, test_x):
    clf = pipeline(model, feat).fit(x, y)
    pred = clf.predict(test_x)
    
    return pred


### See the performance of each model's error metric
from sklearn.metrics import confusion_matrix

def error_metric(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    true_neg, false_pos = cm[0]
    false_neg, true_pos = cm[1]
    accuracy = round((true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg),3)
    precision = round((true_pos) / (true_pos + false_pos),3)
    recall = round((true_pos) / (true_pos + false_neg),3)
    f1 = round(2 * (precision * recall) / (precision + recall),3)
    
    print('Accuracy: {}'.format(accuracy))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1 Score: {}'.format(f1))
    print()

# Logistic Regression classifier
predicted_LR_count = fit_predict('LR','count',train['text'],train['label'],test['text'])
predicted_LR_tfidf = fit_predict('LR','tfidf',train['text'],train['label'],test['text'])
print('Logistic Regression')
error_metric(test['label'],predicted_LR_count)
error_metric(test['label'],predicted_LR_tfidf)

# NB classifier
predicted_NB_count = fit_predict('NB','count',train['text'],train['label'],test['text'])
predicted_NB_tfidf = fit_predict('NB','tfidf',train['text'],train['label'],test['text'])
print('Naive Bayes')
error_metric(test['label'],predicted_NB_count)
error_metric(test['label'],predicted_NB_tfidf)

# SVM classifier
predicted_svm_count = fit_predict('SVM','count',train['text'],train['label'],test['text'])
predicted_svm_tfidf = fit_predict('SVM','tfidf',train['text'],train['label'],test['text'])
print('SVM')
error_metric(test['label'],predicted_svm_count)
error_metric(test['label'],predicted_svm_count)

Logistic Regression
Accuracy: 0.44
Precision: 0.375
Recall: 0.6
F1 Score: 0.462

Accuracy: 0.6
Precision: 0.5
Recall: 0.1
F1 Score: 0.167

Naive Bayes
Accuracy: 0.56
Precision: 0.4
Recall: 0.2
F1 Score: 0.267

Accuracy: 0.56
Precision: 0.0
Recall: 0.0
F1 Score: nan

SVM
Accuracy: 0.48
Precision: 0.412
Recall: 0.7
F1 Score: 0.519

Accuracy: 0.48
Precision: 0.412
Recall: 0.7
F1 Score: 0.519



  f1 = round(2 * (precision * recall) / (precision + recall),3)


In [54]:
### Parameter tuning by using gridsearch
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

# check logistic regression vs. naive bayes model
# check countvector vs. tf-idf

unigram_log_pipe = Pipeline([('cv', CountVectorizer()),('logreg', LogisticRegression())])

ngram_pipe = Pipeline([('cv', CountVectorizer(ngram_range=(1, 2))), ('mnb', MultinomialNB())])

tfidf_pipe = Pipeline([('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])

classifiers = [("ngram", ngram_pipe), ("unigram", unigram_log_pipe), ("tfidf", tfidf_pipe),]

mixed_pipe = Pipeline([("voting", VotingClassifier(classifiers, voting="soft"))])

def combinations_on_off(num_classifiers):
    return [[int(x) for x in list("{0:0b}".format(i).zfill(num_classifiers))]
            for i in range(1, 2 ** num_classifiers)]

param_grid = dict(voting__weights=combinations_on_off(len(classifiers)))
grid_search = GridSearchCV(mixed_pipe, param_grid=param_grid, n_jobs=-1, verbose=10, scoring="neg_log_loss")

grid_search.fit(train['text'],train['label'])

cv_results = grid_search.cv_results_

for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(params, mean_score)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
# represents [ngram_pipe, unigram_log_pipe classifiers, tfidf_pipe]
# for example, [0, 0, 1] means we should use only tf-idf and not use bigrams nor logistic regression

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  32 out of  35 | elapsed:   15.4s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   15.7s finished


{'voting__weights': [0, 0, 1]} -0.6903748056575155
{'voting__weights': [0, 1, 0]} -0.9001929366617831
{'voting__weights': [0, 1, 1]} -0.7255989832187952
{'voting__weights': [1, 0, 0]} -5.161628084769083
{'voting__weights': [1, 0, 1]} -0.8728606507322535
{'voting__weights': [1, 1, 0]} -1.0171236426355847
{'voting__weights': [1, 1, 1]} -0.8080595851313813
Best score: -0.690
Best parameters set:
	voting__weights: [0, 0, 1]


In [55]:
# grid search for SVM model

param_svm = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}

# find the best parameters for both the feature extraction and the classifier
def grid_search(pipeline, parameters, X_train, y_train):
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    best_model = grid_search.fit(X_train, y_train)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return best_model
        
# SVM model grid search
best_svm_count = grid_search(pipeline('SVM','count'),param_svm,train['text'],train['label'])
best_svm_tfidf = grid_search(pipeline('SVM','tfidf'),param_svm,train['text'],train['label'])

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best score: 0.584
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 1)
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.607
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    2.9s finished


In [56]:
### use external data
# I will not use logistic regression model and countvectors
# since we see the result of grid search.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(external_data['text'], external_data['label'], test_size = 0.33, random_state = 0)

# NB classifier
predicted_NB_tfidf = fit_predict('NB','tfidf', X_train, y_train, X_test)

# SVM classifier
predicted_svm_count = fit_predict('SVM','count', X_train, y_train, X_test)
predicted_svm_tfidf = fit_predict('SVM','tfidf', X_train, y_train, X_test)

# additionally, grid search for the best SVM model
best_svm_count = grid_search(pipeline('SVM','count'),param_svm,X_train, y_train)
best_svm_tfidf = grid_search(pipeline('SVM','tfidf'),param_svm,X_train, y_train)
# and test those best models
best_svm_count.fit(X_train, y_train)
pred_best_count = best_svm_count.predict(X_test)
best_svm_tfidf.fit(X_train, y_train)
pred_best_tfidf = best_svm_tfidf.predict(X_test)


# show error metric of each model (using only external data for training and testing the model)
print('MultinomialNB model')
error_metric(y_test,predicted_NB_tfidf)

print('SVM models')
error_metric(y_test,predicted_svm_count)
error_metric(y_test,predicted_svm_tfidf)

print('SVM models after parameter tuning')
error_metric(y_test,pred_best_count)
error_metric(y_test,pred_best_tfidf)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   32.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best score: 0.648
Best parameters set:
	clf__alpha: 1e-06
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   23.1s finished


Best score: 0.649
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   30.4s finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   30.5s finished


MultinomialNB model
Accuracy: 0.644
Precision: 0.66
Recall: 0.56
F1 Score: 0.606

SVM models
Accuracy: 0.643
Precision: 0.658
Recall: 0.561
F1 Score: 0.606

Accuracy: 0.651
Precision: 0.656
Recall: 0.603
F1 Score: 0.628

SVM models after parameter tuning
Accuracy: 0.65
Precision: 0.648
Recall: 0.622
F1 Score: 0.635

Accuracy: 0.656
Precision: 0.643
Recall: 0.667
F1 Score: 0.655



In [57]:
# train the model with external data 
# and use provided test data for testing the model

# NB classifier
predicted_NB_tfidf = fit_predict('NB','tfidf', external_data['text'], external_data['label'], test['text'])

# SVM classifier
predicted_svm_count = fit_predict('SVM','count',external_data['text'], external_data['label'], test['text'])
predicted_svm_tfidf = fit_predict('SVM','tfidf',external_data['text'], external_data['label'], test['text'])

# additionally, grid search for the best SVM model
best_svm_count = grid_search(pipeline('SVM','count'),param_svm,external_data['text'], external_data['label'])
best_svm_tfidf = grid_search(pipeline('SVM','tfidf'),param_svm,external_data['text'], external_data['label'])
# and test those best models
best_svm_count.fit(X_train, y_train)
pred_best_count = best_svm_count.predict(test['text'])
best_svm_tfidf.fit(X_train, y_train)
pred_best_tfidf = best_svm_tfidf.predict(test['text'])

# show error metric of each model 
print('MultinomialNB model')
error_metric(test['label'],predicted_NB_tfidf)

print('SVM models')
error_metric(test['label'],predicted_svm_count)
error_metric(test['label'],predicted_svm_tfidf)

print('SVM models after parameter tuning')
error_metric(test['label'],pred_best_count)
error_metric(test['label'],pred_best_tfidf)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   34.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best score: 0.652
Best parameters set:
	clf__alpha: 1e-06
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   41.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best score: 0.659
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   26.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   26.2s finished


MultinomialNB model
Accuracy: 0.48
Precision: 0.429
Recall: 0.9
F1 Score: 0.581

SVM models
Accuracy: 0.48
Precision: 0.435
Recall: 1.0
F1 Score: 0.606

Accuracy: 0.44
Precision: 0.417
Recall: 1.0
F1 Score: 0.589

SVM models after parameter tuning
Accuracy: 0.56
Precision: 0.476
Recall: 1.0
F1 Score: 0.645

Accuracy: 0.48
Precision: 0.435
Recall: 1.0
F1 Score: 0.606



