In [1]:
import pandas as pd
from langdetect import detect
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sacremoses import MosesDetokenizer
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.sklearn_api import D2VTransformer, LsiTransformer, LdaTransformer

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

np.random.seed(1999)

In [2]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))   
    return result

In [3]:
pol = pd.read_csv("Policy.csv", engine = "python", names = ["Policy", "Text"])
non_pol = pd.read_csv("No_Policy - Corpus.csv", engine = "python", names = ["Policy", "Text"])
df = pd.DataFrame.append(pol, non_pol)
df = df.sample(frac=1).reset_index(drop=True)
df['Policy'].loc[df['Policy'] == -1] = 0
df['Text_lang'] = df.apply(lambda row: detect(row['Text']), axis=1)
df = df[df.Text_lang.isin(['en'])]
stop_words = set(stopwords.words('english')) 
df['Text'] = df['Text'].map(preprocess)
df = df.dropna(axis= 0)
df = df.reset_index(drop = True)
mdtk = MosesDetokenizer()
df['Text'] = df['Text'].map(mdtk.detokenize)
#data_text = df[['Text']]
#data_text['index'] = data_text.index
#documents = data_text


#processed_docs = documents['Text'].map(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [4]:
data_x = df[['Text']].as_matrix()
data_y = df.drop(['Text', 'Text_lang'], axis=1).as_matrix()
stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.33)

documents = df[['Text']]
documents['index'] = documents.index

processed_docs = documents['Text'].map(preprocess)
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    count += 1
    if count > 10:
        break
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_wow = pd.Series((v for v in bow_corpus))


for train_index, test_index in stratified_split.split(data_x, data_y):
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]
    
    common_texts_train = processed_docs[train_index]
    common_texts_test = processed_docs[test_index]

    bow_train = bow_wow[train_index].tolist()
    bow_test = bow_wow[test_index].tolist()
    
    

    
common_texts_train = common_texts_train.reset_index(drop = True).tolist()    
common_texts_test = common_texts_test.reset_index(drop = True).tolist()        
    
# transform matrix of plots into lists to pass to a TfidfVectorizer
train_x = [x[0].strip() for x in x_train.tolist()]
test_x = [x[0].strip() for x in x_test.tolist()]

#train_y = [x[0].strip() for x in y_train.tolist()]
#test_y = [x[0].strip() for x in y_test.tolist()]

  """Entry point for launching an IPython kernel.
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [5]:
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words= stop_words)), 
                     ('clf', OneVsRestClassifier(LinearSVC()))])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 0.5, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [6]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(train_x, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_x)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    8.2s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   33.4s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done 360 out of 360 | elapsed:  1.6min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'yours', 'from', 'had', 'hers', 'does', 'its', 'you', 'her', 'up', 'when', 'will', 'himself', 'some', 'there', 'our', 's', 'their', 'we', 'between', "haven't", 'doesn', 'themselves', "weren't", 'before', 'a', 'through', 'didn', 'same', 'the', "mustn't", 'being', "hasn't", 'off', 'him', '...ouldn't", 'it', "you've", 'mightn', "don't", 'hasn', 'of', 't', "doesn't", 'yourself', 'she', 'but'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.5, class_weight='balanced', dual=True, fit_intercept=True,
     interce

In [7]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 0.4, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [8]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(train_x, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_x)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  38 tasks      | elapsed:   10.3s
[Parallel(n_jobs=2)]: Done 134 tasks      | elapsed:   43.4s
[Parallel(n_jobs=2)]: Done 294 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done 360 out of 360 | elapsed:  1.9min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'yours', 'from', 'had', 'hers', 'does', 'its', 'you', 'her', 'up', 'when', 'will', 'himself', 'some', 'there', 'our', 's', 'their', 'we', 'between', "haven't", 'doesn', 'themselves', "weren't", 'before', 'a', 'through', 'didn', 'same', 'the', "mustn't", 'being', "hasn't", 'off', 'him', '...ouldn't", 'it', "you've", 'mightn', "don't", 'hasn', 'of', 't', "doesn't", 'yourself', 'she', 'but'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_interc

In [9]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(DecisionTreeClassifier())),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__splitter": ['best', 'random'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [10]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(train_x, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_x)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  41 tasks      | elapsed:    9.5s
[Parallel(n_jobs=2)]: Done 137 tasks      | elapsed:   35.4s
[Parallel(n_jobs=2)]: Done 297 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done 521 tasks      | elapsed:  2.4min
[Parallel(n_jobs=2)]: Done 720 out of 720 | elapsed:  3.3min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'yours', 'from', 'had', 'hers', 'does', 'its', 'you', 'her', 'up', 'when', 'will', 'himself', 'some', 'there', 'our', 's', 'their', 'we', 'between', "haven't", 'doesn', 'themselves', "weren't", 'before', 'a', 'through', 'didn', 'same', 'the', "mustn't", 'being', "hasn't", 'off', 'him', '...ouldn't", 'it', "you've", 'mightn', "don't", 'hasn', 'of', 't', "doesn't", 'yourself', 'she', 'but'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
      

In [11]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(KNeighborsClassifier())),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__n_neighbors": (2,3,4,5,6,7,8),
    "clf__estimator__weights": ['uniform', 'distance'],
    "clf__estimator__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

In [12]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(train_x, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_x)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 504 candidates, totalling 2520 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  48 tasks      | elapsed:   12.1s
[Parallel(n_jobs=2)]: Done 145 tasks      | elapsed:   39.0s
[Parallel(n_jobs=2)]: Done 305 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 529 tasks      | elapsed:  2.4min
[Parallel(n_jobs=2)]: Done 817 tasks      | elapsed:  3.7min
[Parallel(n_jobs=2)]: Done 1169 tasks      | elapsed:  5.3min
[Parallel(n_jobs=2)]: Done 1585 tasks      | elapsed:  7.2min
[Parallel(n_jobs=2)]: Done 2065 tasks      | elapsed:  9.4min
[Parallel(n_jobs=2)]: Done 2520 out of 2520 | elapsed: 11.5min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'yours', 'from', 'had', 'hers', 'does', 'its', 'you', 'her', 'up', 'when', 'will', 'himself', 'some', 'there', 'our', 's', 'their', 'we', 'between', "haven't", 'doesn', 'themselves', "weren't", 'before', 'a', 'through', 'didn', 'same', 'the', "mustn't", 'being', "hasn't", 'off', 'him', '...ouldn't", 'it', "you've", 'mightn', "don't", 'hasn', 'of', 't', "doesn't", 'yourself', 'she', 'but'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           m

In [13]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(RandomForestClassifier())),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [14]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(train_x, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_x)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  43 tasks      | elapsed:   10.5s
[Parallel(n_jobs=2)]: Done 139 tasks      | elapsed:   39.9s
[Parallel(n_jobs=2)]: Done 299 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 360 out of 360 | elapsed:  2.0min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'yours', 'from', 'had', 'hers', 'does', 'its', 'you', 'her', 'up', 'when', 'will', 'himself', 'some', 'there', 'our', 's', 'their', 'we', 'between', "haven't", 'doesn', 'themselves', "weren't", 'before', 'a', 'through', 'didn', 'same', 'the', "mustn't", 'being', "hasn't", 'off', 'him', '...ouldn't", 'it', "you've", 'mightn', "don't", 'hasn', 'of', 't', "doesn't", 'yourself', 'she', 'but'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        

In [15]:
pipeline = Pipeline([
    ('lda', LdaTransformer(num_topics=2, id2word=dictionary, iterations=14, random_state=17624)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])
parameters = {
    'lda__num_topics': (2,3,4,5,6,7),
    'lda__iterations': (14,15,16,17,18,19,20),
    "clf__estimator__C": [0.01, 0.1, 0.5, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [16]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 336 candidates, totalling 1680 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   11.5s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   26.3s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   52.2s
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed:  4.1min
[Parallel(n_jobs=2)]: Done 1680 out of 1680 | elapsed:  4.3min finished


Best parameters set:
[('lda', LdaTransformer(alpha='symmetric', chunksize=2000, decay=0.5,
        dtype=<class 'numpy.float32'>, eta=None, eval_every=10,
        gamma_threshold=0.001,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AF4EF518>,
        iterations=14, minimum_probability=0.01, num_topics=2, offset=1.0,
        passes=1, random_state=17624, scorer='perplexity', update_every=1)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.73      1.00      0.85        47
           1       0.00      0.00      0.00        17

   micro avg       0.73      0.73      0.73        64
   macro avg       0.37      0.50      0.42        64
weighted avg       0.54      0.73      0.62        64



In [17]:
pipeline = Pipeline([
    ('lda', LdaTransformer(num_topics=2, id2word= dictionary, iterations=14, random_state=17624)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
    'lda__num_topics': (2,3,4,5,6,7),
    'lda__iterations': (14,15,16,17,18,19,20),
    "clf__estimator__C": [0.01, 0.1, 0.4, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [18]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 336 candidates, totalling 1680 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.8s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   16.1s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   37.8s
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed:  3.6min
[Parallel(n_jobs=2)]: Done 1680 out of 1680 | elapsed:  3.9min finished


Best parameters set:
[('lda', LdaTransformer(alpha='symmetric', chunksize=2000, decay=0.5,
        dtype=<class 'numpy.float32'>, eta=None, eval_every=10,
        gamma_threshold=0.001,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AF4BC550>,
        iterations=14, minimum_probability=0.01, num_topics=2, offset=1.0,
        passes=1, random_state=17624, scorer='perplexity', update_every=1)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))]
Applying best classifier on test data:


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.73      1.00      0.85        47
           1       0.00      0.00      0.00        17

   micro avg       0.73      0.73      0.73        64
   macro avg       0.37      0.50      0.42        64
weighted avg       0.54      0.73      0.62        64



In [19]:
pipeline = Pipeline([
    ('lda', LdaTransformer(num_topics=2, id2word= dictionary, iterations=14, random_state=17624)),
    ('clf', OneVsRestClassifier(DecisionTreeClassifier())),
])
parameters = {
    'lda__num_topics': (2,3,4,5,6,7),
    'lda__iterations': (14,15,16,17,18,19,20),
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__splitter": ['best', 'random'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [20]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 672 candidates, totalling 3360 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    4.1s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   18.0s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   40.1s
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed:  3.5min
[Parallel(n_jobs=2)]: Done 2044 tasks      | elapsed:  4.5min
[Parallel(n_jobs=2)]: Done 2588 tasks      | elapsed:  5.7min
[Parallel(n_jobs=2)]: Done 3196 tasks      | elapsed:  7.0min
[Parallel(n_jobs=2)]: Done 3360 out of 3360 | elapsed:  7.4min finished


Best parameters set:
[('lda', LdaTransformer(alpha='symmetric', chunksize=2000, decay=0.5,
        dtype=<class 'numpy.float32'>, eta=None, eval_every=10,
        gamma_threshold=0.001,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AD703D68>,
        iterations=17, minimum_probability=0.01, num_topics=6, offset=1.0,
        passes=1, random_state=17624, scorer='perplexity', update_every=1)), ('clf', OneVsRestClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91        47
           1 

In [21]:
pipeline = Pipeline([
    ('lda', LdaTransformer(num_topics=2, id2word= dictionary, iterations=14, random_state=17624)),
    ('clf', OneVsRestClassifier(KNeighborsClassifier())),
])
parameters = {
    'lda__num_topics': (2,3,4,5,6,7),
    'lda__iterations': (14,15,16,17,18,19,20),
    "clf__estimator__n_neighbors": (2,3,4,5,6,7,8),
    "clf__estimator__weights": ['uniform', 'distance'],
    "clf__estimator__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

In [22]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 2352 candidates, totalling 11760 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.7s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   16.2s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   38.4s
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  1.9min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed:  3.6min
[Parallel(n_jobs=2)]: Done 2044 tasks      | elapsed:  4.7min
[Parallel(n_jobs=2)]: Done 2588 tasks      | elapsed:  5.8min
[Parallel(n_jobs=2)]: Done 3196 tasks      | elapsed:  7.1min
[Parallel(n_jobs=2)]: Done 3868 tasks      | elapsed:  8.6min
[Parallel(n_jobs=2)]: Done 4604 tasks      | elapsed: 10.2min
[Parallel(n_jobs=2)]: Done 5404 tasks      | elapsed: 12.0min
[Parallel(n_jobs=2)]: Done 6268 tasks      | elapsed: 13.8min
[Parallel(n_jobs=2)]: Done 7196 tasks      | elapsed: 15.8min


Best parameters set:
[('lda', LdaTransformer(alpha='symmetric', chunksize=2000, decay=0.5,
        dtype=<class 'numpy.float32'>, eta=None, eval_every=10,
        gamma_threshold=0.001,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AF775550>,
        iterations=16, minimum_probability=0.01, num_topics=5, offset=1.0,
        passes=1, random_state=17624, scorer='perplexity', update_every=1)), ('clf', OneVsRestClassifier(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform'),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90        47
           1       0.73      0.65      0.69        17

   micro avg       0.84      0.84      0.84        64
   macro avg       0.81      0.78      0.79        64
weighted avg       0.84      0.84  

In [23]:
pipeline = Pipeline([
    ('lda', LdaTransformer(num_topics=2, id2word= dictionary, iterations=14, random_state=17624)),
    ('clf', OneVsRestClassifier(RandomForestClassifier())),
])
parameters = {
    'lda__num_topics': (2,3,4,5,6,7,8),
    'lda__iterations': (14,15,16,17,18,19,20),
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [24]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 392 candidates, totalling 1960 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.7s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   16.7s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   39.1s
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed:  2.7min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed:  3.8min
[Parallel(n_jobs=2)]: Done 1960 out of 1960 | elapsed:  4.7min finished


Best parameters set:
[('lda', LdaTransformer(alpha='symmetric', chunksize=2000, decay=0.5,
        dtype=<class 'numpy.float32'>, eta=None, eval_every=10,
        gamma_threshold=0.001,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207ACAFF390>,
        iterations=18, minimum_probability=0.01, num_topics=8, offset=1.0,
        passes=1, random_state=17624, scorer='perplexity', update_every=1)), ('clf', OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score

In [25]:
pipeline = Pipeline([
    ('d2v', D2VTransformer(min_count=1, size=5)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])
parameters = {
    'd2v__min_count': (1, 2,3,4,5,6,7,8,9,10),
    'd2v__size': (5,6,7,8,9,10),
    "clf__estimator__C": [0.01, 0.1, 0.5, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [26]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(common_texts_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(common_texts_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   21.3s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  3.0min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  5.3min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  8.3min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed: 12.0min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed: 16.3min
[Parallel(n_jobs=2)]: Done 2044 tasks      | elapsed: 21.1min
[Parallel(n_jobs=2)]: Done 2400 out of 2400 | elapsed: 24.7min finished


Best parameters set:
[('d2v', D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
        dbow_words=0, dm=1, dm_concat=0, dm_mean=None, dm_tag_count=1,
        docvecs=None, docvecs_mapfile=None,
        hashfxn=<built-in function hash>, hs=0, iter=5,
        max_vocab_size=None, min_alpha=0.0001, min_count=1, negative=5,
        sample=0.001, seed=1, size=6, sorted_vocab=1, trim_rule=None,
        window=5, workers=3)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        47
           1       1.00      0.82      0.90        17

   micro avg       0.95      0.95      0.95        64
   macro avg   

In [27]:
pipeline = Pipeline([
    ('d2v', D2VTransformer(min_count=1, size=5)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
    'd2v__min_count': (1, 2,3,4,5,6,7,8),
    'd2v__size': (5,6,7,8,9,10),
    "clf__estimator__C": [0.01, 0.1, 0.4, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [28]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(common_texts_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(common_texts_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   22.7s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  5.6min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  9.0min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed: 12.9min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed: 17.4min
[Parallel(n_jobs=2)]: Done 1920 out of 1920 | elapsed: 21.3min finished


Best parameters set:
[('d2v', D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
        dbow_words=0, dm=1, dm_concat=0, dm_mean=None, dm_tag_count=1,
        docvecs=None, docvecs_mapfile=None,
        hashfxn=<built-in function hash>, hs=0, iter=5,
        max_vocab_size=None, min_alpha=0.0001, min_count=5, negative=5,
        sample=0.001, seed=1, size=6, sorted_vocab=1, trim_rule=None,
        window=5, workers=3)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        47
           1       1.00      0.82      0.90        17

   micro avg       0.9

In [29]:
pipeline = Pipeline([
    ('d2v', D2VTransformer(min_count=1, size=5)),
    ('clf', OneVsRestClassifier(DecisionTreeClassifier())),
])
parameters = {
    'd2v__min_count': (1, 2,3,4,5,6,7),
    'd2v__size': (5,6,7,8,9,10),
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__splitter": ['best', 'random'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [30]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(common_texts_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(common_texts_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 672 candidates, totalling 3360 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   21.7s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  3.2min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  5.5min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  8.5min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed: 12.3min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed: 16.8min
[Parallel(n_jobs=2)]: Done 2044 tasks      | elapsed: 21.9min
[Parallel(n_jobs=2)]: Done 2588 tasks      | elapsed: 28.5min
[Parallel(n_jobs=2)]: Done 3196 tasks      | elapsed: 36.0min
[Parallel(n_jobs=2)]: Done 3360 out of 3360 | elapsed: 37.9min finished


Best parameters set:
[('d2v', D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
        dbow_words=0, dm=1, dm_concat=0, dm_mean=None, dm_tag_count=1,
        docvecs=None, docvecs_mapfile=None,
        hashfxn=<built-in function hash>, hs=0, iter=5,
        max_vocab_size=None, min_alpha=0.0001, min_count=1, negative=5,
        sample=0.001, seed=1, size=10, sorted_vocab=1, trim_rule=None,
        window=5, workers=3)), ('clf', OneVsRestClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.94      1.00      0

In [31]:
pipeline = Pipeline([
    ('d2v', D2VTransformer(min_count=1, size=5)),
    ('clf', OneVsRestClassifier(KNeighborsClassifier())),
])
parameters = {
    'd2v__min_count': (1, 2,3,4,5,6,7,8),
    'd2v__size': (5,6,7,8,9,10),
    "clf__estimator__n_neighbors": (2,3,4,5,6,7),
    "clf__estimator__weights": ['uniform', 'distance'],
    "clf__estimator__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

In [32]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(common_texts_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(common_texts_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   24.3s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  3.6min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  6.3min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  9.9min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed: 14.1min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed: 19.0min
[Parallel(n_jobs=2)]: Done 2044 tasks      | elapsed: 24.1min
[Parallel(n_jobs=2)]: Done 2588 tasks      | elapsed: 29.9min
[Parallel(n_jobs=2)]: Done 3196 tasks      | elapsed: 36.3min
[Parallel(n_jobs=2)]: Done 3868 tasks      | elapsed: 43.4min
[Parallel(n_jobs=2)]: Done 4604 tasks      | elapsed: 51.5min
[Parallel(n_jobs=2)]: Done 5404 tasks      | elapsed: 60.1min
[Parallel(n_jobs=2)]: Done 6268 tasks      | elapsed: 69.1min
[Parallel(n_jobs=2)]: Done 7196 tasks      | elapsed: 78.7min


Best parameters set:
[('d2v', D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
        dbow_words=0, dm=1, dm_concat=0, dm_mean=None, dm_tag_count=1,
        docvecs=None, docvecs_mapfile=None,
        hashfxn=<built-in function hash>, hs=0, iter=5,
        max_vocab_size=None, min_alpha=0.0001, min_count=2, negative=5,
        sample=0.001, seed=1, size=7, sorted_vocab=1, trim_rule=None,
        window=5, workers=3)), ('clf', OneVsRestClassifier(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform'),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        47
           1       1.00      0.82      0.90        17

   micro avg       0.95      0.95      0.95        64
   macro avg       0.97      0.91      0.94        64
weighted a

In [33]:
pipeline = Pipeline([
    ('d2v', D2VTransformer(min_count=1, size=5)),
    ('clf', OneVsRestClassifier(RandomForestClassifier())),
])
parameters = {
    'd2v__min_count': (1, 2,3,4,5,6,7,8,9,10),
    'd2v__size': (5,6,7,8,9,10),
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [34]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(common_texts_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(common_texts_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   24.6s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  5.9min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  9.0min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed: 12.9min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed: 17.5min
[Parallel(n_jobs=2)]: Done 2044 tasks      | elapsed: 22.5min
[Parallel(n_jobs=2)]: Done 2400 out of 2400 | elapsed: 26.0min finished


Best parameters set:
[('d2v', D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
        dbow_words=0, dm=1, dm_concat=0, dm_mean=None, dm_tag_count=1,
        docvecs=None, docvecs_mapfile=None,
        hashfxn=<built-in function hash>, hs=0, iter=5,
        max_vocab_size=None, min_alpha=0.0001, min_count=2, negative=5,
        sample=0.001, seed=1, size=7, sorted_vocab=1, trim_rule=None,
        window=5, workers=3)), ('clf', OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=None))]
Applying best classifier on test data:
              precisi

In [35]:
pipeline = Pipeline([
    ('lsa', LsiTransformer(num_topics=2, id2word= dictionary)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])
parameters = {
    'lsa__num_topics': (2,3,4,5,6,7),
    "clf__estimator__C": [0.01, 0.1, 0.5, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [36]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    2.9s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   13.3s
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:   25.7s finished


Best parameters set:
[('lsa', LsiTransformer(chunksize=20000, decay=1.0, extra_samples=100,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AD11AB00>,
        num_topics=3, onepass=True, power_iters=2)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.1, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        47
           1       0.88      0.82      0.85        17

   micro avg       0.92      0.92      0.92        64
   macro avg       0.91      0.89      0.90        64
weighted avg       0.92      0.92      0.92        64



In [37]:
pipeline = Pipeline([
    ('lsa', LsiTransformer(num_topics=2, id2word= dictionary)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
    'lsa__num_topics': (2,3,4,5,6,7),
    "clf__estimator__C": [0.01, 0.1, 0.4, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [38]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.1s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   13.3s
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:   25.8s finished


Best parameters set:
[('lsa', LsiTransformer(chunksize=20000, decay=1.0, extra_samples=100,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AF896320>,
        num_topics=7, onepass=True, power_iters=2)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.97      0.79      0.87        47
           1       0.62      0.94      0.74        17

   micro avg       0.83      0.83      0.83        64
   macro avg       0.79      0.86      0.81        64
weighted avg       0.88      0.83      0.84        64



In [39]:
pipeline = Pipeline([
    ('lsa', LsiTransformer(num_topics=2, id2word= dictionary)),
    ('clf', OneVsRestClassifier(DecisionTreeClassifier())),
])
parameters = {
    'lsa__num_topics': (2,3,4,5,6,7),
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__splitter": ['best', 'random'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [40]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.0s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   13.2s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   30.1s
[Parallel(n_jobs=2)]: Done 480 out of 480 | elapsed:   50.7s finished


Best parameters set:
[('lsa', LsiTransformer(chunksize=20000, decay=1.0, extra_samples=100,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AF158B00>,
        num_topics=2, onepass=True, power_iters=2)), ('clf', OneVsRestClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random'),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        47
           1       0.93      0.76      0.84        17

   micro avg       0.92      0.92      0.92        64
   macro avg       0.92      0.87      0.89        64
weighted avg       0.92      0.92      0.

In [41]:
pipeline = Pipeline([
    ('lsa', LsiTransformer(num_topics=2, id2word= dictionary)),
    ('clf', OneVsRestClassifier(KNeighborsClassifier())),
])
parameters = {
    'lsa__num_topics': (2,3,4,5,6,7),
    "clf__estimator__n_neighbors": (2,3,4,5,6,7),
    "clf__estimator__weights": ['uniform', 'distance'],
    "clf__estimator__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

In [42]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.0s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   13.3s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   30.2s
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:   54.0s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed:  2.0min
[Parallel(n_jobs=2)]: Done 1440 out of 1440 | elapsed:  2.6min finished


Best parameters set:
[('lsa', LsiTransformer(chunksize=20000, decay=1.0, extra_samples=100,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207AFAF4898>,
        num_topics=7, onepass=True, power_iters=2)), ('clf', OneVsRestClassifier(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='distance'),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        47
           1       0.94      0.94      0.94        17

   micro avg       0.97      0.97      0.97        64
   macro avg       0.96      0.96      0.96        64
weighted avg       0.97      0.97      0.97        64



In [43]:
pipeline = Pipeline([
    ('lsa', LsiTransformer(num_topics=2, id2word= dictionary)),
    ('clf', OneVsRestClassifier(RandomForestClassifier())),
])
parameters = {
    'lsa__num_topics': (2,3,4,5,6,7),
    "clf__estimator__criterion": ['gini', 'entropy'],
    "clf__estimator__max_features": ['auto', 'sqrt', 'log2', None],
}

In [44]:
#train_x = map(int, train_x)
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(bow_train, y_train)

print
print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)
print

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(bow_test)

print (classification_report(y_test, predictions))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  52 tasks      | elapsed:    3.6s
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:   15.4s finished


Best parameters set:
[('lsa', LsiTransformer(chunksize=20000, decay=1.0, extra_samples=100,
        id2word=<gensim.corpora.dictionary.Dictionary object at 0x00000207ACFB04E0>,
        num_topics=5, onepass=True, power_iters=2)), ('clf', OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=None))]
Applying best classifier on test data:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        47
           1       0.93      0.82      0.87        17

   micro avg       0.94      0.94      0.94        64
   macro avg      