In [364]:
# Python 3

import os
import glob

import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb


In [365]:
!ls -1 ./data/task1/Book/
# out: 
# neg_Bk
# pos_Bk

neg_Bk
pos_Bk


In [366]:


neg_Bk_files = glob.glob(os.path.join(".", "data", "task1", "Book", "neg_Bk", "*.text"))
pos_Bk_files = glob.glob(os.path.join(".", "data", "task1", "Book", "pos_Bk", "*.text"))


def load_file(file_path):
    with open(file_path, 'r', encoding="ISO-8859-1") as fml:
        crushed_content = fml.read().rstrip("\n").strip()
    return crushed_content

def load_all_files(file_paths):
    return [load_file(path) for path in file_paths]

def load_all_data_task1(pos_files_paths, neg_files_paths):

    pos = load_all_files(pos_files_paths)
    neg = load_all_files(neg_files_paths)

    X = pos + neg
    y = [1] * len(pos) + [0] * len(neg)
    
    # Have non-concentrated classes, but with seed for reproducibility:
    X, y = sklearn.utils.shuffle(X, y, random_state=42)
    
    return X, y

X, y = load_all_data_task1(neg_Bk_files, pos_Bk_files)

print(len(X), len(y))

2000 2000


In [367]:
class NLTKTokenizer(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        return [word_tokenize(sentence) for sentence in x]

In [368]:
class ToLowerCase(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        return [[word.lower() for word in sentence] for sentence in x]

In [369]:
nltk.download('stopwords')


class RemoveStopWords(BaseEstimator, TransformerMixin):
    
    def __init__(self, **k): 
        self.stop_words = [
            sw.lower() for sw in (
                set(sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS) &
                set(stopwords.words('english'))
            )
        ]
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        return [[word for word in sentence if word.lower() not in self.stop_words] for sentence in x]


[nltk_data] Downloading package stopwords to /home/gui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [370]:
class KeepOpenClassesOnly(BaseEstimator, TransformerMixin): 
    """
    Categories that will usually be open classes:
    - adjectives
    - adverbs
    - nouns
    - verbs (except auxiliary verbs)
    - interjections
    
    Keep Open Classes or things that contains at least some of Open Classes:
    - adjectives:
    JJ: adjective or numeral, ordinal
    JJR: adjective, comparative
    JJS: adjective, superlative
    - adverbs:
    RB: adverb
    RBR: adverb, comparative
    RBS: adverb, superlative
    WRB: Wh-adverb
    - nouns:
    NN: noun, common, singular or mass
    NNP: noun, proper, singular
    NNPS: noun, proper, plural
    NNS: noun, common, plural
    - verbs (except auxiliary verbs):
    VB: verb, base form
    VBD: verb, past tense
    VBG: verb, present participle or gerund
    VBN: verb, past participle
    VBP: verb, present tense, not 3rd person singular
    VBZ: verb, present tense, 3rd person singular
    - interjections:
    UH: interjection

    Other words that are not in "Open Classes": 
    CC: conjunction, coordinating
    CD: numeral, cardinal
    DT: determiner
    EX: existential there
    FW: foreign word
    IN: preposition or conjunction, subordinating
    LS: list item marker
    MD: modal auxiliary
    PDT: pre-determiner
    POS: genitive marker
    PRP: pronoun, personal
    PRP$: pronoun, possessive
    RP: particle
    SYM: symbol
    TO: "to" as preposition or infinitive marker
    WDT: WH-determiner
    WP: WH-pronoun
    WP$: WH-pronoun, possessive

    Other that are kept, because they are not "words",
    but rather misc things we might want to keep: 
    $: dollar
    '': closing quotation mark
    ``: opening quotation mark
    (: opening parenthesis
    ): closing parenthesis
    ,: comma
    --: dash
    .: sentence terminator
    :: colon or ellipsis

    References: 
    - https://en.wikipedia.org/wiki/Part_of_speech#Functional_classification
    - https://www.nltk.org/book/ch05.html
    """
    WORDS_OPEN_CLASSES = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS", "WRB", "NN", "NNP", "NNPS", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "UH"]
    WORDS_CLOSED_CLASSES_OR_OTHER_MISC = ["CC", "CD", "DT", "EX", "FW", "IN", "LS", "MD", "PDT", "POS", "PRP", "PRP", "RP", "SYM", "TO", "WDT", "WP", "WP"]
    OTHER_NOT_WORDS = ["$", "''", "``", "(", ")", ",", "--", ".", ":"]
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        self.all_tags = set()
        # TODO: do.
        return x

    def _tag_all_filter(self, X):
        filtered = [_tag_filter(x) for x in X[:10]]
        return filtered

    def _tag_filter(self, x): 
        tagged = nltk.pos_tag(word_tokenize(x))
        
        for (word, tag) in tagged: # TODO: del this.
            self.all_tags.add(tag)
        
        WORDS_TO_KEEP = OpenClassesOnly.WORDS_OPEN_CLASSES + OpenClassesOnly.OTHER_NOT_WORDS
        
        return [word for (word, tag) in tagged if tag in WORDS_TO_KEEP]
    
    def print_more_info(self):
        """
        Print the definition of all NLTK's tags:
        """
        nltk.download('tagsets')
        nltk.help.upenn_tagset()
        print("WORDS_OPEN_CLASSES: ", WORDS_OPEN_CLASSES)
        print("WORDS_CLOSED_CLASSES_OR_OTHER_MISC: ", WORDS_CLOSED_CLASSES_OR_OTHER_MISC)
        print("OTHER_NOT_WORDS: ", OTHER_NOT_WORDS)
        print("WORDS_TO_KEEP = WORDS_OPEN_CLASSES + OTHER_NOT_WORDS")


In [371]:
class WrappedPorterStemmer(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        stemmer = PorterStemmer()
        
        transformed_x = []
        for tokenized_sample in x: 
            transformed_sample = []
            for word in tokenized_sample: 
                if type(word) != str:
                    print(type(word), len(word))
                transformed_sample.append(
                    stemmer.stem(word)
                )
            transformed_x.append(transformed_sample)
                
        return transformed_x

# s = "bacon - I like that, ouch wow damn hey bye okay oh m-hm huh"
# WrappedPorterStemmer().fit_transform(X=NLTKTokenizer().fit_transform([s])[0])
"""
stemmer = PorterStemmer()
new_text = word_tokenize(s)
for w in new_text:
    print(stemmer.stem(w))
"""
pass

In [372]:
class ShapePrinter(BaseEstimator, TransformerMixin):
    
    def __init__(self, name, **params):
        self.name = name
    
    def fit(self, x, y=None):
        print("FIT:", self.name, x.shape)
        return self
    
    def transform(self, x, y=None):
        print("TRANSFORM:", self.name, x.shape)
        return x

In [373]:
pipeline = Pipeline([
    ('nltk_tokenizer', NLTKTokenizer()),
    ('to_lower_case', ToLowerCase()),
    ('remove_stop_words', RemoveStopWords()),
    ('keep_open_classes_only', KeepOpenClassesOnly()),
    ('porter_stemmer', WrappedPorterStemmer()),
    ('count_vect_that_remove_unfrequent_words_and_stopwords', CountVectorizer()),
    ('logistic_regression', LogisticRegression()),
])

hyperparams_grid = {
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': [0.98],
    'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': [2],
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': [10000],
    'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': [None],
    'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': [lambda x: x],
    'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': [None],
    'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': [False],
    'logistic_regression__C': [1e4, 1e5],
    'logistic_regression__solver': ['lbfgs']
    # 'logistic_regression__alpha': np.logspace(-4, 4, 5),
    # 'logistic_regression__multi_class': 2
}
_ = """
hyperparams_grid = {
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': [0.98],
    'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': [2],
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': [10000],
    'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': [(1, 1)],
    'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': [None],
    'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': [lambda x: x],
    'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': [None],
    'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': [False],
    'logistic_regression__C': [1e4],
    'logistic_regression__solver': ['lbfgs']
    # 'logistic_regression__alpha': np.logspace(-4, 4, 5),
    # 'logistic_regression__multi_class': 2
}
"""

grid_search = GridSearchCV(
    pipeline, hyperparams_grid, iid=False, cv=3, return_train_score=False, verbose=1, scoring="accuracy")
# TODO: increase CV to 5 such as:
# grid_search = GridSearchCV(pipeline, hyperparams_grid, iid=False, cv=5, return_train_score=False, verbose=1)
grid_search.fit(X, y)

print("Best hyperparameters (Cross Validation macro accuracy score=%0.3f):" % grid_search.best_score_)
best_params = grid_search.best_params_
print(best_params)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  2.4min finished


Best hyperparameters (Cross Validation macro accuracy score=0.745):
{'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': False, 'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': 0.98, 'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': 10000, 'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': 2, 'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': (1, 2), 'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': None, 'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': None, 'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': <function <lambda> at 0x7fa5820e12f0>, 'logistic_regression__C': 100000.0, 'logistic_regression__solver': 'lbfgs'}


In [376]:
# Retrain best model: 

best_pipeline = Pipeline([
    ('nltk_tokenizer', NLTKTokenizer()),
    ('to_lower_case', ToLowerCase()),
    ('remove_stop_words', RemoveStopWords()),
    ('keep_open_classes_only', KeepOpenClassesOnly()),
    ('porter_stemmer', WrappedPorterStemmer()),
    ('count_vect_that_remove_unfrequent_words_and_stopwords', CountVectorizer()),
    # ('shapr', ShapePrinter("shapr")),
    ('logistic_regression', LogisticRegression()),
])
best_pipeline.set_params(
    **best_params
)
# best_pipeline.fit(X, y)
# print(((best_pipeline.predict(X) == y)*1.0).mean())
scores = cross_val_score(best_pipeline, X, y, cv=5, scoring='accuracy')
print(scores)

[0.7375 0.78   0.7525 0.7375 0.7525]


In [375]:

new_params = {
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': 0.98,
    'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': 2,
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': 10000,
    'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': (1, 2),
    'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': None,
    'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': lambda x: x,
    'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': None,
    'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': False,
    #'tsvd__n_components': 1024,
    "xgb__max_depth": 20,
    "xgb__n_estimators": 20,
    "xgb__learning_rate": 1
}
xgb_pipeline = Pipeline([
    ('nltk_tokenizer', NLTKTokenizer()),
    ('to_lower_case', ToLowerCase()),
    ('remove_stop_words', RemoveStopWords()),
    ('keep_open_classes_only', KeepOpenClassesOnly()),
    ('porter_stemmer', WrappedPorterStemmer()),
    ('count_vect_that_remove_unfrequent_words_and_stopwords', CountVectorizer()),
    # ('shapr', ShapePrinter("shapr")),
    #('tsvd', TruncatedSVD()),
    ('xgb', xgb.XGBClassifier()),
])
xgb_pipeline.set_params(
    **new_params
)
# xgb_pipeline.fit(X, y)
# print(((xgb_pipeline.predict(X) == y)*1.0).mean())
scores = cross_val_score(xgb_pipeline, X, y, cv=5, scoring='accuracy')
print(scores)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


array([0.735 , 0.71  , 0.7375, 0.7175, 0.6975])

In [245]:
# Sélection d’attributs: Tous les attributs, avec classes ouvertes seulement, sans les mots outils:
# Ne garder que les mots appartenant à des classes ouvertes (c.-à-d. les noms,
# adjectifs, verbes et adverbes). Vous devez faire une analyse grammaticale (POS 
# tagging) des textes pour identifier ces mots. 

# Stemming. Optionnel: aussi WordNetLemmatizer de NLTK.

# Valeurs d’attributs: Compte de mots: TF. Optionnel:  présence et tf-idf

# Autres attributs: Nombre de mots positifs/négatifs 
# (aka Le nombre de mots dont la polarité est positive ou négative. Vous pouvez utiliser SentiWordnet (NLTK) ou un autre lexique pour estimer cet attribut.)

# Naive bayes, régression logistique



In [None]:
d = {
    'logistic__alpha': np.logspace(-4, 4, 9),
}
np.logspace(-4, 4, 9).tolist()