In [2]:
!ls -1 ./data/task1/Book/

neg_Bk
pos_Bk


In [1]:
# Python 3.6

import os
import glob

import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb

from src.pipeline_steps.nltk_word_tokenize import NLTKTokenizer
from src.pipeline_steps.to_lower_case import ToLowerCase
from src.pipeline_steps.remove_stop_words import RemoveStopWords
from src.pipeline_steps.keep_open_classes_only import KeepOpenClassesOnly
from src.pipeline_steps.sentiwordnet import SentiWordNetPosNegAttributes
from src.pipeline_steps.porter_stemmer import PorterStemmerStep
from src.pipeline_steps.data_shape_printer import ShapePrinter
from src.data_loading.task_1 import load_all_data_task_1
from src.text_classifier_pipelines.stop_words_open_class_stemmer.pipeline_factory import find_and_train_best_pipelines

[nltk_data] Downloading package stopwords to /home/gui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /home/gui/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [2]:
neg_Bk_files = glob.glob(os.path.join(".", "data", "task1", "Book", "neg_Bk", "*.text"))
pos_Bk_files = glob.glob(os.path.join(".", "data", "task1", "Book", "pos_Bk", "*.text"))

X_train, y_train, X_test, y_test = load_all_data_task_1(neg_Bk_files, pos_Bk_files)

print(len(X_train), len(y_train), len(X_test), len(y_test))

1600 1600 400 400


In [4]:
best_trained_pipelines = find_and_train_best_pipelines(X_train, y_train)

Will start Cross Validation for Logistic Classifiers.

Cross-Validation Grid Search for: 'NAMEE TODO'...
Best hyperparameters for 'NAMEE TODO' (3-folds cross validation accuracy score=0.694375):
{'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': False, 'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': 0.98, 'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': 50000, 'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': 2, 'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': (1, 1), 'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': None, 'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': None, 'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': <function get_generic_hyperparams_grid.<locals>.<lambda> at 0x7ff104800840>, 'logistic_regression__C': 10000.0}



In [6]:
print("The final test: classifying on test documents of full-length:")
print("")
for (model_name, model) in best_trained_pipelines.items():
    score = model.score(X_test, y_test)
    print("Test set score for '{}': {}%".format(model_name, score*100))
    print("Test set was of 20% of full data, which was held-out of cross validation.")
print("")

The final test: classifying on test documents of full-length:

Test set score for '1-gram Char Logistic Classifier': 70.0%
Test set was of 20% of full data, which was held-out of cross validation.



In [4]:
pipeline = Pipeline([
    ('nltk_tokenizer', NLTKTokenizer()),
    ('to_lower_case', ToLowerCase()),
    ('remove_stop_words', RemoveStopWords()),
    ('keep_open_classes_only', KeepOpenClassesOnly()),
    ('sentiwordnet_attribute_pos_neg_count', SentiWordNetPosNegAttributes()),
    ('porter_stemmer', PorterStemmerStep()),
    ('count_vect_that_remove_unfrequent_words_and_stopwords', CountVectorizer()),
    ('logistic_regression', LogisticRegression()),
])

hyperparams_grid = {
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': [0.98],
    'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': [2],
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': [50000],
    'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': [None],
    'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': [lambda x: x],
    'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': [None],
    'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': [False],
    'logistic_regression__C': [1e-2, 1.0, 1e2, 1e4]
}

grid_search = GridSearchCV(
    pipeline, hyperparams_grid, iid=True, cv=3, return_train_score=False, verbose=1, scoring="accuracy")
# TODO: increase CV to 5 such as:
# grid_search = GridSearchCV(pipeline, hyperparams_grid, iid=False, cv=5, return_train_score=False, verbose=1)
grid_search.fit(X, y)

print("Best hyperparameters (Cross Validation macro accuracy score=%0.3f):" % grid_search.best_score_)
best_params = grid_search.best_params_
print(best_params)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 11.2min finished


Best hyperparameters (Cross Validation macro accuracy score=0.777):
{'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': False, 'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': 0.98, 'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': 50000, 'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': 2, 'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': (1, 3), 'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': None, 'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': None, 'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': <function <lambda> at 0x7f9b8c13bbf8>, 'logistic_regression__C': 1.0}


In [5]:
# Retrain best model: 

best_pipeline = Pipeline([
    ('nltk_tokenizer', NLTKTokenizer()),
    ('to_lower_case', ToLowerCase()),
    ('remove_stop_words', RemoveStopWords()),
    ('keep_open_classes_only', KeepOpenClassesOnly()),
    ('sentiwordnet_attribute_pos_neg_count', SentiWordNetPosNegAttributes()),
    ('porter_stemmer', PorterStemmerStep()),
    ('count_vect_that_remove_unfrequent_words_and_stopwords', CountVectorizer()),
    # ('shapr', ShapePrinter("shapr")),
    ('logistic_regression', LogisticRegression()),
])
best_pipeline.set_params(
    **best_params
)
scores = cross_val_score(best_pipeline, X, y, cv=5, scoring='accuracy')
print(scores)

[0.8025 0.8075 0.7875 0.7875 0.7625]


In [6]:
best_pipeline.fit(X, y)
best_pipeline.score(X, y)

0.9995

In [7]:
print(((best_pipeline.predict(X) == y)*1.0).mean())

0.9995


In [375]:

new_params = {
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_df': 0.98,
    'count_vect_that_remove_unfrequent_words_and_stopwords__min_df': 2,
    'count_vect_that_remove_unfrequent_words_and_stopwords__max_features': 50000,
    'count_vect_that_remove_unfrequent_words_and_stopwords__ngram_range': (1, 2),
    'count_vect_that_remove_unfrequent_words_and_stopwords__strip_accents': None,
    'count_vect_that_remove_unfrequent_words_and_stopwords__tokenizer': lambda x: x,
    'count_vect_that_remove_unfrequent_words_and_stopwords__preprocessor': None,
    'count_vect_that_remove_unfrequent_words_and_stopwords__lowercase': False,
    #'tsvd__n_components': 1024,
    "xgb__max_depth": 20,
    "xgb__n_estimators": 20,
    "xgb__learning_rate": 1
}
xgb_pipeline = Pipeline([
    ('nltk_tokenizer', NLTKTokenizer()),
    ('to_lower_case', ToLowerCase()),
    ('remove_stop_words', RemoveStopWords()),
    ('keep_open_classes_only', KeepOpenClassesOnly()),
    ('porter_stemmer', PorterStemmerStep()),
    ('count_vect_that_remove_unfrequent_words_and_stopwords', CountVectorizer()),
    # ('shapr', ShapePrinter("shapr")),
    #('tsvd', TruncatedSVD()),
    ('xgb', xgb.XGBClassifier()),
])
xgb_pipeline.set_params(
    **new_params
)
# xgb_pipeline.fit(X, y)
# print(((xgb_pipeline.predict(X) == y)*1.0).mean())
scores = cross_val_score(xgb_pipeline, X, y, cv=5, scoring='accuracy')
print(scores)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


array([0.735 , 0.71  , 0.7375, 0.7175, 0.6975])

In [245]:
# Sélection d’attributs: Tous les attributs, avec classes ouvertes seulement, sans les mots outils:
# Ne garder que les mots appartenant à des classes ouvertes (c.-à-d. les noms,
# adjectifs, verbes et adverbes). Vous devez faire une analyse grammaticale (POS 
# tagging) des textes pour identifier ces mots. 

# Stemming. Optionnel: aussi WordNetLemmatizer de NLTK.

# Valeurs d’attributs: Compte de mots: TF. Optionnel:  présence et tf-idf

# Autres attributs: Nombre de mots positifs/négatifs 
# (aka Le nombre de mots dont la polarité est positive ou négative. Vous pouvez utiliser SentiWordnet (NLTK) ou un autre lexique pour estimer cet attribut.)

# Naive bayes, régression logistique



In [8]:
d = {
    'logistic__alpha': np.logspace(-4, 4, 9),
}
np.logspace(-4, 4, 9).tolist()

[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]

In [124]:
breakdown = swn.senti_synset('the')
print(breakdown)


ValueError: not enough values to unpack (expected 3, got 1)

In [175]:



document = ["joy", "not", "anger"]



['swnsentinegative', 'swnsentinegative', 'swnsentipositive', 'joy', 'not', 'anger']


In [15]:
r = swn.all_senti_synsets()

In [104]:

r = swn.senti_synsets('happy')

In [106]:
dir(r)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__next__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']