**Dataset is loaded**

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups corpus, shuffled with a fixed seed; `remove` strips
# headers, footers and quoted replies from the posts.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

# Raw post texts: the corpus that is vectorized further down.
list_document = dataset.data

# Corpus size and container type.
len(list_document), type(list_document)

**LDA model is created**

The aim is to discover topics within the document corpus.

To do so:
 * We compute term frequencies (TF) to feed the LDA model
 * We fix the number of expected topics with the hyper-parameter ``no_topics``

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

#tokenizer = nltk.RegexpTokenizer(r'[ a-zA-Z0-9]')
#tokenized_list = tokenizer.tokenize(item.lower())
# --- Term-frequency (TF) matrix of the newsgroup posts ---
# Unigrams and bigrams are counted, English stop words are dropped and the
# vocabulary is capped at 1000 features.
tf_vectorizer = CountVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
)
tf_csr_matrix = tf_vectorizer.fit_transform(list_document)

In [None]:
import re

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics the model is asked to find.
no_topics = 15

# Fit LDA (online learning, fixed seed) on the term-count matrix.
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf_csr_matrix)


**Display the topics recovered by the LDA model, using the 10 highest-weighted words of each topic**

In [None]:
def display_topics(model, feature_names, nb_top_words):
    """Print, for every topic of ``model``, its highest-weighted feature names.

    Args:
        model: fitted object exposing ``components_``, iterated row by row
            (one row of feature weights per topic).
        feature_names: sequence mapping a feature index to its name.
        nb_top_words: how many names to print per topic.
    """
    for rank, weights in enumerate(model.components_):
        print("Topic %d:" % (rank))
        # argsort() is ascending; the reversed tail slice yields the indices
        # of the nb_top_words largest weights, largest first.
        best_indices = weights.argsort()[:-nb_top_words - 1:-1]
        top_names = [feature_names[idx] for idx in best_indices]
        print(" ".join(top_names))
        print()

nb_top_words = 10
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    list_feature_name_lda = tf_vectorizer.get_feature_names_out()
else:
    list_feature_name_lda = tf_vectorizer.get_feature_names()
display_topics(lda_model, list_feature_name_lda, nb_top_words)


Apply LDA again over topics

In [None]:
# Names of the features the topics are expressed in.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
# list() keeps the historical list type whichever method is used.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    list_feature_name = list(tf_vectorizer.get_feature_names_out())
else:
    list_feature_name = list(tf_vectorizer.get_feature_names())

# Number of highest-weighted features kept to describe each topic.
most_important_values = 100

# Each topic is turned into a pseudo-document: the space-joined names of its
# `most_important_values` highest-weighted features. argsort() is ascending,
# so the reversed tail slice walks from the largest weight downwards.
# The resulting list holds one "document of topic" per LDA topic.
list_topic_document = [
    ' '.join(
        list_feature_name[i]
        for i in topic.argsort()[:-most_important_values - 1:-1]
    )
    for topic in lda_model.components_
]

In [None]:
# --- Term-frequency (TF) matrix of the topic pseudo-documents ---
# NOTE: this rebinds tf_vectorizer and tf_csr_matrix, which until here held
# the vectorizer / matrix built from the newsgroup posts. Re-running the
# earlier LDA or display cells after this one would pick up these objects.
tf_vectorizer = CountVectorizer(
    max_features=10,
    stop_words='english',
    ngram_range=(1, 1),
)
tf_csr_matrix = tf_vectorizer.fit_transform(list_topic_document)

In [None]:
# Number of "topics of topics" to extract.
no_topics_topics = 3

# Second LDA pass, fitted on the term counts of the topic pseudo-documents.
lda_model_topics = LatentDirichletAllocation(
    n_components=no_topics_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf_csr_matrix)

In [None]:
nb_top_words = 10
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    list_feature_name_topics = tf_vectorizer.get_feature_names_out()
else:
    list_feature_name_topics = tf_vectorizer.get_feature_names()
display_topics(lda_model_topics, list_feature_name_topics, nb_top_words)


# Quiz 1

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: document id -> French sentence.
dict_corpus = {
    1: "Je suis à la maison",
    2: "La maison est dans la prairie",
    3: "Je suis à la plage",
}
list_corpus = [*dict_corpus.values()]
print("\n".join(list_corpus))

# Fit a bag-of-words model, print the dense count matrix, then the learnt
# vocabulary mapping.
cnt_vectorizer = CountVectorizer()
print(cnt_vectorizer.fit_transform(list_corpus).todense())
print(cnt_vectorizer.vocabulary_)

In [None]:
# Hand-written bag-of-words answer for the quiz corpus. The three rows used to
# be bare, indented list literals, which is an IndentationError and stops
# "Run All"; they are now bound to a name so the cell parses.
vocabulary = ['je', 'suis', 'à', 'la', 'maison', 'est', 'dans', 'prairie', 'plage']

# One row per quiz sentence, one column per `vocabulary` entry. Entries mark
# presence (sentence 2 contains "la" twice but is recorded as 1).
manual_doc_term_matrix = [
    [1, 1, 1, 1, 1, 0, 0, 0, 0],
    [0, 0, 0, 1, 1, 1, 1, 1, 0],
    [1, 1, 1, 1, 0, 0, 0, 0, 1],
]

# All-zero vector of vocabulary length, shown as the cell output.
# NOTE(review): its purpose is not evident from the notebook -- confirm.
[0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
import p6_util_activity


# Build a vectorizer for the quiz corpus through the project helper module.
# NOTE(review): p6_util_activity is not visible here; presumably
# doc_type='string' means every dict value is one raw document -- confirm.
tfidf_vectorizer= p6_util_activity.get_tfidf_vectorizer(dict_corpus, doc_type='string')
# Presumably a mapping derived from the fitted vectorizer -- verify against the helper.
dict_tfidf = p6_util_activity.get_dict_tfidf(tfidf_vectorizer)

In [None]:
# Only the last expression of a cell is rendered; the bare expressions before
# it are evaluated and their values discarded.
dict_tfidf

# Whitespace-split every sentence and pool all tokens into one flat list.
list_token = [token for sentence in dict_corpus.values() for token in sentence.split()]
list_token

tfidf_vectorizer.transform(dict_corpus.values()).todense()
tfidf_vectorizer.vocabulary_
tfidf_vectorizer.idf_


In [None]:
# Dense form of the quiz corpus transformed by tfidf_vectorizer (cell output).
tfidf_vectorizer.transform(dict_corpus.values()).todense()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# use_idf=False switches the IDF weighting off, leaving L2-normalised term
# frequencies. NOTE: this rebinds tf_vectorizer, which was a CountVectorizer
# in the LDA section above.
tf_vectorizer = TfidfVectorizer(norm="l2", use_idf=False).fit(dict_corpus.values())

In [None]:
# Vectorize the quiz sentences with the fitted tf_vectorizer.
csr_matrix = tf_vectorizer.transform(dict_corpus.values())

In [None]:
from nltk.util import ngrams
list_text = "La seconde partie de du cours de traitement de texte traite de la transformation des données textuelles".split()
#list_text = ["ABCD","123","GHT","KKK"]

# Slide a window of three consecutive tokens over the sentence and keep every
# window as one space-joined string.
list_ngram = [' '.join(str(token) for token in trigram) for trigram in ngrams(list_text, 3)]

# Number of trigrams obtained.
len(list_ngram)

# Opérez une première classification naïve de sentiments

* Imdb dataset has been downloaded from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [2]:
import nltk
import os

#---------------------------------------------------------------------
# 
#---------------------------------------------------------------------
def feature_extract(content):
    """Encode a text as a token-presence dictionary.

    ``content`` is tokenized with ``nltk.word_tokenize``; every distinct token
    becomes a key whose value is always ``True``:
    ``{token1: True, token2: True, ..., tokenK: True}``.
    """
    tokens = nltk.word_tokenize(content)
    return dict.fromkeys(tokens, True)
#---------------------------------------------------------------------

#---------------------------------------------------------------------
# 
#---------------------------------------------------------------------
def load_data(relative_path):
    """Load a labelled corpus and encode every file as a feature dictionary.

    ``relative_path`` must contain a ``pos`` and a ``neg`` sub-directory.
    Every file found there is read in full, encoded with ``feature_extract``
    and stored as ``[{token1: True, ..., tokenN: True}, tag_value]`` where
    ``tag_value`` is ``'pos'`` or ``'neg'``. All ``pos`` entries come first.
    Progress is reported on standard output.

    Returns:
        list: exactly one ``[feature_dict, tag_value]`` entry per file.

    Raises:
        OSError: if a tag directory or one of its files cannot be read.
    """
    list_data = list()
    for position, tag in enumerate(('pos', 'neg')):
        if position:
            # Blank line between the two progress reports.
            print("")
        print("Reading "+tag+" tag files....")
        tag_dir = os.path.join(relative_path, tag)
        file_count = 0
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # returned list identical from one run or machine to the next.
        for file_name in sorted(os.listdir(tag_dir)):
            # Explicit UTF-8 so decoding does not depend on the platform's
            # default encoding.
            with open(os.path.join(tag_dir, file_name), encoding='utf-8') as fp:
                # Read the whole file: one sample per file, as documented,
                # rather than one sample per line.
                content = fp.read()
            list_data.append([feature_extract(content), tag])
            file_count += 1
        print("Number of "+tag+" files read: "+str(file_count))

    return list_data
#---------------------------------------------------------------------


**Loading training set**

In [3]:
# Training split of the IMDB corpus; load_data reads its pos/ and neg/ sub-directories.
relative_path = './data/aclImdb/train'
list_train = load_data(relative_path)

Reading pos tag files....
Number of pos files read: 12500

Reading neg tag files....
Number of neg files read: 12500


list_train is composed of lists, as follows:

list_train=[list_1, list_2,...,list_N]

and 

list_1=[dict_encoded_value,category_value]

In [24]:
# Inspect the first training sample, a [feature_dict, tag] pair.
list_1 = list_train[0]
dict_encoded_value_1, category_value = list_1
category_value, dict_encoded_value_1

('pos',
 {'!': True,
  '&': True,
  "'m": True,
  "'s": True,
  '(': True,
  ')': True,
  ',': True,
  '.': True,
  '...': True,
  '4': True,
  'A': True,
  'Appolonia': True,
  'B': True,
  'Baby': True,
  'Beautiful': True,
  'Blue': True,
  'Computer': True,
  'Crazy': True,
  'Cry': True,
  'Darling': True,
  'Die': True,
  'Doves': True,
  'Go': True,
  'I': True,
  'It': True,
  'Let': True,
  'Me': True,
  'Nikki': True,
  'Ones': True,
  'Purple': True,
  'R': True,
  'Rain': True,
  'Star': True,
  'Take': True,
  'The': True,
  'U': True,
  'When': True,
  'With': True,
  'Would': True,
  'a': True,
  'album': True,
  'all': True,
  'and': True,
  'angry': True,
  'anthem': True,
  'appropriate': True,
  'are': True,
  'as': True,
  'ballad': True,
  'beginning': True,
  'best': True,
  'both': True,
  'cheerful': True,
  'classic': True,
  'climax': True,
  'closest': True,
  'course': True,
  'ending': True,
  'ever': True,
  'for': True,
  'fun': True,
  'funniest': True,


* Distributions

```
         +----------+----------+
         |   pos    |   neg    |
  -------+----------+----------+-------------+
  token1 |   N11    |   N12    | P(C|token1) |  Probability that token1 belongs to category C
  -------+----------+----------+-------------+
  token2 |   N21    |   N22    | P(C|token2) |  Probability that token2 belongs to category C
  -------+----------+----------+-------------+
  token3 |   N31    |   N32    | P(C|token3) |  Probability that token3 belongs to category C
  -------+----------+----------+-------------+
                     . . .
  -------+----------+----------+-------------+
  tokenK |   NK1    |   NK2    | P(C|tokenK) |  Probability that tokenK belongs to category C
  -------+----------+----------+-------------+
         | P(X|pos) | P(X|neg) |
         +----------+----------+
```

**Training classifier**

In [4]:
from nltk.classify import NaiveBayesClassifier

# Train NLTK's Naive Bayes classifier on the [feature_dict, tag] samples.
classifier = NaiveBayesClassifier.train(list_train)

**Display the most informative features, i.e. those whose likelihood P(feature|C) differs the most between the categories C.**

In [5]:
# Print the 10 most informative features with their per-category ratios.
classifier.show_most_informative_features(n=10)

Most Informative Features
                   Avoid = True              neg : pos    =     93.4 : 1.0
                    2/10 = True              neg : pos    =     75.7 : 1.0
                    4/10 = True              neg : pos    =     64.2 : 1.0
                    *1/2 = True              neg : pos    =     57.0 : 1.0
                    3/10 = True              neg : pos    =     43.6 : 1.0
                    Boll = True              neg : pos    =     37.7 : 1.0
                     Uwe = True              neg : pos    =     36.3 : 1.0
                    7/10 = True              pos : neg    =     33.2 : 1.0
                   WORST = True              neg : pos    =     27.8 : 1.0
                    8/10 = True              pos : neg    =     27.5 : 1.0


**Loading test set**

In [6]:
# Test split of the IMDB corpus. Note that relative_path is rebound here.
relative_path = './data/aclImdb/test'
list_test = load_data(relative_path)

Reading pos tag files....
Number of pos files read: 12500

Reading neg tag files....
Number of neg files read: 12500


**Model performance evaluation from accuracy**

In [7]:
# Accuracy of the trained classifier on the held-out test samples.
print(nltk.classify.accuracy(classifier, list_test)) 

0.83076


# Example from the NLTK book website

* http://www.nltk.org/book/ch06.html

This corpus contains lists of names split into 2 categories: male and female.

* Male names are provided as a list of names.
* Female names are provided as a list of names.
* ``list_labeled_names`` is a list built by assigning a category to each name: [(name, category),...]

In [8]:
from nltk.corpus import names
# Label every name of the NLTK `names` corpus with its category: all male
# names first, then all female names.
list_labeled_names = [(name, 'male') for name in names.words('male.txt')]
list_labeled_names += [(name, 'female') for name in names.words('female.txt')]


The labeled names are shuffled

In [9]:
import random
# Sort first, then shuffle with a private fixed-seed generator. The resulting
# order -- and therefore the train/test split sliced from it later -- is the
# same on every run, even when this cell is executed more than once, and the
# global `random` state is left untouched.
list_labeled_names.sort()
random.Random(42).shuffle(list_labeled_names)


# Last labelled name after shuffling, and the container type.
list_labeled_names[-1], type(list_labeled_names)

(('Kerri', 'female'), list)

In [10]:
# What are the relevant features to classify gender? How to encode them?

# Assume the relevant feature for classifying a name's gender is the last letter of the name.
# Feature extraction therefore produces the format {'last_letter': value},
# where value is the last letter of the name.

def gender_features_exract(word):
    """Return the last-letter feature of a name.

    The final character of ``word``, lower-cased, is the only feature
    extracted. It is returned as ``{'last_letter': last_letter_value}``,
    which gives the data model a single ``'last_letter'`` column.
    An empty ``word`` yields ``{'last_letter': ''}`` instead of raising
    ``IndexError``.
    """
    # Slicing (unlike indexing) tolerates the empty string; lower() keeps the
    # value consistent with the one produced by gender_features_exract2.
    return {'last_letter': word[-1:].lower()}

def gender_features_exract2(name):
    """Return a richer, case-insensitive feature set for a name.

    The returned dictionary contains:
    * ``'first_letter'`` / ``'last_letter'``: first and last character of
      ``name``, lower-cased (empty string when ``name`` is empty);
    * ``'count(x)'``: number of occurrences of letter ``x`` in the name;
    * ``'has(x)'``: whether letter ``x`` occurs in the name;
    for every ``x`` in the 26 ASCII lowercase letters.
    """
    # Lower-case once instead of once per letter and per feature.
    lowered = name.lower()
    features = {}
    # Slices, not indexes, so that an empty name does not raise IndexError.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features

In [11]:
# Example call: show the full feature dictionary extracted for one name.
gender_features_exract2('François')

{'count(a)': 1,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 1,
 'count(g)': 0,
 'count(h)': 0,
 'count(i)': 1,
 'count(j)': 0,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 1,
 'count(s)': 1,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'f',
 'has(a)': True,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': True,
 'has(g)': False,
 'has(h)': False,
 'has(i)': True,
 'has(j)': False,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': True,
 'has(s)': True,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 's'}

* Building the data model leads to a contingency matrix as follows:



```
-----------------------------------------
 last_letter  |   Female   |    Male
-----------------------------------------
      s       |    N00     |    N01
-----------------------------------------
      e       |    N10     |    N11
-----------------------------------------
                   .....
-----------------------------------------
      t       |    Nj0     |    Nj1
-----------------------------------------
      a       |    Nn0     |    Nn1
-----------------------------------------
```
            

In [12]:
# Pair the last-letter feature dictionary of each name with its category.
list_featuresets = [
    (gender_features_exract(name), label)
    for name, label in list_labeled_names
]

In [13]:
# Hold out the first 500 shuffled samples for testing; train on the rest.
test_set = list_featuresets[:500]
train_set = list_featuresets[500:]
# NOTE: rebinds `classifier`, which held the IMDB sentiment model until here.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
# Accuracy of the last-letter classifier on the 500 held-out names.
print(nltk.classify.accuracy(classifier, test_set))

0.798


In [15]:
# Category labels known to the current classifier.
classifier.labels()

['female', 'male']

In [17]:
# Same pairing as before, using the richer gender_features_exract2 features.
list_featuresets2 = [
    (gender_features_exract2(name), label)
    for name, label in list_labeled_names
]

In [18]:
# Hold out the first 500 samples for testing; train on the rest.
test_set2 = list_featuresets2[:500]
train_set2 = list_featuresets2[500:]
# NOTE: rebinds `classifier` again, replacing the last-letter model.
classifier = nltk.NaiveBayesClassifier.train(train_set2)

In [19]:
# Not the last expression of the cell, so this value is evaluated but not rendered.
classifier.labels()
# Print the 10 most informative features of the richer-feature classifier.
classifier.show_most_informative_features(n=10)

Most Informative Features
             last_letter = 'a'            female : male   =     38.6 : 1.0
             last_letter = 'k'              male : female =     30.8 : 1.0
             last_letter = 'f'              male : female =     16.0 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0
             last_letter = 'd'              male : female =     10.3 : 1.0
             last_letter = 'o'              male : female =      8.6 : 1.0
             last_letter = 'm'              male : female =      8.5 : 1.0
             last_letter = 'r'              male : female =      6.3 : 1.0
             last_letter = 'g'              male : female =      5.6 : 1.0
