In [2]:
"""
1er Modèle

Author : bsanchez@starclay.fr
date : 20/07/2020
"""

import sys
sys.path.append("..")
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn import tree
from sklearn.tree import export_graphviz

import matplotlib.pyplot as plt
import scikitplot as skplt
plt.rcParams["figure.figsize"] = (20,10)

np.random.seed(42)

In [1]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk

#stop-words
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words=set(nltk.corpus.stopwords.words('english'))

# tokenizing
from nltk import word_tokenize,sent_tokenize

#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model

ModuleNotFoundError: No module named 'nltk'

In [2]:
sample_text_1 = "groupe pomona"
sample_text_2 = "education national"
sample_text_3 = "gendarmerie national"
sample_text_4 = "manpower"
sample_text_5 = "Hopital teunon paris"

In [3]:
corp=[sample_text_1,sample_text_2,sample_text_3,sample_text_4,sample_text_5]
no_docs=len(corp)

In [4]:
corp

['groupe pomona',
 'education national',
 'gendarmerie national',
 'manpower',
 'Hopital teunon paris']

In [5]:
vocab_size=50 
encod_corp=[]
for i,doc in enumerate(corp):
    encod_corp.append(one_hot(doc,50))
    print("The encoding for document",i+1," is : ", one_hot(doc,50))

The encoding for document 1  is :  [7, 5]
The encoding for document 2  is :  [10, 27]
The encoding for document 3  is :  [20, 27]
The encoding for document 4  is :  [33]
The encoding for document 5  is :  [9, 13, 24]


In [6]:
encod_corp

[[7, 5], [10, 27], [20, 27], [33], [9, 13, 24]]

In [7]:
maxlen = -1
for doc in corp:
    tokens=nltk.word_tokenize(doc)
    if(maxlen<len(tokens)):
        maxlen=len(tokens)
print("The maximum number of words in any document is : ", maxlen)

The maximum number of words in any document is :  3


In [103]:
import pandas as pd
a = pd.DataFrame([[["a","a"],["a"]],[["a","a"],["a"]],[["a","a"],["a"]]],columns=["za","ba"])
# z = [a[col] for col in a]
np.asarray(a["za"].values.tolist()).shape

(3, 2)

In [9]:
pad_corp = pad_sequences(encod_corp,maxlen=maxlen,padding='post',value=0.0)
pad_corp

array([[ 7,  5,  0],
       [10, 27,  0],
       [20, 27,  0],
       [33,  0,  0],
       [ 9, 13, 24]], dtype=int32)

In [10]:
for i,doc in enumerate(pad_corp):
     print("The padded encoding for document",i+1," is : ",doc)

The padded encoding for document 1  is :  [7 5 0]
The padded encoding for document 2  is :  [10 27  0]
The padded encoding for document 3  is :  [20 27  0]
The padded encoding for document 4  is :  [33  0  0]
The padded encoding for document 5  is :  [ 9 13 24]


In [11]:
input = Input(shape=(no_docs,maxlen),dtype='float64')
print(input)

Tensor("input_1:0", shape=(None, 5, 3), dtype=float64)


In [12]:
word_input = Input(shape=(maxlen,),dtype='float64')  

# creating the embedding
word_embedding = Embedding(input_dim=vocab_size
                         ,output_dim=8
                         ,input_length=maxlen)(word_input)

word_vec = Flatten()(word_embedding) # flatten
embed_model = Model([word_input],word_vec) # combining all into a Keras model

In [13]:
embed_model.compile(optimizer=keras.optimizers.Adam(lr=1e-3),loss='binary_crossentropy',metrics=['acc']) 

In [14]:
print(type(word_embedding))
print(word_embedding)

<class 'tensorflow.python.framework.ops.Tensor'>
Tensor("embedding/embedding_lookup/Identity_1:0", shape=(None, 3, 8), dtype=float32)


In [15]:
print(embed_model.summary()) # summary of the model

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 3, 8)              400       
_________________________________________________________________
flatten (Flatten)            (None, 24)                0         
Total params: 400
Trainable params: 400
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
embeddings=embed_model.predict(pad_corp) # finally getting the embeddings.

In [17]:
print("Shape of embeddings : ",embeddings.shape)
print(embeddings)

Shape of embeddings :  (5, 24)
[[-5.3673498e-03 -1.8788934e-02 -4.4557441e-02 -2.6957249e-02
  -6.2922612e-03 -1.4181305e-02 -3.0416477e-02 -2.8175617e-02
   1.7932821e-02  1.7127883e-02  3.8691927e-02 -1.8712878e-02
   3.8859393e-02 -3.1095792e-02  2.3134384e-02 -2.9454781e-02
   4.0332008e-02 -3.6765348e-02  4.9045954e-02 -4.7364235e-03
  -2.8743673e-02 -4.3033481e-02  2.7902950e-02 -3.7243377e-02]
 [-1.7941654e-02 -3.5849582e-02  1.7087534e-04 -3.2103911e-02
  -1.5646923e-02  2.7943339e-02 -4.7287717e-03 -2.8392745e-02
  -1.7254472e-02  1.9911043e-03 -1.3535045e-02  1.8280435e-02
  -1.4121603e-02 -6.5667555e-04  6.1333887e-03 -3.4374021e-02
   4.0332008e-02 -3.6765348e-02  4.9045954e-02 -4.7364235e-03
  -2.8743673e-02 -4.3033481e-02  2.7902950e-02 -3.7243377e-02]
 [-2.2350682e-02 -1.5841924e-02 -3.5081852e-02  4.5776252e-02
  -2.3162497e-02 -4.8936989e-02 -8.7780952e-03 -2.6657630e-02
  -1.7254472e-02  1.9911043e-03 -1.3535045e-02  1.8280435e-02
  -1.4121603e-02 -6.5667555e-04  6.13

In [18]:
embeddings.shape

(5, 24)

In [19]:
embed_model.predict([[0,26,2],[12,9,0]])

array([[ 0.04033201, -0.03676535,  0.04904595, -0.00473642, -0.02874367,
        -0.04303348,  0.02790295, -0.03724338,  0.04961969, -0.02003281,
        -0.04558759,  0.01380639,  0.03038007, -0.0179113 , -0.01952933,
        -0.0267565 , -0.0331312 , -0.00247995, -0.01352979, -0.03143342,
         0.00326723,  0.00689859, -0.0202565 ,  0.03883375],
       [ 0.04982357,  0.04128337, -0.01227608, -0.02850893,  0.01289573,
        -0.01466572,  0.04804214,  0.04450727, -0.03580081,  0.02767037,
         0.03869518,  0.00466995, -0.00035101, -0.03736404,  0.04499766,
        -0.02865592,  0.04033201, -0.03676535,  0.04904595, -0.00473642,
        -0.02874367, -0.04303348,  0.02790295, -0.03724338]],
      dtype=float32)

In [20]:
from sklearn.feature_extraction import DictVectorizer
a = DictVectorizer(sparse=False)

In [30]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
y = [0,1,0,0,0,0]
a = pd.DataFrame([["a","a"],["a","c"],["a","a"],["a","d"],["a","f"],["a","c"]],columns=["un","deux"])
a['bar'] = +a['un']+' '+a['deux']
le = preprocessing.LabelEncoder()
a['bar'] = le.fit_transform(a['bar'])
a

Unnamed: 0,un,deux,bar
0,a,a,0
1,a,c,1
2,a,a,0
3,a,d,2
4,a,f,3
5,a,c,1


In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(a, y, test_size=0.33, random_state=42)

In [34]:
X_train

Unnamed: 0,un,deux,bar
5,a,c,1
2,a,a,0
4,a,f,3
3,a,d,2


---------------
# LSTM EMBEDDING NO PADDING
---------------------

In [56]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed
from keras.utils import to_categorical
import numpy as np

In [85]:
data = np.asarray([
     tf.convert_to_tensor(np.asarray([10,5,7]).astype(np.float32))
    ,tf.convert_to_tensor(np.asarray([8,16]).astype(np.float32))
    ,tf.convert_to_tensor(np.asarray([2,3]).astype(np.float32))
    ,tf.convert_to_tensor(np.asarray([1,2]).astype(np.float32))
    ,tf.convert_to_tensor(np.asarray([9,9]).astype(np.float32))
])
labels = [1,1,2,3,5]

In [93]:
inp = Input(shape=(None, 20)) 
lstm = tf.keras.layers.LSTM(4)
lstm.inputs(inp)
output = lstm(data)

AttributeError: 'LSTM' object has no attribute 'inputs'

In [87]:
model.fit_generator(data,labels,  verbose=1)

In [1]:
import os

In [40]:
lstm = tf.keras.layers.LSTM(1)

In [41]:
output = lstm(data)

ValueError: Input 0 of layer lstm_5 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [1, 3]

------
# FASTTEXT
------

In [18]:
import tensorflow as tf
tf.compat.v1.disable_eager_execution() # right after `import tensorflow as tf` : MEMORY LEAK 

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk

#stop-words
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words=set(nltk.corpus.stopwords.words('english'))

# tokenizing
from nltk import word_tokenize,sent_tokenize

#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input, LSTM
from keras.models import Model

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [19]:
ft_model = load_model('cc.fr.300.bin')



In [20]:
n_features = ft_model.get_dimension()

In [21]:
my_data = [
    "bateau chien"
    ,"berger mouton chien"
]

In [22]:
dico_vocab = Dictionary() 
list_len_largest_token = -1
encoded_corpus = []
nb_docs = len(my_data)
flatten = lambda l: [item for sublist in l for item in sublist]                
list_len_largest_token = -1
                
fdist = FreqDist()
                
#Determiner la plus grande list de token 
for index_doc, doc in enumerate(my_data):
    tokens = str(doc).split(" ")
    dico_vocab.add_documents([tokens])
    for token in tokens:
        fdist[token.lower()] += 1
    if(list_len_largest_token < len(tokens)):
        list_len_largest_token = len(tokens)

more_than_one = list(filter(lambda x: x[1] >= 2, fdist.items()))

vocab_size = len(dico_vocab) + 1

print(f"vocab_size {vocab_size} (filtré) vs {len(dico_vocab)}")

print(list_len_largest_token)

vocab_size 5 (filtré) vs 4
3


In [23]:
embedding_matrix = np.zeros((vocab_size, 300))

In [24]:
for index_doc, doc in enumerate(my_data):
    tokens = doc.split(" ")
    my_data[index_doc] = tokens
tokenizer = Tokenizer(num_words=vocab_size, lower=True, char_level=False)
tokenizer.fit_on_texts(my_data)

In [25]:
word_seq_train = tokenizer.texts_to_sequences(my_data)
word_seq_train

[[2, 1], [3, 4, 1]]

In [26]:
for x in range(len(word_seq_train)):
    word_seq_train[x] = flatten(pad_sequences([word_seq_train[x]]
                                                          , maxlen = 5
                                                          , padding = 'post'
                                                          , value = 0.0).tolist())

In [27]:
words_not_found = []
embed_dim = 300
embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, i in tokenizer.word_index.items():
    print(i)
    embedding_vector = ft_model[word]
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print(f'number of null word embeddings: {np.sum(np.sum(embedding_matrix, axis=1) == 0)}')

1
2
3
4
number of null word embeddings: 1


In [28]:
word_seq_train

[[2, 1, 0, 0, 0], [3, 4, 1, 0, 0]]

In [34]:
# init model
model = Sequential()
# emmbed word vectors
model.add(Embedding(vocab_size
                    ,300
                    , weights = [embedding_matrix]
                    , trainable = False
                    , mask_zero = True ))
# learn the correlations

model.add(LSTM(256,return_sequences=False))

model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['acc'])

In [45]:
full_col_vector = model.predict(np.array(word_seq_train))

In [46]:
full_col_vector.shape

(2, 256)

In [47]:
full_col_vector

array([[-7.89595651e-04, -7.37569109e-03,  2.08314005e-02,
        -1.73259098e-02,  1.71414260e-02, -5.90225682e-03,
         6.87512057e-03,  1.80747602e-02,  2.17018393e-03,
        -1.96348373e-02, -1.64068528e-02,  1.95564907e-02,
         1.39490496e-02,  1.63350031e-02,  3.45415995e-03,
        -6.01316860e-04, -1.36102876e-03, -9.70504899e-03,
        -4.19419585e-03,  6.98120566e-03,  3.29457433e-03,
        -1.02660814e-02,  2.48544998e-02, -8.49526282e-03,
        -2.00961111e-03, -2.67344713e-02, -7.27819139e-03,
        -1.72056891e-02,  1.76884308e-02, -6.51207101e-03,
         2.20525754e-03, -7.55632718e-05,  2.14732271e-02,
         1.47005112e-03, -1.24273049e-02,  1.34349950e-02,
         1.27875777e-02,  9.53956158e-04, -2.79732514e-03,
        -4.61942097e-03,  7.19183311e-03, -7.64042046e-03,
         1.11569688e-02,  3.12791346e-03, -4.70059877e-03,
         2.14979425e-02, -1.72848441e-02,  5.89560485e-03,
        -1.58366766e-02, -1.47881582e-02,  1.05014220e-0

In [15]:
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [4]:
from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
import numpy as np
import matplotlib.pyplot as plt
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer
import wikipedia

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
artificial_intelligence = wikipedia.page("Artificial Intelligence").content

In [6]:
z = sent_tokenize(artificial_intelligence)

In [7]:
word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in z]

In [8]:
# word_tokenized_corpus

In [9]:
# from gensim.models import FastText  # FIXME: why does Sphinx dislike this import?
from gensim.test.utils import common_texts  # some example sentences

print(common_texts[0])
print(len(common_texts))

model = FastText(size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train

['human', 'interface', 'computer']
9


In [10]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [11]:
list_doc = ["education nationale","gendarmerie nationale","manpower","pomona","starclay","bnp"]

In [12]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(list_doc)]

In [55]:
tagged_data

[TaggedDocument(words=['education', 'nationale'], tags=['0']),
 TaggedDocument(words=['gendarmerie', 'nationale'], tags=['1']),
 TaggedDocument(words=['manpower'], tags=['2']),
 TaggedDocument(words=['pomona'], tags=['3']),
 TaggedDocument(words=['starclay'], tags=['4']),
 TaggedDocument(words=['bnp'], tags=['5'])]

In [14]:
max_epochs = 100
vec_size = 20
alpha = 0.025

In [15]:
model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)



In [16]:
for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

  after removing the cwd from sys.path.


In [57]:
data = ["education nationale","gendarmerie nationale","manpower","pomona","starclay","bnp"]

In [84]:
train_corpus = tagged_data


model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(str(doc_id))
    ranks.append(rank)
    second_ranks.append(sims[1])

In [110]:
sc = []
sc.append(model.infer_vector(['France','Telecom']))
sc.append(model.infer_vector(['Cuisine','Telecom']))

In [111]:
g = pd.DataFrame(sc)

In [116]:
# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

-0.0009312409092672169
-0.008755287155508995


In [119]:
ll = [["1erdoc",'1erdoccol1'],["2èmedoc","2èmedoccol2"]]
pd.DataFrame(ll)

Unnamed: 0,0,1
0,1erdoc,1erdoccol1
1,2èmedoc,2èmedoccol2


In [76]:
ranks

[5, 4, 0, 2, 3, 2]

In [75]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({2: 2, 5: 1, 4: 1, 0: 1, 3: 1})


----------------------


In [46]:
corpus = [[['1er doc col','2ème doc']]
df = pd.DataFrame([{"lol":"bernard tuui"},{"lol":"MV POON FALL"},{"lol":"HASTAG DIEZE"}])

In [50]:
corpus.append(df.lol.tolist())
flatten(corpus)

['1er doc',
 '2ème doc',
 'bernard tuui',
 'MV POON FALL',
 'HASTAG DIEZE',
 'bernard tuui',
 'MV POON FALL',
 'HASTAG DIEZE']

In [49]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [7]:
finder.apply_freq_filter(10)

NameError: name 'finder' is not defined

In [8]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [53]:
from multiprocessing import cpu_count
num_cores = cpu_count()
num_cores

72

In [9]:
text = a.fillna('UNKNOW').values

NameError: name 'a' is not defined

In [10]:
corpus_text = [x.split(" ") for sub in text for x in sub]

NameError: name 'text' is not defined

In [11]:
# corpus_text

In [12]:
bigram = gensim.models.Phrases(corpus_text, min_count=2, threshold=1)

NameError: name 'corpus_text' is not defined

In [13]:
# bigram_mod = gensim.models.phrases.Phraser(bigram)

In [14]:
bigram = gensim.models.Phrases(corpus_text)
result = [bigram[line] for line in corpus_text]

NameError: name 'corpus_text' is not defined

In [15]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [16]:
data_words_bigrams  = make_bigrams(a.values)

NameError: name 'a' is not defined

In [17]:
for item in a.fillna('UNKNOW').values:
    print(bigram[item])
    break

NameError: name 'a' is not defined

In [18]:
flat_res = [" ".join(x) for x in result]

NameError: name 'result' is not defined

In [19]:
a['rs_x'] = flat_res

NameError: name 'flat_res' is not defined

In [20]:
len(flat_res)

NameError: name 'flat_res' is not defined

----------------

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from gensim import corpora, matutils, models, similarities
import pandas as pd
import numpy as np

In [2]:
dataset = [['calaamar','pyjama','chateau'],['pyjama', 'karaté','kimono'],['chateau','uruguay'],['papaye']]

dct = Dictionary()
corpus = []
for line in dataset:
    print([line])
    dct.add_documents([line])
    corpus.append(dct.doc2bow(line))
model = TfidfModel(corpus)

[['calaamar', 'pyjama', 'chateau']]
[['pyjama', 'karaté', 'kimono']]
[['chateau', 'uruguay']]
[['papaye']]


In [3]:
tf_sparse_array = matutils.corpus2csc(corpus)

In [4]:
print(tf_sparse_array)

  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (2, 1)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (1, 2)	1.0
  (5, 2)	1.0
  (6, 3)	1.0


In [5]:
row, col = tf_sparse_array.nonzero()
for row, col in zip(row, col):
    print(str(row) + ' ' + str(col))

0 0
1 0
1 2
2 0
2 1
3 1
4 1
5 2
6 3


In [6]:
def top_n_words(matrix,nword):
    count = {}
    row, col = tf_sparse_array.nonzero()
    for row, col in zip(row, col):
        if row in count:
            count[row] = count[row] + 1
        else:
            count[row] = 1
    words = list(count.items())
    words.sort(key=lambda tup: tup[1],reverse=True)
    words = words[0:nword]
    words = [i[0] for i in words]
    return words

In [7]:
top_n_words(tf_sparse_array,2)

[1, 2]

In [8]:
vector2 = model[corpus[0]]

In [9]:
model.cfs

In [10]:
dct.token2id

{'calaamar': 0,
 'chateau': 1,
 'pyjama': 2,
 'karaté': 3,
 'kimono': 4,
 'uruguay': 5,
 'papaye': 6}

In [11]:
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1)],
 [(1, 1), (5, 1)],
 [(6, 1)]]

In [12]:
dct.token2id['pyjama']

2

In [13]:
vector2 = model[corpus]
vector2

<gensim.interfaces.TransformedCorpus at 0x7fb347a2c9b0>

In [14]:
tf_cols_name =  {v: k for k, v in dct.token2id.items()}

In [18]:
vector2.corpus

[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1)],
 [(1, 1), (5, 1)],
 [(6, 1)]]

In [19]:
vector2[0]

[(0, 0.8164965809277261), (1, 0.4082482904638631), (2, 0.4082482904638631)]

In [20]:
tf_cols_name.values()

dict_values(['calaamar', 'chateau', 'pyjama', 'karaté', 'kimono', 'uruguay', 'papaye'])

In [21]:
for item in tf_cols_name:
    print(tf_cols_name[item])

calaamar
chateau
pyjama
karaté
kimono
uruguay
papaye


In [22]:
pd.DataFrame(columns = colss)

NameError: name 'colss' is not defined

In [23]:
len(tf_cols_name)

7

In [31]:
def tfidf_to_sparse_array(matrix,vocab,tops=[],):
    row = []
    col = []
    val = []
    if len(tops) != 0:
        for index_doc, doc in enumerate(vector2):
            for item in doc:
                if item[0] in tops:
                    col.append(item[0])
                    row.append(index_doc)
                    val.append(item[1])
            A = sparse.csc_matrix((val,(row,col)),shape=(len(vector2.corpus),len(tops)))
    else:
        for index_doc, doc in enumerate(vector2):
            for item in doc:
                col.append(item[0])
                row.append(index_doc)
                val.append(item[1])
            A = sparse.csc_matrix((val,(row,col)),shape=(len(vector2.corpus),len(vocab)))
   
    return A

In [32]:
for item in vector2:
    print(item)

[(0, 0.8164965809277261), (1, 0.4082482904638631), (2, 0.4082482904638631)]
[(2, 0.3333333333333333), (3, 0.6666666666666666), (4, 0.6666666666666666)]
[(1, 0.4472135954999579), (5, 0.8944271909999159)]
[(6, 1.0)]


In [33]:
tops = top_n_words(vector2,2)
print(tops)

[1, 2]


In [34]:
sp = tfidf_to_sparse_array(vector2,tf_cols_name)
print(sp.toarray())

[[0.81649658 0.40824829 0.40824829 0.         0.         0.
  0.        ]
 [0.         0.         0.33333333 0.66666667 0.66666667 0.
  0.        ]
 [0.         0.4472136  0.         0.         0.         0.89442719
  0.        ]
 [0.         0.         0.         0.         0.         0.
  1.        ]]


In [35]:
print(sp[:, 0].toarray().flatten())

[0.81649658 0.         0.         0.        ]


In [37]:
sp.shape

(4, 7)

In [366]:
tf_cols_name.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6])

In [28]:
from gensim.matutils import corpus2dense, corpus2csc

In [424]:
len(vector2.corpus)

4

In [431]:
corpus2csc(vector2).toarray()

array([[0.81649658, 0.        , 0.        , 0.        ],
       [0.40824829, 0.        , 0.4472136 , 0.        ],
       [0.40824829, 0.33333333, 0.        , 0.        ],
       [0.        , 0.66666667, 0.        , 0.        ],
       [0.        , 0.66666667, 0.        , 0.        ],
       [0.        , 0.        , 0.89442719, 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [410]:
def sparceMatrix_to_csv(origin, spmatrix,vocab,tops=[]):
    with open('tmp/dense_tf_idf.csv','w') as f:
        wr = csv.writer(f, delimiter=';')
        wr.writerow([x for x in vocab.values()])
        for index_line in range(0,sp.shape[0]):
            wr.writerow(sp[index_line, :].toarray().flatten())

In [395]:
sp[index_line, :].toarray()

NameError: name 'index_line' is not defined

In [139]:
tops = top_n_words(tf_sparse_array,2)

In [None]:
tfidf_to_sparse_array(vector2,tf_cols_name,tops).head()

In [30]:
from scipy import sparse
from numpy import array
I = array([0,3,1,0])
J = array([0,3,1,2])
V = array([4,5,7,9])
A = sparse.coo_matrix((V,(I,J)),shape=(4,4))

In [197]:
from scipy.sparse import lil_matrix
from scipy.sparse.linalg import spsolve
from numpy.linalg import solve, norm
from numpy.random import rand


A = lil_matrix((1000, 1000))
A[0, :100] = rand(100)
A[1, 100:200] = A[0, :100]
A.setdiag(rand(1000))

In [198]:
A = A.tocsr()
b = rand(1000)
x = spsolve(A, b)

In [201]:
x_ = solve(A.toarray(), b)

In [415]:
corpus2csc(vector2, num_terms, num_docs)

NameError: name 'corpus2csc' is not defined

In [414]:
vector2.__getitem__(0)

[(0, 0.8164965809277261), (1, 0.4082482904638631), (2, 0.4082482904638631)]

In [33]:
scipy.sparse.csr_matrix([[1, 2], [3, 0]])

<2x2 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [34]:
for item in vector2:
    print(item)

[(0, 0.8164965809277261), (1, 0.4082482904638631), (2, 0.4082482904638631)]
[(2, 0.3333333333333333), (3, 0.6666666666666666), (4, 0.6666666666666666)]
[(1, 0.4472135954999579), (5, 0.8944271909999159)]
[(6, 1.0)]


In [35]:
d = {dct.get(id): value for doc in vector2 for id, value in doc}

In [36]:
d

{'calaamar': 0.8164965809277261,
 'chateau': 0.4472135954999579,
 'pyjama': 0.3333333333333333,
 'karaté': 0.6666666666666666,
 'kimono': 0.6666666666666666,
 'uruguay': 0.8944271909999159,
 'papaye': 1.0}

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
with open('test.txt') as f:
    lines = [line.rstrip() for line in f]

In [39]:
vect_rs_x = TfidfVectorizer(input='content',max_features=250)
vector_rs_x = vect_rs_x.fit_transform(lines)

In [40]:
tf_cols_name =  {v: k for k, v in vect_rs_x.vocabulary_.items()}
df_vector_rs_x = pd.DataFrame(vector_rs_x.toarray())
df_vector_rs_x.rename(tf_cols_name,axis=1,inplace=True)

In [41]:
df_vector_rs_x

Unnamed: 0,bateau,chat,chien,je,kimono,loup,pyjama,suis,un
0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.57735
1,0.5,0.5,0.5,0.0,0.0,0.5,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.707107,0.0,0.707107,0.0,0.0


In [38]:
import pandas as pd

In [61]:
d = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9],[1,2,3],[4,5,6],[7,8,9],[1,2,3],[4,5,6],[7,8,9]])
d.head()

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9
3,1,2,3
4,4,5,6


In [66]:
d[2:4]

Unnamed: 0,0,1,2
2,7,8,9
3,1,2,3


In [42]:
import s3fs
import os
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'http://minio.stable.innovation.insee.eu'})
remote_data_dir="s3://groupe-1033/data"

In [46]:
for file in fs.ls("s3://groupe-1033/production/tfidf/"):
    print(file)

In [74]:
for x in fs.ls("s3://groupe-1033/production/tfidf",refresh=True):
    print(os.path.basename(x))

.keep
baseline.py
test


In [75]:
fs.touch("s3://groupe-1033/production/refined/.keep")

In [3]:
from abc import ABC
from abc import ABCMeta
from abc import abstractmethod

from abc import ABC

In [158]:

import itertools

import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

import imblearn
from imblearn.datasets import fetch_datasets
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.over_sampling import SMOTE

from imblearn.metrics import geometric_mean_score

In [159]:
satimage = fetch_datasets()['satimage']
X, y = satimage.data, satimage.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=0)

In [160]:
oversample = SMOTE()
print(len(y))
X, y = oversample.fit_resample(X, y)
print(len(y))

6435
11618


In [161]:
len(y)

11618

name


In [1]:
import yaml

In [26]:
with open('config.yaml') as f:
        configs = yaml.safe_load(f)
for model in configs['models']:
    for obj in model.keys():
        print(model[obj]['type'])
        arg = model[obj]['param']
#         class_ = getattr(thismodule, model[obj]['type'])
#         instance = class_(path_run = path_run,id_run=id_run)
#         list_model.append(instance)

Tree
XGBoost
LogRegr


In [27]:
print(arg)
arg['id']= 'dd'

{}


In [12]:
from datetime import date
t = date.today()
type(t.isoformat())

str