Objectifs :
- Appliquer un traitement classique du jeu de données ;
- Implémenter le bag of words ;
- Implémenter le TF-IDF.

## Importations des packages

In [None]:
from nltk.tokenize import word_tokenize
from nltk.text import Text
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Download the NLTK data packages: punkt tokenizer models, stop-word lists,
# WordNet and the averaged-perceptron POS tagger.
# NOTE(review): only the stop-word lists are used by the cells visible in this
# notebook; the other packages may be needed elsewhere — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Importations des données

Les données sont issues de [cette base de données](https://cs.nyu.edu/~kcho/DMQA/).

Maintenant que nos outils sont chargés, nous allons charger nos données.

Cliquez sur le lien ci-dessous :

https://drive.google.com/drive/folders/12OmusfAUOcoLOCwEc--nfkKQ5eEozU45?usp=sharing

Faites un clic droit sur le dossier data, puis cliquez sur « Ajouter à mon Drive ».

In [None]:
# Mount Google Drive at /content/gdrive/ so the shared data folder can be read
# from the notebook.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


Les données sont maintenant dans votre environnement Colab.

In [None]:
import os

# Sanity check: show the first ten entries of the data folder.
story_files = os.listdir('gdrive/MyDrive/Exercice_1/Partie_1')
print(story_files[:10])

['0a3ff2f0a147c158845afa44d2a012064896566b.story', '0a3fff5779a8f7cfdde5d284a429ab89fd5e85df.story', '0a0f56ebc5a0a67ed18de79d99b40a42d8058d04.story', '0a3ad75d92c5bc2eccf2763df86afe5ddeffed75.story', '0a3f2400ba4e5cdf4b3638ae6fb60fdfa12a2680.story', '0a3f567efff9f0748b2758c9e8c17dc66beade04.story', '0a05b14962b2e73bbff82086762e0e23d32b359f.story', '0a1ad82d161d90d758240407cb8c8fcebff4a212.story', '0a4ec4d37683347ca62b53982d2c5f4efb86f444.story', '0a4b2d4ea5fb0625e3e747525062f0a85345e4df.story']


# Création de la base de données

In [None]:
# Read every story file of the data folder into dict_data, keyed by a running
# integer index (0, 1, 2, ...). Each value is the whole file as one string,
# with newlines and "/" characters removed.
data_dir = 'gdrive/MyDrive/Exercice_1/Partie_1'

cmpt = 0
dict_data = dict()

# os.listdir returns entries in arbitrary order; sorting makes the
# index -> document mapping reproducible from one run to the next.
for file_name in sorted(os.listdir(data_dir)):
    # The context manager closes the file handle (the previous version leaked
    # it); the encoding is explicit so decoding does not depend on the locale.
    with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
        # Removing "/" also removes "//", so a single replace is enough.
        # NOTE(review): lines are concatenated without any separator, so the
        # last word of a line is glued to the first word of the next one.
        # Kept as-is to leave the resulting vocabulary unchanged; join with
        # " " instead if that fusion is not intended.
        lst = "".join(
            line.replace("\n", '').replace("/", '') for line in f
        )

    dict_data[cmpt] = lst
    cmpt += 1


In [None]:
# Number of documents loaded into dict_data.
len(dict_data)

50

# Prétraitements

## En minuscule

In [None]:
# Lower-case every document so that later token comparisons ignore case.
dict_data_min = {doc_id: text.lower() for doc_id, text in dict_data.items()}

## Tokenization

In [None]:
# Split each lower-cased document into tokens made of word characters only
# (the r'\w+' pattern drops punctuation and whitespace).
# The tokenizer does not depend on the document, so it is built once instead of
# once per loop iteration.
tokenizer = nltk.RegexpTokenizer(r'\w+')

dict_data_token = {k: tokenizer.tokenize(v) for k, v in dict_data_min.items()}

## Stopwords

In [None]:
# Remove stop words from every tokenized document.
#
# The stop-word list is loaded once and stored in a set: the previous version
# rebuilt the full list for every single token and scanned it linearly, which
# made this cell needlessly slow.
# NOTE(review): stopwords.words() called without a language returns the
# stop words of every language shipped with NLTK, not only English. This is
# kept to leave the resulting vocabulary unchanged; pass 'english' if only
# English stop words should be removed.
stop_words = set(nltk.corpus.stopwords.words())

dict_data_stop = {
    k: [w for w in v if w not in stop_words]
    for k, v in dict_data_token.items()
}

## Stemming

In [None]:
dict_data_stem = dict()
st = LancasterStemmer()

for index, doc in dict_data_stop.items()
  dict_data_stem[index] = [st.stem(w) for w in doc ]

# Bag of words

In [None]:
# The vocabulary is the set of distinct stems found across all documents.
vocabulary = {stem for stems in dict_data_stem.values() for stem in stems}

print(len(vocabulary))

4150


In [None]:
# Bag of words: one row per document, one column per vocabulary stem, each cell
# holding the number of occurrences of the stem in the document (as floats).
#
# The matrix is sized from the data and filled in a NumPy array, then wrapped
# in a DataFrame once. The previous version grew the frame row by row with
# DataFrame.append (removed in pandas 2.0), hard-coded the vocabulary size
# (4150) and passed a set as `columns`, which has no defined order.
columns = sorted(vocabulary)
col_pos = {w: j for j, w in enumerate(columns)}  # stem -> column position

counts = np.zeros((len(dict_data_stem), len(columns)))
for row, doc in enumerate(dict_data_stem.values()):
    for w in doc:
        counts[row, col_pos[w]] += 1

# Rows are labelled with the document keys of dict_data_stem.
data = pd.DataFrame(counts, index=list(dict_data_stem), columns=columns)

In [None]:
# Preview the first rows of the bag-of-words matrix.
data.head()

Unnamed: 0,near,homebuy,refer,harmony,netflix,900,constitut,resourc,rav,memoir,architect,famin,drug,pond,today,conf,peac,velvet,835,tend,czech,hour,off,vicy,iphon,bar,northwest,barcod,norad,darcy,52,pro,everybody,complaint,abras,build,carl,oust,leg,northwestern,...,chant,convers,excerpt,profit,34,exceiv,lengthy,crew,quo,delivery,null,push,boston,kilowat,zhang,hil,pres,contrast,pt,newsbad,multipl,hid,psychy,brisk,venu,boy,want,stir,describ,retain,kin,food,ub,glob,highlightpérez,spend,overlap,mem,pie,spokesperson
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Calcul du TF-IDF

## Calcul de la fréquence des mots dans le jeu de données

On crée un dictionnaire *doc_freq* qui comptabilise, pour chaque mot, le nombre de documents du jeu de données dans lesquels il apparaît.

In [None]:
# Document frequency: for every stem, the number of documents that contain it
# at least once (each document contributes at most 1 per stem, hence set(doc)).
doc_freq = {}
for doc in dict_data_stem.values():
    for w in set(doc):
        # dict.get replaces the former bare `except:`, which would also have
        # hidden unrelated errors (and KeyboardInterrupt).
        doc_freq[w] = doc_freq.get(w, 0) + 1

In [None]:
# Number of distinct stems that have a document frequency.
len(doc_freq)

4150

## Calcul du TF-IDF pour chacun des documents

In [None]:
# TF-IDF: one row per document, one column per vocabulary stem.
#   tf(w, d) = occurrences of w in d / total number of tokens in d
#   idf(w)   = log(N / (doc_freq[w] + 1)), N being the number of documents
# The weights are stored both in the `tf_idf` dict, keyed by (document, stem),
# and in the `data` DataFrame (which replaces the bag-of-words frame).
#
# Changes compared with the previous version:
# - tf is normalised by the document length; it used to be divided by the
#   number of *distinct* stems, which is not a term frequency;
# - the matrix is sized from the data and built once, instead of using
#   DataFrame.append (removed in pandas 2.0) and a hard-coded width of 4150;
# - counting uses dict.get instead of a bare `except:`.
#
# NOTE(review): with this idf formula a stem present in N-1 documents gets a
# weight of 0 and a stem present in every document gets a negative weight.
# Formula kept unchanged; consider log((N + 1) / (doc_freq[w] + 1)) if
# non-negative weights are required.
columns = sorted(vocabulary)
col_pos = {w: j for j, w in enumerate(columns)}  # stem -> column position
n_docs = len(dict_data_stem)

weights = np.zeros((n_docs, len(columns)))
tf_idf = dict()

for row, (index, doc) in enumerate(dict_data_stem.items()):
    # Raw number of occurrences of each stem in the current document.
    word_freq = dict()
    for w in doc:
        word_freq[w] = word_freq.get(w, 0) + 1

    # An empty document has no entry in word_freq, so len(doc) is never 0 here.
    for w, count in word_freq.items():
        tf = count / len(doc)

        idf = np.log(n_docs / (doc_freq[w] + 1))

        weights[row, col_pos[w]] = tf_idf[index, w] = tf * idf

# Rows are labelled with the document keys of dict_data_stem.
data = pd.DataFrame(weights, index=list(dict_data_stem), columns=columns)

In [None]:
# Preview the first rows of the TF-IDF matrix.
data.head()

Unnamed: 0,near,homebuy,refer,harmony,netflix,900,constitut,resourc,rav,memoir,architect,famin,drug,pond,today,conf,peac,velvet,835,tend,czech,hour,off,vicy,iphon,bar,northwest,barcod,norad,darcy,52,pro,everybody,complaint,abras,build,carl,oust,leg,northwestern,...,chant,convers,excerpt,profit,34,exceiv,lengthy,crew,quo,delivery,null,push,boston,kilowat,zhang,hil,pres,contrast,pt,newsbad,multipl,hid,psychy,brisk,venu,boy,want,stir,describ,retain,kin,food,ub,glob,highlightpérez,spend,overlap,mem,pie,spokesperson
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011177,0.007995,0.0,0.017862,0.010642,0.0,0.0,0.0,0.0,0.0,0.011177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00877,0.0,0.0,0.0,0.0,0.006363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.019812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.030361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006168,0.011025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011579,0.0,0.0,0.0,0.01634,0.0,0.046315,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011579,0.0,0.0,0.0,0.0,0.0,0.007072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009085,0.0,0.005789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010321,0.0,0.0,0.0,0.0,0.0,0.0,0.009002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008044,0.0,0.0,0.0,0.0,0.0,0.0,0.008448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.006629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006044,0.0,0.0,0.0,0.0
