In [1]:
import pandas as pd
import string

from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk
import spacy
import fr_core_news_md

In [2]:
df1 = pd.read_csv('raw_data.csv', index_col=0)
df1

Unnamed: 0,activite,code_ape
0,"comptabilité, gestion des entreprises",69.20Z
1,débit de tabac - journaux - papeterie - bimbel...,47.11B
2,vente de contenu informatique.,47.91B
3,"presse, papeterie, jeux fdj, bimbeloterie, lib...",56.30Z
4,site e-commerce : vente en ligne de chocolats,47.91B
...,...,...
57508,Profession d'avocat,69.10Z
57509,"Snack, plats à emporter",56.10C
57510,"Exploitation de restaurant, pub, bar, discothè...",56.10A
57511,"Acquisition, administration, exploitation par ...",68.20B


In [3]:
#preprocessing 
# normalisation
df2 = df1[(df1.activite != "non renseigné")]
stopW = set(stopwords.words('french')) #  necessary dl

In [4]:
df2.info

<bound method DataFrame.info of                                                 activite code_ape
0                  comptabilité, gestion des entreprises   69.20Z
1      débit de tabac - journaux - papeterie - bimbel...   47.11B
2                         vente de contenu informatique.   47.91B
3      presse, papeterie, jeux fdj, bimbeloterie, lib...   56.30Z
4          site e-commerce : vente en ligne de chocolats   47.91B
...                                                  ...      ...
57508                                Profession d'avocat   69.10Z
57509                            Snack, plats à emporter   56.10C
57510  Exploitation de restaurant, pub, bar, discothè...   56.10A
57511  Acquisition, administration, exploitation par ...   68.20B
57512  L'achat et la gestion de centre de remise en f...   93.13Z

[53775 rows x 2 columns]>

In [5]:
df3 = df2[(df2.activite != "aucune d'activite")]
df3

Unnamed: 0,activite,code_ape
0,"comptabilité, gestion des entreprises",69.20Z
1,débit de tabac - journaux - papeterie - bimbel...,47.11B
2,vente de contenu informatique.,47.91B
3,"presse, papeterie, jeux fdj, bimbeloterie, lib...",56.30Z
4,site e-commerce : vente en ligne de chocolats,47.91B
...,...,...
57508,Profession d'avocat,69.10Z
57509,"Snack, plats à emporter",56.10C
57510,"Exploitation de restaurant, pub, bar, discothè...",56.10A
57511,"Acquisition, administration, exploitation par ...",68.20B


In [6]:
df3['activite'] = df3['activite'].str.lower()

In [7]:
nlp = fr_core_news_md.load()

In [8]:
df3['activite'] =  df3['activite'].apply(lambda x: [y.lemma_ for y in  nlp(x)])

In [9]:
#df3['activite'] = df3['activite'].apply(word_tokenize) #tokenization 

In [10]:
df3

Unnamed: 0,activite,code_ape
0,"[comptabilité, ,, gestion, de, entreprise]",69.20Z
1,"[débit, de, tabac, -, journal, -, papeterie, -...",47.11B
2,"[vente, de, contenu, informatique, .]",47.91B
3,"[presse, ,, papeterie, ,, jeu, fdj, ,, bimbelo...",56.30Z
4,"[site, e-commerce, :, vente, en, ligne, de, ch...",47.91B
...,...,...
57508,"[profession, de, avocat]",69.10Z
57509,"[snack, ,, plat, à, emporter]",56.10C
57510,"[exploitation, de, restaurant, ,, pub, ,, bar,...",56.10A
57511,"[acquisition, ,, administration, ,, exploitati...",68.20B


In [11]:
exclude = set(string.punctuation)

In [12]:
stopWs = stopW.union(exclude) # ne fonctionne pas avec append() ni + 

In [13]:
df3['activite'] = df3['activite'].apply(lambda x: [item for item in x if item not in stopWs])
df3

Unnamed: 0,activite,code_ape
0,"[comptabilité, gestion, entreprise]",69.20Z
1,"[débit, tabac, journal, papeterie, bimbeloteri...",47.11B
2,"[vente, contenu, informatique]",47.91B
3,"[presse, papeterie, jeu, fdj, bimbeloterie, li...",56.30Z
4,"[site, e-commerce, vente, ligne, chocolat]",47.91B
...,...,...
57508,"[profession, avocat]",69.10Z
57509,"[snack, plat, emporter]",56.10C
57510,"[exploitation, restaurant, pub, bar, discothèq...",56.10A
57511,"[acquisition, administration, exploitation, ba...",68.20B


In [14]:
from nltk import ngrams # ngramming
df3['activite'] = df3['activite'].apply(lambda row: list(nltk.ngrams(row, 2)))

In [15]:
df3

Unnamed: 0,activite,code_ape
0,"[(comptabilité, gestion), (gestion, entreprise)]",69.20Z
1,"[(débit, tabac), (tabac, journal), (journal, p...",47.11B
2,"[(vente, contenu), (contenu, informatique)]",47.91B
3,"[(presse, papeterie), (papeterie, jeu), (jeu, ...",56.30Z
4,"[(site, e-commerce), (e-commerce, vente), (ven...",47.91B
...,...,...
57508,"[(profession, avocat)]",69.10Z
57509,"[(snack, plat), (plat, emporter)]",56.10C
57510,"[(exploitation, restaurant), (restaurant, pub)...",56.10A
57511,"[(acquisition, administration), (administratio...",68.20B


In [16]:
df_test = df3[(df3.code_ape == "APE Non Diffusable")]

In [17]:
df_test

Unnamed: 0,activite,code_ape
30,"[(convoyage, vehicule), (vehicule, route)]",APE Non Diffusable
31,"[(achat, vente), (vente, ferraille)]",APE Non Diffusable
63,"[(coursier, vélo)]",APE Non Diffusable
69,"[(ménage, aide), (aide, préparation), (prépara...",APE Non Diffusable
89,"[(vente, plat), (plat, préparer), (préparer, e...",APE Non Diffusable
...,...,...
57422,"[(conseil, voyage), (voyage, prestataire), (pr...",APE Non Diffusable
57459,"[(vente, vêtement), (vêtement, internet)]",APE Non Diffusable
57465,"[(location, voiture), (voiture, commerce), (co...",APE Non Diffusable
57467,"[(chauffeur, livreur), (livreur, repas), (repa...",APE Non Diffusable


In [19]:
df_training = df3[(df3.code_ape != "APE Non Diffusable")]
df_training  

Unnamed: 0,activite,code_ape
0,"[(comptabilité, gestion), (gestion, entreprise)]",69.20Z
1,"[(débit, tabac), (tabac, journal), (journal, p...",47.11B
2,"[(vente, contenu), (contenu, informatique)]",47.91B
3,"[(presse, papeterie), (papeterie, jeu), (jeu, ...",56.30Z
4,"[(site, e-commerce), (e-commerce, vente), (ven...",47.91B
...,...,...
57508,"[(profession, avocat)]",69.10Z
57509,"[(snack, plat), (plat, emporter)]",56.10C
57510,"[(exploitation, restaurant), (restaurant, pub)...",56.10A
57511,"[(acquisition, administration), (administratio...",68.20B
