In [127]:
#Importation des bibliothèques et du CSV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

df_original = pd.read_csv('spam.csv', encoding="ISO-8859-1")
df_original.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Nettoyage

In [128]:
#Retrait des colonnes inutiles
df = df_original[['v1', 'v2']].copy()
df.rename(columns={"v1": "label", "v2": "content"}, inplace = True)
df.head()

Unnamed: 0,label,content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [129]:
print(df.label.value_counts(normalize = True))
#Proportion 'ham' : 4825/5572 = 86.6%

ham     0.865937
spam    0.134063
Name: label, dtype: float64


In [130]:
df.drop_duplicates(inplace = True)
#Retrait des lignes identiques 

print(df.label.value_counts(normalize = True))
#Proportion 'ham' : 4516/5169 = 87.4%
#Le Ratio ham/spam est donc conservé à 0.8% près après retrait des doublons.

df.describe()

ham     0.87367
spam    0.12633
Name: label, dtype: float64


Unnamed: 0,label,content
count,5169,5169
unique,2,5169
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


In [131]:
df.to_csv('spam_clean.csv',index = False)
#export du dataframe nettoyé dans le dossier de travail sous le nom de 'spam_clean.csv'

# Features

In [133]:
df = pd.read_csv('spam_clean.csv')

In [136]:
#Ajout d'une feature length contenant le nombre de caractère compris dans le SMS
df['length'] = df['content'].apply(len)
df

Unnamed: 0,label,content,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61
...,...,...,...
5164,spam,This is the 2nd time we have tried 2 contact u...,161
5165,ham,Will Ì_ b going to esplanade fr home?,37
5166,ham,"Pity, * was in mood for that. So...any other s...",57
5167,ham,The guy did some bitching but I acted like i'd...,125


In [138]:
df.to_csv('spam_features.csv',index = False)

Unnamed: 0,label,content,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61
...,...,...,...
5164,spam,This is the 2nd time we have tried 2 contact u...,161
5165,ham,Will Ì_ b going to esplanade fr home?,37
5166,ham,"Pity, * was in mood for that. So...any other s...",57
5167,ham,The guy did some bitching but I acted like i'd...,125


# Preprocessing

In [146]:
# data
import pandas as pd

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler


# Pipeline and model
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# Score of models
from sklearn.metrics import accuracy_score

In [139]:
df = pd.read_csv('spam_features.csv')
print(df.head(),'\n\n Nombre de null :',df.content.isnull().sum())

  label                                            content  length
0   ham  Go until jurong point, crazy.. Available only ...     111
1   ham                      Ok lar... Joking wif u oni...      29
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
3   ham  U dun say so early hor... U c already then say...      49
4   ham  Nah I don't think he goes to usf, he lives aro...      61 

 Nombre de null : 0


In [140]:
#Target preprocessing
lb_binar = LabelBinarizer()
y = lb_binar.fit_transform(df.label)
type(y), y
# ham : 0, spam : 1

(numpy.ndarray,
 array([[0],
        [0],
        [1],
        ...,
        [0],
        [0],
        [0]]))

In [141]:
#Features preprocessing
X = df.drop(columns = 'label')
X.head()

Unnamed: 0,content,length
0,"Go until jurong point, crazy.. Available only ...",111
1,Ok lar... Joking wif u oni...,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,U dun say so early hor... U c already then say...,49
4,"Nah I don't think he goes to usf, he lives aro...",61


In [142]:
#Selection des features catégoriques
column_cat = df.select_dtypes(include=['object']).columns.drop(['content'])
column_cat

Index(['label'], dtype='object')

In [143]:
#Selection des features numériques
column_num = df.select_dtypes(include=['int','float'])
print(column_num)
#Pipeline de preprocessing des features numériques
transformer_num = Pipeline(steps = [
    ('imputation', SimpleImputer(strategy='median')), 
    ('scaling', MinMaxScaler())
])

      length
0        111
1         29
2        155
3         49
4         61
...      ...
5164     161
5165      37
5166      57
5167     125
5168      26

[5169 rows x 1 columns]


In [149]:
#Features Columns Transformer
preparation = ColumnTransformer(transformers = [
    ('data_num', transformer_num, column_num)
])

In [150]:
#Création du modèle
model = DecisionTreeClassifier()

pipe_model = Pipeline(steps=[
    ('preparation', preparation),
    ('model', model)])

set_config(display = 'diagram')
pipe_model