In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated dataset and preview its first rows.
df = pd.read_csv('./resources/smsspamcollection.tsv', sep='\t')
print(df.head())

# Model input is the 'message' column; the target is the 'label' column.
X, y = df['message'], df['label']

  label                                            message  length  punct
0   ham  Go until jurong point, crazy.. Available only ...     111      9
1   ham                      Ok lar... Joking wif u oni...      29      6
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155      6
3   ham  U dun say so early hor... U c already then say...      49      6
4   ham  Nah I don't think he goes to usf, he lives aro...      61      2


In [None]:
# The following is an example of using scikit-learn's CountVectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit the vectorizer on the training messages only and build their
# document-term count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
print(X_train_counts)

  (0, 4814)	1
  (0, 3201)	1
  (0, 2948)	1
  (0, 6866)	1
  (0, 4628)	1
  (0, 3774)	1
  (0, 3786)	1
  (0, 4937)	1
  (0, 3620)	1
  (0, 4089)	1
  (0, 3666)	1
  (0, 961)	1
  (0, 3534)	1
  (1, 3620)	1
  (1, 7756)	1
  (1, 7507)	1
  (1, 1052)	1
  (1, 2995)	1
  (1, 1630)	1
  (1, 3163)	1
  (1, 5026)	1
  (1, 4701)	1
  (1, 820)	1
  (1, 2608)	1
  (1, 4721)	1
  :	:
  (4454, 376)	1
  (4454, 655)	1
  (4454, 4630)	1
  (4454, 2125)	1
  (4454, 110)	1
  (4454, 351)	1
  (4455, 3620)	1
  (4455, 3666)	1
  (4455, 1630)	1
  (4455, 7460)	2
  (4455, 4684)	1
  (4455, 3536)	1
  (4455, 3181)	1
  (4455, 4447)	1
  (4455, 5932)	1
  (4455, 2500)	1
  (4456, 3137)	1
  (4456, 2194)	1
  (4456, 6862)	1
  (4456, 6993)	1
  (4456, 6099)	1
  (4456, 2806)	1
  (4456, 3794)	1
  (4456, 4554)	1
  (4456, 6272)	1


In [12]:
# TfidfVectorizer is the combination of CountVectorizer and TfidfTransformer.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Fit the TF-IDF vectorizer on the training messages, then reuse the fitted
# vectorizer (transform only) on the test messages.
tf_vec = TfidfVectorizer()
X_train_idf = tf_vec.fit_transform(X_train)
X_test_idf = tf_vec.transform(X_test)

# Train a linear SVM on the training vectors and predict the held-out labels.
model = LinearSVC()
model.fit(X_train_idf, y_train)
y_predict = model.predict(X_test_idf)


In [13]:
from sklearn import metrics

# Print, in order: the confusion matrix, the classification report and the
# accuracy score for the test-set predictions.
for score_fn in (
    metrics.confusion_matrix,
    metrics.classification_report,
    metrics.accuracy_score,
):
    print(score_fn(y_test, y_predict))

[[954   1]
 [  9 151]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       955
        spam       0.99      0.94      0.97       160

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

0.9910313901345291


In [None]:
# Pipeline version, set up to combine multiple features (only the TF-IDF text branch is currently enabled)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

# Define a custom transformer that selects column(s) from a dataframe by key
class ItemSelector(TransformerMixin, BaseEstimator):
    """Transformer that picks a subset of a dataframe by column name(s).

    ``keys`` is stored as given and later used directly as the indexer in
    ``dataframe[keys]``.
    """

    def __init__(self, keys):
        self.keys = keys

    def fit(self, x, y=None):
        """Nothing is learned from the data; return this transformer unchanged."""
        return self

    def transform(self, dataframe):
        """Return ``dataframe[self.keys]``."""
        selected = dataframe[self.keys]
        return selected
        
# Define a custom transformer to extract other features (if any)
class OtherFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract selected columns of a dataframe as a plain array of values.

    Parameters
    ----------
    columns : sequence of str, default=('length', 'punct')
        Names of the columns to extract. The default reproduces the column
        selection that was previously hard-coded in ``transform``.
    """

    def __init__(self, columns=('length', 'punct')):
        # Stored unmodified so the value passed in is the value reported back
        # as a constructor parameter; it is only converted to a list at the
        # point of use.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` via ``.values``."""
        # A list indexer selects several columns at once (a tuple would be
        # interpreted as a single key by dataframe indexing).
        return X[list(self.columns)].values

# Define the pipeline
# Text branch: select the 'message' column, then vectorize it with TF-IDF.
bag_of_words_branch = Pipeline([
    ('selector', ItemSelector(keys='message')),
    ('vectorizer', TfidfVectorizer()),
])

# Only the text branch is active. A second branch is kept here, disabled:
#   ('votes', Pipeline([
#       ('selector', ItemSelector(keys=['length', 'punct'])),
#       ('other_features', OtherFeatureExtractor()),
#   ]))
# together with its transformer weight, 'votes': 0.1.
feature_union = FeatureUnion(
    transformer_list=[('bag-of-words', bag_of_words_branch)],
    transformer_weights={'bag-of-words': 1.0},
)

# Full model: the feature union followed by the classifier.
pipeline = Pipeline([
    ('features', feature_union),
    ('classifier', LinearSVC()),
])
from sklearn import metrics

# Reload the dataset, keeping the text column together with the
# 'length' and 'punct' columns as model input.
df = pd.read_csv('./resources/smsspamcollection.tsv', sep='\t')
X, y = df[['message', 'length', 'punct']], df['label']

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the pipeline on the training rows, then predict the held-out rows.
y_predict = pipeline.fit(X_train, y_train).predict(X_test)

# Print, in order: the confusion matrix, the classification report and the
# accuracy score for the test-set predictions.
for score_fn in (
    metrics.confusion_matrix,
    metrics.classification_report,
    metrics.accuracy_score,
):
    print(score_fn(y_test, y_predict))


[[954   1]
 [  9 151]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       955
        spam       0.99      0.94      0.97       160

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

0.9910313901345291
