# Modelli di machine learning sui testi

In [1]:
import pandas as pd
import nltk

In [2]:
# http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [3]:
df = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', names=["label", "text"])

In [4]:
print(df.head())

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
print(df.groupby('label').describe())

                                                           text
label                                                          
ham   count                                                4825
      unique                                               4516
      top                                Sorry, I'll call later
      freq                                                   30
spam  count                                                 747
      unique                                                653
      top     Please call our customer service representativ...
      freq                                                    4


In [6]:
def low_text(input):
    return "".join([i.lower() for i in input])

In [7]:
df.text = df.text.map(lambda x: low_text(x))

In [8]:
print(df.head())

  label                                               text
0   ham  go until jurong point, crazy.. available only ...
1   ham                      ok lar... joking wif u oni...
2  spam  free entry in 2 a wkly comp to win fa cup fina...
3   ham  u dun say so early hor... u c already then say...
4   ham  nah i don't think he goes to usf, he lives aro...


In [9]:
sw = set(nltk.corpus.stopwords.words('english'))

In [10]:
print(sw)

{'our', 'herself', 'didn', 'i', 'd', 're', 'isn', 'wouldn', 'these', 'more', 'such', 'be', 'm', 'mustn', 'shouldn', 'a', 'your', 'now', 'can', 'above', 'through', 'o', 't', 'those', 'should', 'been', 'y', 'it', 'they', 'don', 'ours', 'are', 'about', 'or', 'before', 'their', 'below', 'only', 'him', 'were', 'did', 'down', 'into', 'has', 'from', 'most', 'at', 'up', 'off', 'shan', 'doesn', 'theirs', 'while', 'her', 'is', 'same', 'being', 'having', 'out', 'on', 'themselves', 'needn', 'its', 'each', 'myself', 'ourselves', 'this', 'very', 'over', 'all', 'hers', 'by', 'who', 'against', 'himself', 'few', 'do', 'the', 'just', 'have', 'am', 'won', 'his', 'some', 've', 'my', 'ma', 'hasn', 'he', 's', 'when', 'what', 'with', 'in', 'itself', 'than', 'as', 'an', 'wasn', 'own', 'then', 'after', 'not', 'other', 'does', 'them', 'further', 'where', 'she', 'there', 'to', 'll', 'of', 'and', 'if', 'yourself', 'aren', 'you', 'me', 'ain', 'doing', 'why', 'haven', 'any', 'once', 'until', 'mightn', 'weren', 'had

In [11]:
df.shape

(5572, 2)

In [12]:
cl = {'ham': 1, 'spam': 0}
df['label'] = df['label'].map(cl)

In [13]:
print(df.head())

   label                                               text
0      1  go until jurong point, crazy.. available only ...
1      1                      ok lar... joking wif u oni...
2      0  free entry in 2 a wkly comp to win fa cup fina...
3      1  u dun say so early hor... u c already then say...
4      1  nah i don't think he goes to usf, he lives aro...


In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 5572):
    text = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv = CountVectorizer(max_features = 2000)
x = cv.fit_transform(corpus).toarray()
cl = df['label'].values

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, cl, test_size = 0.3, random_state = 12345)

In [22]:
# Regressione Logistica

In [23]:
from sklearn.linear_model.logistic import LogisticRegression

In [24]:
lr = LogisticRegression()

In [25]:
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
lr_pred = lr.predict(x_test)

In [27]:
from sklearn.metrics import confusion_matrix, classification_report

In [28]:
print(confusion_matrix(y_test, lr_pred))

[[ 216   25]
 [   9 1422]]


In [29]:
print(classification_report(y_test, lr_pred))

             precision    recall  f1-score   support

          0       0.96      0.90      0.93       241
          1       0.98      0.99      0.99      1431

avg / total       0.98      0.98      0.98      1672



In [30]:
# Support Vector Machines

In [31]:
from sklearn.linear_model import SGDClassifier

In [32]:
from sklearn.metrics import accuracy_score

In [33]:
clf = SGDClassifier()

In [34]:
clf.fit(x_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [35]:
clf_pred = clf.predict(x_test)

In [36]:
print(confusion_matrix(y_test, clf_pred))

[[ 220   21]
 [  12 1419]]


In [37]:
print(classification_report(y_test, clf_pred))

             precision    recall  f1-score   support

          0       0.95      0.91      0.93       241
          1       0.99      0.99      0.99      1431

avg / total       0.98      0.98      0.98      1672



In [38]:
# Naive Bayes

In [39]:
from sklearn.naive_bayes import GaussianNB

In [40]:
nb = GaussianNB()

In [41]:
nb.fit(x_train, y_train)

GaussianNB(priors=None)

In [42]:
nb_pred = nb.predict(x_test)

In [43]:
print(confusion_matrix(y_test, nb_pred))

[[ 212   29]
 [ 235 1196]]


In [44]:
print(classification_report(y_test, nb_pred))

             precision    recall  f1-score   support

          0       0.47      0.88      0.62       241
          1       0.98      0.84      0.90      1431

avg / total       0.90      0.84      0.86      1672



In [45]:
from sklearn.naive_bayes import MultinomialNB

In [46]:
nb2 = MultinomialNB()

In [47]:
nb2.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
nb2_pred = nb2.predict(x_test)

In [49]:
print(confusion_matrix(y_test, nb2_pred))

[[ 228   13]
 [  23 1408]]


In [50]:
print(classification_report(y_test, nb2_pred))

             precision    recall  f1-score   support

          0       0.91      0.95      0.93       241
          1       0.99      0.98      0.99      1431

avg / total       0.98      0.98      0.98      1672



In [51]:
# Alberi di decisione

In [52]:
from sklearn.tree import DecisionTreeClassifier

In [53]:
dt = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=2)

In [54]:
dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [55]:
dt_pred = dt.predict(x_test)

In [56]:
print(confusion_matrix(y_test, dt_pred))

[[ 161   80]
 [  59 1372]]


In [57]:
print(classification_report(y_test, dt_pred))

             precision    recall  f1-score   support

          0       0.73      0.67      0.70       241
          1       0.94      0.96      0.95      1431

avg / total       0.91      0.92      0.92      1672

