In [1]:
import numpy as np
import pandas as pd

from pathlib import Path


In [3]:
# Locate the dataset relative to the notebook's current working directory.
current_path = Path.cwd()
print(current_path)
# Join path segments with pathlib's `/` operator rather than concatenating
# strings, so the separator is always correct for the running platform.
data_path = current_path / 'data' / 'smsspamcollection.tsv'
# The file is tab-separated; read_csv accepts a path-like object directly.
df = pd.read_csv(data_path, sep='\t')
df.head()

/usr/src/app/notebooks


Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Data check: build a boolean mask of the dataset (True marks a null value, False a present one)
df.isna()

Unnamed: 0,label,message,length,punct
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [5]:
# Count the null values per column; a count of 0 means the column has no null data
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [None]:
# Accessing a column: indexing the DataFrame with a single column name
# returns that column, which is reused below instead of being looked up twice.
label_column = df['label']
labels = label_column.unique()  # distinct values found in the label column
values_info = label_column.value_counts()  # number of rows per distinct value
print(labels)
print(values_info)

In [None]:
import matplotlib.pyplot as plt

# Compare the distribution of the `length` column for ham and spam messages.
# Bin edges grow geometrically (1.15**0 ... 1.15**49) to suit the logarithmic x axis.
bins = 1.15 ** np.arange(0, 50)

fig, ax = plt.subplots()
ax.set_xscale('log')
for label_name in ('ham', 'spam'):
    lengths = df.loc[df['label'] == label_name, 'length']
    # `label=` ties each legend entry to its own histogram instead of
    # relying on the order in which the histograms were drawn.
    ax.hist(lengths, bins=bins, alpha=0.8, label=label_name)
# Labels and a title so the figure can be read on its own.
ax.set_xlabel('length')
ax.set_ylabel('number of messages')
ax.set_title('Message length by label')
ax.legend()
plt.show()

In [None]:
# Prepare the inputs for a simple ML model: numeric features in X, target labels in y
from sklearn.model_selection import train_test_split

feature_columns = ['length', 'punct']
X = df[feature_columns]  # feature matrix
y = df['label']  # target labels

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split
for features in (X_train, X_test):
    print(features.head())

In [None]:
# Fit a logistic-regression classifier on the training split
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(solver="lbfgs")
lr_model.fit(X=X_train, y=y_train)

In [None]:
# With the model fitted, predict a label for every row of the held-out test features
from sklearn import metrics

predictions = lr_model.predict(X=X_test)
print(predictions)

In [None]:
# Confusion matrix: the real labels (y_test) set against the model's predictions
metrics.confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# Present the confusion matrix as a labelled table (rows: true class,
# columns: predicted class).
# The result gets its own name so the dataset held in `df` is not overwritten,
# which would break re-running any earlier cell that reads `df`.
# Passing `labels=` pins the class order so it always matches the axis names.
class_names = ['ham', 'spam']
confusion_df = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=class_names),
    index=class_names,
    columns=class_names,
)
confusion_df

In [None]:
# Summarise test-set performance: the classification report, then the overall accuracy
report = metrics.classification_report(y_test, predictions)
print(report)
accuracy = metrics.accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")