### Importing essential libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

#### Load Dataset

In [4]:
# Load the spam dataset, keeping only the label (v1) and message (v2) columns.
# Selecting the wanted columns by name (rather than excluding a hard-coded list
# of "Unnamed: N" filler columns) fails fast if the file layout ever changes,
# instead of silently letting unexpected extra columns through.
# The latin-1 encoding is carried over from the original load call.
df = pd.read_csv("spam.csv", encoding="latin-1", usecols=["v1", "v2"])

In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Rename the columns

In [6]:
# Replace the raw column names with descriptive ones:
# v1 carries the ham/spam class, v2 the message body.
df = df.rename(
    columns=dict(v1="label", v2="text"),
)

In [7]:
# Confirm the rename by previewing the first five rows again.
df.head(n=5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Word Counts with CountVectorizer

In [9]:
# Token-count vectorizer with scikit-learn's default settings (no arguments
# are overridden here). It is only constructed in this cell; fitting happens
# further down, after the train/test split.
vect = CountVectorizer()

#### Splitting the data into training and test sets

In [10]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible across runs.
# NOTE(review): the classes are imbalanced (the test split below has 965 ham
# vs 150 spam) -- consider passing stratify=df["label"] to preserve the ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=10,
)

#### Fitting the CountVectorizer using the training data

In [11]:
# Learn the vocabulary from the training messages only, so nothing from the
# held-out test messages influences the feature space.
vect.fit(raw_documents=X_train)

CountVectorizer()

#### Transforming the text into count vectors

In [12]:
# Encode both splits with the vocabulary fitted on the training data
# (train first, then test -- same order as before).
X_train_df, X_test_df = (vect.transform(split) for split in (X_train, X_test))

# Show the container type of the encoded training matrix.
type(X_train_df)

scipy.sparse._csr.csr_matrix

#### Create Model 

In [13]:
# Build a logistic-regression classifier with default hyperparameters and
# train it on the vectorized training messages. fit() returns the estimator
# itself, so the construction and training can be chained.
model = LogisticRegression().fit(X=X_train_df, y=y_train)

# Echo the fitted estimator as the cell output.
model

LogisticRegression()

#### Making predictions

In [14]:
# Predictions are stored per model name; for now there is a single entry
# holding the logistic-regression labels for the test split.
prediction = {"Logistic": model.predict(X_test_df)}

#### Evaluate Model

In [15]:
# Fraction of test messages whose predicted label matches the true label.
# Keyword arguments make the true/predicted order explicit.
accuracy_score(y_true=y_test, y_pred=prediction["Logistic"])

0.97847533632287

In [16]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
# The report is a pre-formatted multi-line string, so it is printed rather
# than left as the cell's last expression.
report = classification_report(y_true=y_test, y_pred=prediction["Logistic"])
print(report)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.98      0.86      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

