In [3]:
"""
-Most ML algos can't input raw text; instead we must extract features to pass numerical input
-count vectorization creates a Document Term Matrix counting term appearances per document
 -For a large corpus, we'll have a sparse Document Term Matrix with many 0s
-an alternative to count vectorization is TF-IDF, Term Frequency Inverse Document Frequency
 -term frequency tf(t,d) is the raw count of term occurrences in document
 -Inverse Doc Frequency factor decreases weight of frequent terms, increases weight of rare terms
 -It is the logarithmically scaled inverse fraction of documents containing word, reflecting the
 word's importance to a document in a corpus
 -logarithm of (# of docs in corpus)/(# of docs containing target term)
 TF-IDF = term freq * inverse doc freq = term freq * log(N / doc freq)
 TF-IDF = tf(t,d)*idf(t,D)
 idf(t,D) = log(N/(|{d \in D: t \in d}|))
 -in 2015, 83% of text-based recommender systems in digital libraries used TF-IDF
 -A simple ranking function can sum the tf-idf for each query term; complex ranking functions vary 
 on this theme
 -weighting factor benefits user modeling by mining text for information retrieval
"""
#1. create a corpus
"""
%%writefile 1.txt
this is a story about cats
our feline pets
cats are furry animals

%%writefile 2.txt
this is a story about surfing
catching waves is fun
surfing is a popular water sport
"""

'\n%%writefile 1.txt\nthis is a story about cats\nour feline pets\ncats are furry animals\n\n%%writefile 2.txt\nthis is a story about surfing\ncatching waves is fun\nsurfing is a popular water sport\n'

In [4]:
#1. Vocabulary Creation
# Read 1.txt as a flat list of lower-cased, whitespace-separated tokens.
with open('../1.txt') as f:
    x = f.read().lower().split()

# dict.fromkeys drops repeated tokens while keeping first-seen order, so
# enumerate hands out ids 1, 2, 3, ... in order of first appearance.
vocab = {word: idx for idx, word in enumerate(dict.fromkeys(x), start=1)}

# i is the next unused id; the following cell keeps counting from it.
i = len(vocab) + 1

print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}


In [6]:
# Extend the vocabulary with 2.txt: words already present keep their id,
# unseen words get the next ids from the running counter i.
with open('../2.txt') as f:
    x = f.read().lower().split()

# distinct tokens of this file that are not in the vocab yet, in first-seen order
unseen = [word for word in dict.fromkeys(x) if word not in vocab]

# assign each unseen word a unique id in the vocab, then advance the counter
vocab.update(zip(unseen, range(i, i + len(unseen))))
i += len(unseen)
print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catching': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}


In [11]:
#FEATURE EXTRACTION: turn each document into a vector of per-word counts
def fill_term_counts(vector, path, vocab):
    """Add the term frequencies of one text file to a count vector, in place.

    vector -- list in which position vocab[word] holds the running count for
              that word (the callers below keep the file name at position 0)
    path   -- text file to read; its content is lower-cased and split on
              whitespace, the same tokenization used to build the vocab
    vocab  -- dict mapping each word to its position in `vector`

    Returns the same `vector` object that was passed in.
    Raises KeyError if the file contains a word that is not in `vocab`.
    """
    with open(path) as f:
        for word in f.read().lower().split():
            vector[vocab[word]] += 1
    return vector


#start by creating an empty vector: the file name, then one zero per vocab word
one = ['1.txt']+[0]*len(vocab)
print("initialized vector one: \n", one)
#map frequencies of each word in 1.txt to the vector
fill_term_counts(one, '../1.txt', vocab)
#most words appear once, "cats" appears twice
print("\n filled vector one: \n", one)

two = ['2.txt']+[0]*len(vocab)
print("initialized vector two: \n", two)
#map frequencies of each word in 2.txt to the vector
fill_term_counts(two, '../2.txt', vocab)
print("\n filled vector two: \n", two)

initialized vector one: 
 ['1.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

 filled vector one: 
 ['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
initialized vector two: 
 ['2.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

 filled vector two: 
 ['2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]


In [20]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Load the tab-separated SMS collection and take a first look at it:
# first rows, missing values per column, and how many rows each label has.
df = pd.read_csv('../smsspamcollection copy.tsv', sep='\t')
print("Set: \n", df.head())
print("\n Null check: \n", df.isnull().sum())
print("\n value counts: \n", df['label'].value_counts())

# By convention X is capitalized and y is lowercase: X stands for the
# (larger) feature matrix, y for the 1-d array of targets.
X, y = df['message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# fit_transform is the convenience form of the two steps you will almost
# always run together:
#   count_vect.fit(X_train)                          -> build vocab, count words
#   X_train_counts = count_vect.transform(X_train)   -> text messages to vectors
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Printing X_train_counts itself is not useful (it is a huge sparse matrix);
# X_train.shape would give just the number of data rows.
print("\n training data rows & vocab size:", X_train_counts.shape)


Set: 
   label                                            message  length  punct
0   ham  Go until jurong point, crazy.. Available only ...     111      9
1   ham                      Ok lar... Joking wif u oni...      29      6
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155      6
3   ham  U dun say so early hor... U c already then say...      49      6
4   ham  Nah I don't think he goes to usf, he lives aro...      61      2

 Null check: 
 label      0
message    0
length     0
punct      0
dtype: int64

 value counts: 
 ham     4825
spam     747
Name: label, dtype: int64

 training data rows & vocab size: (3733, 7082)


In [25]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC

# Route 1: apply tf-idf weighting to the count matrix built earlier.
tfidf_t = TfidfTransformer()
X_train_tfidf = tfidf_t.fit_transform(X_train_counts)
print("X train tfidf shape:", X_train_tfidf.shape)

# Route 2: the built-in TfidfVectorizer does the counting and the tf-idf
# weighting in one object, starting from the raw messages. Its result
# replaces the X_train_tfidf computed above.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Train a linear SVM on the tf-idf features. The call is the cell's last
# expression, so the fitted estimator is what the cell displays.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

X train tfidf shape: (3733, 7082)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [32]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# A Pipeline bundles the vectorizer and the classifier into one estimator,
# so raw training text can be vectorized and classified in a single step.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
print("Pipeline guts: \n", text_clf.fit(X_train, y_train))

# Score the fitted pipeline on the held-out messages.
predictions = text_clf.predict(X_test)
print("\n Confusion matrix: \n", confusion_matrix(y_test, predictions))
print("\n Classification Report: \n", classification_report(y_test, predictions))

# Overall accuracy is the cell's displayed value.
# NOTE(review): the original note expected this to score much better than a
# "previous model"; no such baseline is evaluated in the cells visible here.
metrics.accuracy_score(y_test, predictions)

Pipeline guts: 
 Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

 Confusion matrix: 
 [[1586    7]
 [  12  234]]

 Classification Report: 
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



0.989668297988037

In [34]:
#now, use the trained model to make predictions on NEW input
# predict() takes a list of documents, so each sample message is wrapped in a list.
ham_text = 'Hi how are you?'
spam_text = 'Congrats! You won. Text WON to 48950 congratulations free entry'

print("first classification(ham): \n", text_clf.predict([ham_text]))
print("second classification(spam): \n", text_clf.predict([spam_text]))

first classification(ham): 
 ['ham']
second classification(spam): 
 ['spam']
