In [82]:
!pip install -U imbalanced-learn



In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [6]:
corpus = ["This is a brown house. This house is big. The street number is 1.",
          "This is a small house. This house has 1 bedroom. The street number is 12.",
          "This dog is brown. This dog likes to play.",
          "The dog is in the bedroom."]
     

In [10]:
# Binary Term Frequency - captures presence (1) or absence (0) of a term in a documnent.

tv = TfidfVectorizer(binary=True, norm=None, use_idf=False, smooth_idf=False, lowercase=True, stop_words='english', 
                     token_pattern=r'(?u)\b[A-Za-z]+\b', 
                     min_df=1, 
                     max_df=1.0, 
                     max_features=None, 
                     ngram_range=(1, 1))

In [12]:
df = pd.DataFrame(tv.fit_transform(corpus).toarray(),columns=tv.get_feature_names_out())

In [14]:
df

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Bag-of-words Term Frequency

tv = TfidfVectorizer(binary=False, norm=None, use_idf=False, smooth_idf=False, stop_words='english', lowercase=True, ngram_range=(1,1), max_features=None, max_df=1.0, min_df=1,token_pattern=r'(?u)\b[A-Za-z]+\b')


In [20]:
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())

In [22]:
df

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Normalized Term Frequency 

#normalizirano - l1 
tv = TfidfVectorizer(binary=False, stop_words='english', max_features=None, norm='l1', use_idf=False, max_df=1.0 , min_df=0.1, token_pattern=r'(?u)\b[A-Za-z]+\b', ngram_range=(1,1))

In [26]:
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())

In [28]:
df

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.166667,0.166667,0.0,0.333333,0.0,0.166667,0.0,0.0,0.166667
1,0.166667,0.0,0.0,0.0,0.333333,0.0,0.166667,0.0,0.166667,0.166667
2,0.0,0.0,0.2,0.4,0.0,0.2,0.0,0.2,0.0,0.0
3,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Normalized TF-IDF 

#normalization l2
tv = TfidfVectorizer(binary=False, stop_words='english', max_features=None, norm='l2',smooth_idf=False, lowercase=True, use_idf=False , max_df=1.0 , min_df=0.1, token_pattern=r'(?u)\b[A-Za-z]+\b', ngram_range=(1,1))

In [38]:
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())

In [40]:
df

Unnamed: 0,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.353553,0.353553,0.0,0.707107,0.0,0.353553,0.0,0.0,0.353553
1,0.353553,0.0,0.0,0.0,0.707107,0.0,0.353553,0.0,0.353553,0.353553
2,0.0,0.0,0.377964,0.755929,0.0,0.377964,0.0,0.377964,0.0,0.0
3,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Text Classification

In [63]:
!gdown 1rmX4GzVy9kKzwPjtaC0WYR34iYmb7Beu

Downloading...
From: https://drive.google.com/uc?id=1rmX4GzVy9kKzwPjtaC0WYR34iYmb7Beu
To: /Users/davidhristov/Desktop/Intro to Data Science/DataScience/Auditory/Aud10-NLP /SPAM text message 20170820 - Data.csv
100%|█████████████████████████████████████████| 486k/486k [00:04<00:00, 116kB/s]


In [42]:
df = pd.read_csv('datasets/SPAM text message 20170820 - Data.csv')

In [44]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [46]:
from sklearn.model_selection import train_test_split

x = df['Message']
y = df['Category']

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [54]:
from collections import Counter

print(f"Training class distributions summary: {Counter(y_train)}")
print(f"Test class distributions summary: {Counter(y_test)}")

Training class distributions summary: Counter({'ham': 3850, 'spam': 607})
Test class distributions summary: Counter({'ham': 975, 'spam': 140})


In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


model = make_pipeline(TfidfVectorizer(), MultinomialNB())

model.fit(x_train, y_train)

In [64]:
y_pred = model.predict(x_test)

In [84]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       975
        spam       1.00      0.71      0.83       140

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115

