In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bs4 import BeautifulSoup
import unicodedata

In [2]:
# Load the raw records from the JSON file into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_json('../data/data.json')

In [3]:
from scrubbington import scrub_everything

In [4]:
# Columns used in this analysis; passed to scrub_everything and then used to
# subset the frame it returns.
feature_names = ['fraud_no_fraud', 'description']

In [5]:
# scrub_everything is a project helper (module `scrubbington`); its three return
# values are unpacked here. Only df_test is consumed by the visible cells.
# NOTE(review): y and X are presumably the target vector and feature matrix —
# confirm against scrubbington before relying on them.
df_test, y, X = scrub_everything(df, feature_names)

In [6]:
# Restrict the scrubbed frame to the columns listed in feature_names.
df_desc = df_test[feature_names]

In [7]:
# Preview the first rows: the descriptions are raw HTML fragments.
df_desc.head()

Unnamed: 0,fraud_no_fraud,description
0,True,"<p><a href=""http://s432.photobucket.com/albums..."
1,False,"<p>Join us for a quick, one-night, community-b..."
2,False,"<h3><span class=""subcategory""><strong>Teacher ..."
3,False,"<p style=""margin-bottom: 1.3em; padding-bottom..."
4,False,<p>Writers and filmmakers need to understand t...


In [8]:
# The description column on its own (raw HTML strings, per the preview above).
contents = df_desc['description']   

In [9]:
# Sanity check: selecting a single column yields a pandas Series.
type(contents)

pandas.core.series.Series

In [10]:
# Module-level helpers for text processing; tokenize() reads the first three.

# A token is any run of word characters (\w: letters, digits, underscore)
# and apostrophes.
tokenizer = RegexpTokenizer(r"[\w']+")

# Bound method of a Porter stemmer: stem(word) -> stemmed word.
stem = PorterStemmer().stem

# NLTK's English stop words, held in a set for fast membership tests.
stop_set = set(stopwords.words('english'))

# NOTE: despite its name, `normalize` is a *string*, not a function. It is the
# second description (contents[1]) after NFKD decomposition with every
# non-ASCII character dropped. Nothing in the visible cells applies this
# normalization to the documents that are vectorized.
normalize = (
    unicodedata.normalize('NFKD', contents[1])
    .encode('ASCII', 'ignore')
    .decode('utf-8')
)

In [11]:
def tokenize(text):
    """Split ``text`` into tokens, drop English stop words, and stem the rest.

    Uses the notebook-level ``tokenizer`` (regexp tokenizer), ``stop_set``
    (NLTK English stop words) and ``stem`` (Porter stemmer).

    Parameters
    ----------
    text : str
        Document text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order, with stop words removed.
    """
    tokens = tokenizer.tokenize(text)
    # The NLTK stop-word list is all lower-case, so compare case-insensitively;
    # otherwise "The" or "AND" slip through when this is called on raw text.
    # TfidfVectorizer lower-cases input before calling a custom tokenizer by
    # default, so vectorizer results are unaffected by this change.
    return [stem(token) for token in tokens if token.lower() not in stop_set]

In [12]:
#normalize = (unicodedata.normalize('NFKD',stems).encode('ASCII', 'ignore').decode('utf-8'))

In [13]:
#filtered = [contents[i] for i in range(len(contents)) ]

In [14]:
#docs = [tokenize(filtered[i]) for i in range(len(filtered))]

In [15]:
# vectorizer_model = TfidfVectorizer(tokenizer=tokenize, max_features=5000)
# vectorizer_model.fit(contents)
# vocabulary = np.array(vectorizer_model.get_feature_names())

In [16]:
# model.transform(X) returns a scipy sparse matrix; .toarray() densifies it.
def vectorizer(X, model=None):
    """Transform documents with a fitted vectorizer and return a dense array.

    Parameters
    ----------
    X
        Documents to transform, in whatever form ``model.transform`` accepts.
    model : optional
        Fitted vectorizer exposing ``transform(X)`` whose result supports
        ``toarray()`` (e.g. a fitted ``TfidfVectorizer``). When omitted, the
        notebook-level ``vectorizer_model`` is used, as in the original
        one-argument call.

    Returns
    -------
    The dense array produced by ``model.transform(X).toarray()``.

    Raises
    ------
    NameError
        If ``model`` is omitted and no global ``vectorizer_model`` exists.
    """
    if model is None:
        # The only cell that creates `vectorizer_model` is currently commented
        # out, so this fallback fails unless that cell is re-enabled. Prefer
        # passing the fitted vectorizer explicitly.
        model = vectorizer_model
    return model.transform(X).toarray()

In [17]:
# X is the tf-idf ndarray
# X = vectorizer(contents)

## Cleaning up html by using BeautifulSoup

In [18]:
# Single-document trial: parse the first description with the 'html.parser' backend.
soup = BeautifulSoup(contents[0], 'html.parser')

In [19]:
# Extract the document's text content, leaving the HTML markup behind.
clean = soup.get_text()

In [20]:
# Strip the HTML markup from every description.
len_contents = len(contents)
# Iterate over the Series *values* rather than looking up contents[i]:
# contents[i] is a label lookup and raises KeyError as soon as the index is
# not exactly 0..n-1 (e.g. after rows have been filtered out).
L_soup = [BeautifulSoup(html, 'html.parser').get_text() for html in contents]

In [21]:
# Wrap the cleaned strings in a Series (this gets a fresh 0..n-1 index).
soup_contents = pd.Series(L_soup)
# Sanity check on the resulting type.
type(soup_contents)

pandas.core.series.Series

## Using fit_transform method

In [22]:
# TF-IDF model over the HTML-stripped descriptions, using the custom tokenize()
# function and capping the vocabulary at 5000 terms.
tfidfvect = TfidfVectorizer(tokenizer=tokenize, max_features=5000)
# Learn the vocabulary/IDF weights and build the document-term matrix in one step.
tfidf_vectorized = tfidfvect.fit_transform(soup_contents)

In [23]:
# Number of documents (rows) in the TF-IDF matrix. Read it from the sparse
# matrix's own shape: calling .toarray() first would materialise the entire
# dense documents x vocabulary array just to count its rows.
l1 = tfidf_vectorized.shape[0]
l1

14337

In [24]:
# Vocabulary learned by the vectorizer, as a NumPy array of strings.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it (available from 1.0). Use the new method when present and fall
# back to the old one so the cell runs on either side of that change.
# dtype=str keeps the unicode array dtype the old call produced.
words_tfidf = np.array(
    tfidfvect.get_feature_names_out()
    if hasattr(tfidfvect, 'get_feature_names_out')
    else tfidfvect.get_feature_names(),
    dtype=str,
)
words_tfidf

array(["'", "'http", "'s", ..., 'être', 'ø', 'ü'], dtype='<U17')

## Cosine Similarity using TFIDF

In [39]:
# Pairwise cosine similarity between *all* documents (an n_docs x n_docs matrix).
# TfidfVectorizer L2-normalises each row by default (norm='l2' is not overridden
# above), so the plain dot product computed by linear_kernel equals the cosine
# similarity; the diagonal of ~1.0 in the output is consistent with that.
from sklearn.metrics.pairwise import linear_kernel

cosine_similarities = linear_kernel(tfidf_vectorized, tfidf_vectorized)
# Show only the top-left 5x5 corner of the matrix.
cosine_similarities[:5, :5]

array([[1.        , 0.07462183, 0.03437887, 0.02655797, 0.03772806],
       [0.07462183, 1.        , 0.05199325, 0.02755338, 0.04859924],
       [0.03437887, 0.05199325, 1.        , 0.07222794, 0.02427998],
       [0.02655797, 0.02755338, 0.07222794, 1.        , 0.04052318],
       [0.03772806, 0.04859924, 0.02427998, 0.04052318, 1.        ]])

In [26]:
# Print the similarity of every (i, j) pair among the first 5 documents only.
# The matrix has one row and one column per document, so printing all pairs
# would flood the output. (An earlier draft looped over a `docs` list, which
# is no longer built in this notebook.)

for i in range(5):
    for j in range(5):
        print(i, j, cosine_similarities[i, j])

0 0 1.0
0 1 0.0746218260960477
0 2 0.03437887384058543
0 3 0.02655796830475224
0 4 0.03772805931100954
1 0 0.0746218260960477
1 1 1.0
1 2 0.05199325337240666
1 3 0.027553382846540207
1 4 0.0485992354069345
2 0 0.03437887384058543
2 1 0.05199325337240666
2 2 0.9999999999999998
2 3 0.0722279364166268
2 4 0.024279976834080425
3 0 0.02655796830475224
3 1 0.027553382846540207
3 2 0.0722279364166268
3 3 1.0
3 4 0.04052317826682281
4 0 0.03772805931100954
4 1 0.0485992354069345
4 2 0.024279976834080425
4 3 0.04052317826682281
4 4 0.9999999999999992


In [42]:
# Sum of all pairwise similarities (diagonal self-similarities included).
# ndarray.sum() performs the reduction inside NumPy; the built-in
# sum(sum(...)) adds the rows one by one in a Python loop instead. The result
# can differ from the old form in the last floating-point digits.
cosine_similarities.sum()

5102318.386350261

### Cosine Similarities for Fraud only

In [27]:
# Keep only the rows flagged as fraud; the original row labels are preserved.
# `== True` is an element-wise comparison producing a boolean mask.
df_desc_true = df_desc.loc[df_desc['fraud_no_fraud'] == True]

In [28]:
# Preview the fraud rows. The index keeps the original, non-contiguous row
# labels (0, 26, 51, ...), and the row labelled 70 shows an empty description.
df_desc_true.head()

Unnamed: 0,fraud_no_fraud,description
0,True,"<p><a href=""http://s432.photobucket.com/albums..."
26,True,"<h1 class=""post"">Welcome</h1>\r\n<p> </p>\r\n<..."
51,True,"<p><span style=""font-size: x-large;"">Okay ever..."
54,True,"<h3><strong><strong><span style=""font-family: ..."
70,True,


In [29]:
# Descriptions of the fraud rows only (still indexed by the original row labels).
contents_fraud = df_desc_true['description']  

In [30]:
# Strip the HTML markup from every fraud description.
len_contents2 = len(contents_fraud)
# Series.iteritems() was removed in pandas 2.0, and the index labels were never
# used here, so iterate over the values directly. This also stays correct for
# the non-contiguous index that the fraud filter leaves behind.
L_soup_true = [
    BeautifulSoup(html, 'html.parser').get_text() for html in contents_fraud
]

In [31]:
# Wrap the cleaned fraud descriptions in a Series. It gets a fresh 0..n-1 index,
# so positions here no longer match the row labels of df_desc_true.
soup_contents2 = pd.Series(L_soup_true)
# Sanity check on the resulting type.
type(soup_contents2)

pandas.core.series.Series

In [32]:
# A second, independent TF-IDF model fitted on the fraud descriptions only.
# It learns its own vocabulary and IDF weights, so its columns do not line up
# with those of the all-documents vectorizer.
tfidfvect2 = TfidfVectorizer(tokenizer=tokenize, max_features=5000)
tfidf_vectorized2 = tfidfvect2.fit_transform(soup_contents2)

In [33]:
# Vocabulary learned by the fraud-only vectorizer, as a NumPy array of strings.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it (available from 1.0). Use the new method when present and fall
# back to the old one so the cell runs on either side of that change.
# dtype=str keeps the unicode array dtype the old call produced.
words_tfidf2 = np.array(
    tfidfvect2.get_feature_names_out()
    if hasattr(tfidfvect2, 'get_feature_names_out')
    else tfidfvect2.get_feature_names(),
    dtype=str,
)
words_tfidf2

array(["'", "'0'", "'60", ..., 'để', 'định', 'œuvr'], dtype='<U34')

In [34]:
# Number of fraud documents (rows) in the TF-IDF matrix. Read it from the
# sparse matrix's own shape instead of densifying the matrix with .toarray()
# just to count its rows.
l2 = tfidf_vectorized2.shape[0]
l2

1293

In [38]:
# Pairwise cosine similarity between *all* fraud documents. As with the full
# corpus, the TF-IDF rows are L2-normalised by default, so linear_kernel's dot
# product is the cosine similarity.
# NOTE(review): a row of all zeros, including a 0 on the diagonal (see index 4
# in the output), means that document has an all-zero TF-IDF vector —
# presumably the empty description seen in the fraud preview; confirm.
from sklearn.metrics.pairwise import linear_kernel

cosine_similarities2 = linear_kernel(tfidf_vectorized2, tfidf_vectorized2)
# Show only the top-left 5x5 corner of the matrix.
cosine_similarities2[:5, :5]

array([[1.        , 0.01794965, 0.12737963, 0.02500359, 0.        ],
       [0.01794965, 1.        , 0.        , 0.04186673, 0.        ],
       [0.12737963, 0.        , 1.        , 0.00576628, 0.        ],
       [0.02500359, 0.04186673, 0.00576628, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [41]:
# Sum of all pairwise fraud-document similarities (diagonal included).
# ndarray.sum() performs the reduction inside NumPy; the built-in
# sum(sum(...)) adds the rows one by one in a Python loop instead. The result
# can differ from the old form in the last floating-point digits.
cosine_similarities2.sum()

21111.964808366436