In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

## Libraries and Filesystem Setup

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import numpy as np
import scipy

[nltk_data] Downloading package punkt to /Users/dkarsann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dkarsann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### (already have imported / processed dataframe and stopwords)

### load stopword lists from MastersProjectStopwords.ipynb

In [5]:
common_stopwords = pickle.load(open("../../processed_files/stopwords/common_stopwords.pickle", "rb"))
corpus_stopwords = pickle.load(open("../../processed_files/stopwords/corpus_stopwords.pickle", "rb"))
combined_stopwords = pickle.load(open("../../processed_files/stopwords/combined_stopwords.pickle", "rb"))
corpus_bigrams = pickle.load(open("../../processed_files/stopwords/corpus_bigrams.pickle", "rb"))
corpus_trigrams = pickle.load(open("../../processed_files/stopwords/corpus_trigrams.pickle", "rb"))

## Parsing and Processing Content

In [6]:
df_en = pd.read_csv('../../processed_files/only_en.csv')
df_en.head()

Unnamed: 0,DOMAIN,RAW_CONTENT,LANGUAGE
0,x10-hk.com,automation@home » » | | quick find categories ...,en
1,hupshenghware.com,captcha powered by imunify360 english hupsheng...,en
2,soponyonosnack.com,soponyonosnack.com currencies: rupiah language...,en
3,theromanticvineyard.com,wine train (blogroll) | the romantic vineyard ...,en
4,eopticians.co.uk,"brands, base curve (bc): 8.4, base curve (bc):...",en


## Merging duplicate domain

In [7]:
df_merged = pd.DataFrame(df_en.groupby('DOMAIN')['RAW_CONTENT'].agg('sum')).reset_index()
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT
0,0-my.com,0-my.com related searches: related searches:
1,00bitz.com,error. page cannot be displayed. please contac...
2,01187.com,01187.com is available for purchase! - wwwv1.c...
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons..."
4,03h.org,february | 2014 | online marketing review sear...


In [8]:
number_domain = df_merged['DOMAIN'].nunique()
number_domain

12975

In [9]:
df_merged.shape

(12975, 2)

In [10]:
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT
0,0-my.com,0-my.com related searches: related searches:
1,00bitz.com,error. page cannot be displayed. please contac...
2,01187.com,01187.com is available for purchase! - wwwv1.c...
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons..."
4,03h.org,february | 2014 | online marketing review sear...


## TEXT Processing

In [11]:
#ini_string = "Geeks123for127geeks"
#  
# printing initial ini_string
#print("initial string : ", ini_string)
#  
# using translate and digits
# to remove numeric digits from string
#remove_digits = str.maketrans('', '', digits)
#res = ini_string.translate(remove_digits)
#res

In [12]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()
# tokenizer = RegexpTokenizer(r'\w+')
#tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english')) 
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from string import digits
from IPython.display import display
import string

In [13]:
remove_digits = str.maketrans('', '', digits)
remove_punctuation = str.maketrans('', '', string.punctuation)

In [14]:
def process(content):
    
    #--------remove domain name--------#
    #processed = row['RAW_CONTENT'].replace(row['DOMAIN'], '')
    
    #--------no digits--------#
    processed = content.translate(remove_digits)
    #print('remove digits step')
    #print(processed)
    
    #--------remove punction--------#
    processed = processed.translate(remove_punctuation)
    
    #--------lower case--------#
    processed = processed.lower()
    #print('lowercase step')
    #print(processed)
    
    #--------remove trigrams--------#
    for trigram in corpus_trigrams:
        processed = processed.replace(trigram.lower(), '')
    #print('trigram step')
    #print(processed)
        
    #--------remove bigrams--------#
    for bigram in corpus_bigrams:
        processed = processed.replace(bigram.lower(), '') 
    #print('bigram step')
    #print(processed)
    
    #--------tokenize--------#
    #print('tokenize step')
    processed = tokenizer.tokenize(processed)
    #print('tokenize step after')
    #print(processed)
    
    #--------remove stopwords--------#
    processed = [token for token in processed if token not in stop_words]
    #print('stopwords step')
    #print(processed)
    
    return processed

In [15]:
def stem_processed(content):
    
    #--------stem tokens--------#
    processed = [ps.stem(token) for token in content]
    #print('stem step')
    #print(processed)
    
    return processed

In [16]:
def dummy(content):
    return content

In [17]:
#df_merged['RAW_CONTENT_PARSED'] = df_merged['RAW_CONTENT'].apply(lambda row: row.translate(remove_digits))
#df_merged.head()

In [18]:
#process(df_merged['RAW_CONTENT'].iloc[1])

In [19]:
df_merged['RAW_CONTENT_PROCESSED'] = df_merged['RAW_CONTENT'].apply(lambda row: process(row))
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT,RAW_CONTENT_PROCESSED
0,0-my.com,0-my.com related searches: related searches:,"[mycom, related, searches, related, searches]"
1,00bitz.com,error. page cannot be displayed. please contac...,"[error, page, cannot, displayed, please, conta..."
2,01187.com,01187.com is available for purchase! - wwwv1.c...,"[com, vailable, purchase, wwwvcom, welcome, uu..."
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons...","[ir, dnsir, ir, hits, construction, coming, so..."
4,03h.org,february | 2014 | online marketing review sear...,"[february, online, marketing, review, search, ..."


In [20]:
df_merged['RAW_CONTENT_PROCESSED_STEMMED'] = df_merged['RAW_CONTENT_PROCESSED'].apply(lambda row: stem_processed(row))
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT,RAW_CONTENT_PROCESSED,RAW_CONTENT_PROCESSED_STEMMED
0,0-my.com,0-my.com related searches: related searches:,"[mycom, related, searches, related, searches]","[mycom, relat, search, relat, search]"
1,00bitz.com,error. page cannot be displayed. please contac...,"[error, page, cannot, displayed, please, conta...","[error, page, cannot, display, pleas, contact,..."
2,01187.com,01187.com is available for purchase! - wwwv1.c...,"[com, vailable, purchase, wwwvcom, welcome, uu...","[com, vailabl, purchas, wwwvcom, welcom, uunic..."
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons...","[ir, dnsir, ir, hits, construction, coming, so...","[ir, dnsir, ir, hit, construct, come, soon, ho..."
4,03h.org,february | 2014 | online marketing review sear...,"[february, online, marketing, review, search, ...","[februari, onlin, market, review, search, rece..."


In [21]:
with open('../../processed_files/df_merged.pickle', 'wb') as file:
    pickle.dump(df_merged, file)

In [22]:
#df_merged.head().style.set_properties(subset=['RAW_CONTENT'], **{'width-min': '100px'})

## BOW - count vectorizer

In [23]:
%%time
#vect = CountVectorizer(tokenizer=process)
vect = CountVectorizer(tokenizer = dummy, preprocessor = dummy)
#corpus = df_merged['RAW_CONTENT'].tolist()

CPU times: user 84 µs, sys: 750 µs, total: 834 µs
Wall time: 1.53 ms


In [24]:
# not stemmed
corpus = df_merged['RAW_CONTENT_PROCESSED'].tolist()
BOW = vect.fit_transform(corpus)

In [25]:
# stemmed
corpus_stemmed = df_merged['RAW_CONTENT_PROCESSED_STEMMED'].tolist()
BOW_stemmed = vect.fit_transform(corpus_stemmed)

In [26]:
len(df_merged)

12975

In [27]:
type(BOW)

scipy.sparse.csr.csr_matrix

In [26]:
bow_pd = pd.DataFrame(BOW.toarray())

In [27]:
bow_stemmed_pd = pd.DataFrame(BOW_stemmed.toarray())

In [28]:
#bow_mat = np.matrix(BOW.tolist())

In [29]:
BOW.shape

(12975, 310919)

In [30]:
type(BOW)

scipy.sparse.csr.csr_matrix

In [32]:
#bow_stemmed_mat = np.matrix(bow_pd.tolist())

In [33]:
BOW_stemmed.shape

(12975, 273168)

In [34]:
type(BOW_stemmed)

scipy.sparse.csr.csr_matrix

In [35]:
##np.save('../../processed_files/bow_matrix.npy', bow_pd)

In [36]:
##np.save('../../processed_files/bow_stemmed_matrix.npy', bow_stemmed_pd)

In [28]:
scipy.sparse.save_npz('../../processed_files/bow/bow_matrix.npz', BOW)

In [29]:
scipy.sparse.save_npz('../../processed_files/bow/stemmed_bow_matrix.npz', BOW_stemmed)

In [27]:
#with open('../../processed_files/bow.pickle', 'wb') as file:
#    pickle.dump(bow_mat, file)

In [28]:
#with open('../../processed_files/bow_stemmed.pickle', 'wb') as file:
#    pickle.dump(bow_stemmed_mat, file)

## Generating LDA Matrices

In [29]:
#from sklearn.decomposition import LatentDirichletAllocation

In [30]:
#%%time
#lda_10 = LatentDirichletAllocation(n_components = 10, random_state = 0)
#LDA_10_mat = lda_10.fit_transform(BOW_stemmed)

In [31]:
#%%time
#lda_25 = LatentDirichletAllocation(n_components = 25, random_state = 0)
#LDA_25_mat = lda_25.fit_transform(BOW_stemmed)

In [32]:
#%%time
#lda_50 = LatentDirichletAllocation(n_components = 50, random_state = 0)
#LDA_50_mat = lda_50.fit_transform(BOW_stemmed)

In [33]:
#print(LDA_10_mat.shape)
#print(LDA_25_mat.shape)
#print(LDA_50_mat.shape)

In [34]:
#LDA_10_mat[0]

In [35]:
#LDA_25_mat[0]

In [36]:
#LDA_50_mat[0]

In [166]:
#lda_10 = LatentDirichletAllocation(n_components=10, random_state=0)

In [167]:
#%%time
#LDA_mat_10 = lda_10.fit_transform(BOW)

In [168]:
#LDA_mat_10.shape

In [169]:
#LDA_mat_10[0]

In [170]:
#with open('../../processed_files/LDA_10_matrix.pickle', 'wb') as file:
#    pickle.dump(LDA_10_mat, file)

In [171]:
#with open('../../processed_files/LDA_25_matrix.pickle', 'wb') as file:
#    pickle.dump(LDA_25_mat, file)

In [172]:
#with open('../../processed_files/LDA_50_matrix.pickle', 'wb') as file:
#    pickle.dump(LDA_50_mat, file)

## Generating TFIDF Matrix

In [1]:
#if self.count_vec is None:
#    cv = CountVec()
#    self.count_vec = cv.countvec(self.processed_tokens)
#tfidf_transformer = TfidfTransformer()
#tf_idf_mat = tfidf_transformer.fit_transform(self.count_vec)
#return tf_idf_mat 