In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

## Libraries and Filesystem Setup

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import numpy as np

[nltk_data] Downloading package punkt to /Users/dkarsann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dkarsann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### (already have imported / processed dataframe and stopwords)

### load stopword lists from MastersProjectStopwords.ipynb

In [4]:
common_stopwords = pickle.load(open("../processed_files/common_stopwords.pickle", "rb"))
corpus_stopwords = pickle.load(open("../processed_files/corpus_stopwords.pickle", "rb"))
combined_stopwords = pickle.load(open("../processed_files/combined_stopwords.pickle", "rb"))
corpus_bigrams = pickle.load(open("../processed_files/corpus_bigrams.pickle", "rb"))
corpus_trigrams = pickle.load(open("../processed_files/corpus_trigrams.pickle", "rb"))

## Parsing and Processing Content

In [5]:
df_en = pd.read_csv('../processed_files/only_en.csv')
df_en.head()

Unnamed: 0,DOMAIN,RAW_CONTENT,LANGUAGE
0,x10-hk.com,automation@home » » | | quick find categories ...,en
1,hupshenghware.com,captcha powered by imunify360 english hupsheng...,en
2,soponyonosnack.com,soponyonosnack.com currencies: rupiah language...,en
3,theromanticvineyard.com,wine train (blogroll) | the romantic vineyard ...,en
4,eopticians.co.uk,"brands, base curve (bc): 8.4, base curve (bc):...",en


## Merging duplicate domain

In [6]:
df_merged = pd.DataFrame(df_en.groupby('DOMAIN')['RAW_CONTENT'].agg('sum')).reset_index()
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT
0,0-my.com,0-my.com related searches: related searches:
1,00bitz.com,error. page cannot be displayed. please contac...
2,01187.com,01187.com is available for purchase! - wwwv1.c...
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons..."
4,03h.org,february | 2014 | online marketing review sear...


In [7]:
number_domain = df_merged['DOMAIN'].nunique()
number_domain

12968

In [8]:
df_merged.shape

(12968, 2)

In [9]:
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT
0,0-my.com,0-my.com related searches: related searches:
1,00bitz.com,error. page cannot be displayed. please contac...
2,01187.com,01187.com is available for purchase! - wwwv1.c...
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons..."
4,03h.org,february | 2014 | online marketing review sear...


## TEXT Processing

In [10]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english')) 
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from string import digits
from IPython.display import display
import string

In [11]:
remove_digits = str.maketrans('', '', digits)
remove_punctuation = str.maketrans('', '', string.punctuation)

In [12]:
def process(content):
    
    #--------remove domain name--------#
    #processed = row['RAW_CONTENT'].replace(row['DOMAIN'], '')
    
    #--------no digits--------#
    processed = content.translate(remove_digits)
    #print('remove digits step')
    #print(processed)
    
    #--------remove punction--------#
    processed = processed.translate(remove_punctuation)
    
    #--------lower case--------#
    processed = processed.lower()
    #print('lowercase step')
    #print(processed)
    
    #--------remove trigrams--------#
    for trigram in corpus_trigrams:
        processed = processed.replace(trigram.lower(), '')
    #print('trigram step')
    #print(processed)
        
    #--------remove bigrams--------#
    for bigram in corpus_bigrams:
        processed = processed.replace(bigram.lower(), '') 
    #print('bigram step')
    #print(processed)
    
    #--------tokenize--------#
    #print('tokenize step')
    processed = tokenizer.tokenize(processed)
    #print('tokenize step after')
    #print(processed)
    
    #--------remove stopwords--------#
    processed = [token for token in processed if token not in stop_words]
    #print('stopwords step')
    #print(processed)
    
    return processed

In [13]:
def stem_processed(content):
    
    #--------stem tokens--------#
    processed = [ps.stem(token) for token in content]
    #print('stem step')
    #print(processed)
    
    return processed

In [14]:
df_merged['RAW_CONTENT_PROCESSED'] = df_merged['RAW_CONTENT'].apply(lambda row: process(row))
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT,RAW_CONTENT_PROCESSED
0,0-my.com,0-my.com related searches: related searches:,"[mycom, related, searches, related, searches]"
1,00bitz.com,error. page cannot be displayed. please contac...,"[error, page, cannot, displayed, please, conta..."
2,01187.com,01187.com is available for purchase! - wwwv1.c...,"[com, vailable, purchase, wwwvcom, welcome, uu..."
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons...","[ir, dnsir, ir, hits, construction, coming, so..."
4,03h.org,february | 2014 | online marketing review sear...,"[february, online, marketing, review, search, ..."


In [18]:
df_merged['RAW_CONTENT_PROCESSED_STEMMED'] = df_merged['RAW_CONTENT_PROCESSED'].apply(lambda row: stem_processed(row))
df_merged.head()

Unnamed: 0,DOMAIN,RAW_CONTENT,RAW_CONTENT_PROCESSED,RAW_CONTENT_PROCESSED_STEMMED
0,0-my.com,0-my.com related searches: related searches:,"[mycom, related, searches, related, searches]","[mycom, relat, search, relat, search]"
1,00bitz.com,error. page cannot be displayed. please contac...,"[error, page, cannot, displayed, please, conta...","[error, page, cannot, display, pleas, contact,..."
2,01187.com,01187.com is available for purchase! - wwwv1.c...,"[com, vailable, purchase, wwwvcom, welcome, uu...","[com, vailabl, purchas, wwwvcom, welcom, uunic..."
3,027.ir,"027.ir - dns4.ir 027.ir hits: 7,267 under cons...","[ir, dnsir, ir, hits, construction, coming, so...","[ir, dnsir, ir, hit, construct, come, soon, ho..."
4,03h.org,february | 2014 | online marketing review sear...,"[february, online, marketing, review, search, ...","[februari, onlin, market, review, search, rece..."


In [19]:
with open('../processed_files/df_processed.pickle', 'wb') as file:
    pickle.dump(df_merged, file)