In [1]:
%matplotlib inline

import pandas as pd

# Load the contest training set and preview its first five rows
# (head() defaults to n=5).
train_csv_path = './contest1_data/train.csv'
df = pd.read_csv(train_csv_path)
print(df.head())

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [4]:
import re
import bleach
from bs4 import BeautifulSoup
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')
# English stop words as returned by NLTK; tokenizer_stem_nostop() drops any
# token found in this collection.
stop = stopwords.words('english')
print(stop)
# Candidate corpus-specific stop words, currently disabled:
# stop.append('image')
# stop.append('data')
# stop.append('see')
# stop.append('also')
# stop.append('video')


def clean(text):
    """Extract and normalise the text of one raw HTML page.

    The page is parsed with BeautifulSoup and the text of the elements with
    class ``article-topics``, ``article-content`` and ``title`` is collected
    in that order.  Emoticons are pulled out of the text, the remainder is
    lower-cased with every run of non-word characters collapsed to a single
    space, and the emoticons (with any ``-`` nose removed) are appended at
    the end, separated by spaces.

    Parameters
    ----------
    text : str
        Raw HTML of a page.

    Returns
    -------
    str
        The cleaned text, a space, then the space-separated emoticons.
        A section that is missing from the page contributes no text.
    """
    # remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')

    def section_text(css_class):
        # find() returns None when no element carries the class; treat a
        # missing section as empty instead of raising AttributeError.
        node = soup.find(class_=css_class)
        return node.get_text() if node is not None else ''

    # Join with a space so the last word of one section cannot fuse with the
    # first word of the next one.
    text = ' '.join(section_text(css_class)
                    for css_class in ('article-topics', 'article-content', 'title'))

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    # (raw string: '\)' and '\(' are not valid str escape sequences)
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    return re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')

def tokenizer_stem_nostop(text):
    """Tokenise on whitespace, drop stop words, and Porter-stem the rest.

    Parameters
    ----------
    text : str
        Text to tokenise; leading and trailing whitespace is ignored.

    Returns
    -------
    list of str
        Stemmed tokens.  A token is kept only if it is not in the
        module-level ``stop`` collection and starts with an ASCII letter
        (``re.match`` anchors at the start of the token only).
    """
    porter = PorterStemmer()
    # `stop` is a list, so testing membership against it directly rescans the
    # whole list for every token; a set makes each lookup constant-time.
    stop_words = set(stop)
    # Raw strings: '\s' is not a valid str escape sequence.
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop_words and re.match(r'[a-zA-Z]+', w)]

def preprocessor(doc=None, top=10):
    """Fit a TF-IDF model on the pages and print its most notable terms.

    A unigram ``TfidfVectorizer`` is built with ``clean`` as preprocessor and
    ``tokenizer_stem_nostop`` as tokenizer, fitted on ``doc``, and two
    rankings are printed: the ``top`` terms with the smallest idf score and
    the ``top`` terms with the highest TF-IDF weight summed over all
    documents.

    Parameters
    ----------
    doc : iterable of str, optional
        Raw pages to analyse.  Defaults to ``df['Page content']``.
    top : int, optional
        Number of terms printed in each ranking (default 10).  Fewer are
        printed when the vocabulary is smaller than ``top``.

    Returns
    -------
    None
        The rankings are printed, nothing is returned.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    if doc is None:
        doc = df['Page content']

    tfidf = TfidfVectorizer(ngram_range=(1, 1),
                            preprocessor=clean,
                            tokenizer=tokenizer_stem_nostop)

    # fit_transform() cleans and tokenises the corpus once; a separate fit()
    # followed by transform() would do that expensive work twice.
    doc_tfidf = tfidf.fit_transform(doc)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.  Fetch once rather
    # than rebuilding the vocabulary list on every loop iteration.
    if hasattr(tfidf, 'get_feature_names_out'):
        vocab = np.asarray(tfidf.get_feature_names_out())
    else:
        vocab = np.asarray(tfidf.get_feature_names())

    # get idf score of vocabularies
    idf = tfidf.idf_
    print('[vocabularies with smallest idf scores]')
    # Slicing (instead of range(top)) stays in bounds for small vocabularies.
    for i in idf.argsort()[:top]:
        print('%s: %.2f' % (vocab[i], idf[i]))

    # Sum per-term weights on the sparse matrix; densifying it with
    # toarray() needs n_documents * n_terms floats of memory.
    tfidf_sum = np.asarray(doc_tfidf.sum(axis=0)).ravel()
    print("\n[vocabularies with highest tf-idf scores]")
    for i in tfidf_sum.argsort()[::-1][:top]:
        print('{}: {}'.format(vocab[i], tfidf_sum[i]))
    
# preprocess
# for i in range(5):
# preprocessor()
#     Page_content = preprocessor(df.loc[i,'Page content'])
#     df.loc[i,'Page content'] = Page_content
#     df.loc[i,'tags'] = tags
#     df.loc[i,'title'] = title
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

# Baseline: TF-IDF features fed to logistic regression, scored by 10-fold
# cross-validated ROC AUC on a fixed random sample of 1000 pages.
df_small = df.sample(n=1000, random_state=0)
print(df_small['Page content'])

pipe3 = Pipeline([
    ('vect', TfidfVectorizer(preprocessor=clean, tokenizer=tokenizer_stem_nostop)),
    ('clf', LogisticRegression(solver="liblinear")),
])
# Alternative classifier that was tried:
# KNeighborsClassifier(n_neighbors=11, p=2, metric='minkowski')

scores = cross_val_score(
    estimator=pipe3,
    X=df_small['Page content'],
    y=df_small['Popularity'],
    cv=10,
    scoring='roc_auc',
)

print('AUC: %.3f (+/-%.3f)' % (scores.mean(), scores.std()))
    
    

[nltk_data] Downloading package stopwords to /home/hhliao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Preview the first five rows again; the printed rows match the preview made
# right after loading.
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [6]:
def tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        The whitespace-separated tokens, in order.  Empty or
        whitespace-only input yields an empty list.
    """
    # str.split() with no argument splits on whitespace runs and ignores
    # leading/trailing whitespace; unlike re.split on a stripped string it
    # returns [] rather than [''] when there is nothing to split.
    return text.split()

# Demo: plain whitespace tokenisation leaves inflected forms ('runners',
# 'running', 'run') as separate tokens and keeps words such as 'and'.
print(tokenizer('runners like running and thus they run'))

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
