In [2]:
from sklearn.metrics import accuracy_score, mean_squared_error, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree

In [3]:
%matplotlib inline
import pandas as pd

df = pd.read_csv('C:/Users/stat-pc/Desktop/深度學習/train.csv')
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [4]:
testdata = pd.read_csv('C:/Users/stat-pc/Desktop/深度學習/test.csv')
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [5]:
import re
from bs4 import BeautifulSoup

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    #r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    #emoticons = re.findall(r, text)
    #text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    #text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    text = re.sub('[\W]+', ' ', text.lower()) + ' '
    return text

## Preprocessing: Stop-Word Removal

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(tokenizer_stem_nostop('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to C:\Users\stat-
[nltk_data]     pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## BoW (Bag-Of-Words)

In [16]:
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer

doc_dummy = ["Study hard, then you will be happy and I will be happy", 
           "\"I'm not happy :(\" \", because you don't study hard"]
print('[example documents]\n{}\n'.format('\n'.join(doc_dummy)))

# ngram_range=(min,max), default: 1-gram => (1,1)
count = CountVectorizer(ngram_range=(1, 1),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)

count.fit([df.loc[0,'Page content']])
# dictionary is stored in vocabulary_
BoW = count.vocabulary_
print('[vocabulary]\n{}'.format(BoW))


[example documents]
Study hard, then you will be happy and I will be happy
"I'm not happy :(" ", because you don't study hard

[vocabulary]
{'clara': 33, 'moskowitz': 129, 'space': 181, 'com': 35, 'utc': 209, 'nasa': 131, 'grand': 88, 'challeng': 29, 'stop': 188, 'asteroid': 16, 'destroy': 55, 'earth': 65, 'may': 122, 'killer': 110, 'head': 90, 'decid': 49, 'someth': 180, 'agenc': 4, 'announc': 10, 'new': 132, 'june': 108, 'find': 77, 'danger': 46, 'rock': 163, 'figur': 76, 'planet': 147, 'mission': 124, 'build': 22, 'project': 151, 'alreadi': 6, 'underway': 207, 'includ': 100, 'plan': 146, 'captur': 25, 'pull': 158, 'toward': 204, 'moon': 128, 'send': 172, 'astronaut': 17, 'visit': 213, 'part': 142, 'issu': 106, 'request': 160, 'inform': 102, 'today': 200, 'aim': 5, 'solicit': 178, 'idea': 96, 'industri': 101, 'academia': 1, 'public': 156, 'improv': 99, 'ask': 14, 'think': 196, 'concept': 37, 'differ': 58, 'approach': 12, 'describ': 53, 'william': 219, 'gerstenmai': 84, 'associ': 15, 



In [24]:
# ngram_range=(min,max), default: 1-gram => (1,1)
count = CountVectorizer(ngram_range=(1, 1),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)

count.fit([df.loc[0:1,'Page content']])
# dictionary is stored in vocabulary_

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1,1),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)


## Feature Hashing

In [18]:
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import HashingVectorizer

print('[example documents]\n{}\n'.format('\n'.join(doc_dummy)))

# hash words to 1024 buckets
hashvec = HashingVectorizer(n_features=2**10,
                            preprocessor=preprocessor,
                            tokenizer=tokenizer_stem_nostop)


[example documents]
Study hard, then you will be happy and I will be happy
"I'm not happy :(" ", because you don't study hard



In [9]:
import re
from bs4 import BeautifulSoup
soup = BeautifulSoup(df.loc[0,'Page content'], 'html.parser')
soup
print(soup.prettify())

<html>
 <head>
  <div class="article-info">
   <span class="byline basic">
    Clara Moskowitz
   </span>
   for
   <a href="/publishers/space-com/">
    Space.com
   </a>
   <time datetime="Wed, 19 Jun 2013 15:04:30 +0000">
    2013-06-19 15:04:30 UTC
   </time>
  </div>
 </head>
 <body>
  <h1 class="title">
   NASA's Grand Challenge: Stop Asteroids From Destroying Earth
  </h1>
  <figure class="article-image">
   <img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg" data-micro="1" data-url="http://mashable.com/2013/06/19/nasa-grand-challenge-asteroid/" src="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg"/>
  </figure>
  <article data-channel="world">
   <section class="article-content">
    <p>
     There may be killer asteroids headed for Earth, and NASA has decided to do something about it. The space agency announced a 

In [61]:
# 所有的超連結
a_tags = soup.find_all('footer')
for tag in a_tags:
  # 輸出超連結的文字
  print(tag.string)

None


In [76]:

for link in soup.footer('class'):
    print(link.get('href'))

/category/asteroid/
/category/asteroids/
/category/challenge/
/category/earth/
/category/space/
/category/us/
/category/world/


In [91]:
print(soup.h1)

<h1 class="title">NASA's Grand Challenge: Stop Asteroids From Destroying Earth</h1>


# Topics

In [78]:
for i in range(len(soup.footer('a'))):
    print(soup.footer('a')[i].string)

Asteroid
Asteroids
challenge
Earth
Space
U.S.
World


# Time

In [70]:
print(soup.head.time.string)

<footer class="article-topics"> Topics: <a href="/category/asteroid/">Asteroid</a>, <a href="/category/asteroids/">Asteroids</a>, <a href="/category/challenge/">challenge</a>, <a href="/category/earth/">Earth</a>, <a href="/category/space/">Space</a>, <a href="/category/us/">U.S.</a>, <a href="/category/world/">World</a> </footer>
2013-06-19 15:04:30 UTC


# Figure

In [80]:
soup.figure

<figure class="article-image"><img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg" data-micro="1" data-url="http://mashable.com/2013/06/19/nasa-grand-challenge-asteroid/" src="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg"/></figure>

In [84]:
soup.img

<img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg" data-micro="1" data-url="http://mashable.com/2013/06/19/nasa-grand-challenge-asteroid/" src="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg"/>

In [57]:
soup.find_all('p')

[<p>There may be killer asteroids headed for Earth, and NASA has decided to do something about it. The space agency announced a new "Grand Challenge" on June 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet.</p>,
 <p>The new mission builds on projects already underway at NASA, including a plan to <a href="http://www.space.com/20591-nasa-asteroid-capture-mission-feasibility.html" target="_blank">capture an asteroid</a>, pull it in toward the moon and send astronauts to visit it. As part of the Grand Challenge, the agency issued a "request for information" today — aiming to solicit ideas from industry, academia and the public on how to improve the asteroid mission plan.</p>,
 <p>"We're asking for you to think about concepts and different approaches for what we've described here," William Gerstenmaier, NASA's associate administrator for human explorations and operations, said yesterday during a NASA event announcing the initiative. "We want y

In [58]:
soup.find(id="link3")

In [59]:
print(soup.get_text())

 Clara Moskowitz for Space.com 2013-06-19 15:04:30 UTC NASA's Grand Challenge: Stop Asteroids From Destroying Earth There may be killer asteroids headed for Earth, and NASA has decided to do something about it. The space agency announced a new "Grand Challenge" on June 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet. The new mission builds on projects already underway at NASA, including a plan to capture an asteroid, pull it in toward the moon and send astronauts to visit it. As part of the Grand Challenge, the agency issued a "request for information" today — aiming to solicit ideas from industry, academia and the public on how to improve the asteroid mission plan. "We're asking for you to think about concepts and different approaches for what we've described here," William Gerstenmaier, NASA's associate administrator for human explorations and operations, said yesterday during a NASA event announcing the initiative. "We want you to thin