In [57]:
%matplotlib inline
import pandas as pd

df = pd.read_csv('./data/comp1/train.csv')
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [58]:
import re
from bs4 import BeautifulSoup

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [59]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(tokenizer_stem_nostop('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
df_new = pd.read_csv('./data/comp1/train_new_feature.csv')
print(df_new.head(5))

   Unnamed: 0  Id  Popularity            author                 time  \
0           0   0          -1               NaN  2013-06-19 15:04:30   
1           1   1           1  Christina Warren  2013-03-28 17:40:55   
2           2   2           1         Sam Laird  2014-05-07 19:15:20   
3           3   3          -1         Sam Laird  2013-10-11 02:26:50   
4           4   4          -1   Connor Finnegan  2014-04-17 03:31:43   

                                               topic  image num  video num  \
0  ['Asteroid', 'Asteroids', 'challenge', 'Earth'...          1          0   
1  ['Apps and Software', 'Google', 'open source',...          2          0   
2  ['Entertainment', 'NFL', 'NFL Draft', 'Sports'...          2         25   
3       ['Sports', 'Video', 'Videos', 'Watercooler']          1         21   
4  ['Entertainment', 'instagram', 'instagram vide...         52          1   

                                                body  week of day  year  \
0  NASA's Grand Challen

In [61]:
from scipy.sparse import hstack, vstack
import scipy.sparse
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import HashingVectorizer


def preprocess_new_feature(df_new,sample_size):
    ## body
    doc_body = df_new['body'].iloc[:sample_size]
    hashvec = HashingVectorizer(n_features=2**10,
                                preprocessor=preprocessor,
                                tokenizer=tokenizer_stem_nostop)
    doc_hash = hashvec.transform(doc_body)
    print("body",doc_hash.shape,type(doc_hash))

    ## topic
    ###########
    tmp = df_new['topic'].iloc[:sample_size]
    doc_topic = []
    for i in tmp:
        doc_topic.append(i.replace('[', '').replace(']', '').replace(',', '').replace('\'', ''))
    hashvec = HashingVectorizer(n_features=2**10,
                                preprocessor=preprocessor,
                                tokenizer=tokenizer_stem_nostop)
    doc_hash = hstack([doc_hash,hashvec.transform(doc_topic)])
    ###########
    ###########
    # tmp = df_new['topic'].iloc[:sample_size]
    # topic_set = set()

    # for i in tmp:
    #     #print(i)
    #     items = i.replace('[', '#').replace(']', '#').replace(',', '#').replace('\"', '#').replace('\'', '@').replace(' @', '')
    #     items = items.split('#')
    #     item_tmp = []
    #     for item in items:
    #         item_tmp.append(item.replace('@s', '$').replace('@', '').replace('$', '\'s'))
    #     #print(item_tmp)
    #     topic_set.update(item_tmp)

    # topic_list = list(topic_set)[1:] # [0] is empty: ''

    # doc_topic = []
    # for i in tmp:
    #     items = i.replace('[', '#').replace(']', '#').replace(',', '#').replace('\"', '#').replace('\'', '@').replace(' @', '')
    #     items = items.split('#')
    #     item_tmp = []
    #     for item in items:
    #         item_tmp.append(item.replace('@s', '$').replace('@', '').replace('$', '\'s'))
    #     topic_ids = []
    #     for tag in item_tmp:
    #         if tag != '':
    #             topic_ids.append(topic_list.index(tag))
    #     topic_ids.sort()
    #     doc_topic.append(topic_ids)

    # # print(topic_list)
    # pad = len(max(doc_topic, key=len))
    # doc_topic = np.array([i + [0]*(pad-len(i)) for i in doc_topic])
    # doc_hash = hstack([doc_hash,doc_topic])
    ###########
    print("topic",doc_hash.shape,type(doc_hash))

    # ## time
    # doc_time = df_new['time'].iloc[:100]
    # hashvec = HashingVectorizer(n_features=2**10,
    #                             preprocessor=preprocessor,
    #                             tokenizer=tokenizer_stem_nostop)
    # doc_hash = hstack([doc_hash,hashvec.transform(doc_time)])
    # print(doc_hash.shape,type(doc_hash))

    ## author
    tmp = df_new['author'].replace(np.nan, 'x').iloc[:sample_size]
    unique = list(set(tmp))
    doc_author = []
    for i in tmp:
        doc_author.append(unique.index(i))
    # print(doc_author)
    doc_author = np.array(doc_author)
    doc_hash = hstack([doc_hash,doc_author.reshape((sample_size,1))])
    print("author",doc_hash.shape,type(doc_hash))

    ## img
    doc_img = df_new['image num'].iloc[:sample_size].to_numpy()
    doc_hash =hstack([doc_hash,doc_img.reshape((sample_size,1))])
    print("img num",doc_hash.shape,type(doc_hash))

    ## video
    doc_video = df_new['video num'].iloc[:sample_size].to_numpy()
    doc_hash = hstack([doc_hash,doc_video.reshape((sample_size,1))])
    print("video num",doc_hash.shape,type(doc_hash))

    ## week of day
    doc_w = df_new['week of day'].iloc[:sample_size].to_numpy()
    doc_hash = hstack([doc_hash,doc_w.reshape((sample_size,1))])
    print("week of day",doc_hash.shape,type(doc_hash))

    ## weekend
    doc_w = df_new['week of day'].iloc[:sample_size].to_numpy()
    doc_weekend = []
    for i in doc_w:
        if i > 5:
            doc_weekend.append(1)
        else:
            doc_weekend.append(0)
    doc_weekend = np.array(doc_weekend)
    doc_hash = hstack([doc_hash,doc_weekend.reshape((sample_size,1))])
    print("weekend",doc_hash.shape,type(doc_hash))

    ## month
    doc_m = df_new['month'].iloc[:sample_size].to_numpy()
    doc_hash = hstack([doc_hash,doc_m.reshape((sample_size,1))])
    print("month",doc_hash.shape,type(doc_hash))

    ## hour
    doc_h = df_new['hour'].iloc[:sample_size].to_numpy()
    doc_hash = hstack([doc_hash,doc_h.reshape((sample_size,1))])
    print("hour",doc_hash.shape,type(doc_hash))

    ## year
    doc_y = df_new['year'].iloc[:sample_size].to_numpy()
    doc_hash = hstack([doc_hash,doc_y.reshape((sample_size,1))])
    print("year",doc_hash.shape,type(doc_hash))

    # print(doc_hash.todense())
    return doc_hash

In [62]:
sample_size = 27643 # 100 #
doc_hash = preprocess_new_feature(df_new,sample_size)
doc_y_label = df_new['Popularity'].iloc[:sample_size].to_numpy()
print(doc_y_label.shape)

body (27643, 1024) <class 'scipy.sparse.csr.csr_matrix'>
topic (27643, 2048) <class 'scipy.sparse.coo.coo_matrix'>
author (27643, 2049) <class 'scipy.sparse.coo.coo_matrix'>
img num (27643, 2050) <class 'scipy.sparse.coo.coo_matrix'>
video num (27643, 2051) <class 'scipy.sparse.coo.coo_matrix'>
week of day (27643, 2052) <class 'scipy.sparse.coo.coo_matrix'>
weekend (27643, 2053) <class 'scipy.sparse.coo.coo_matrix'>
month (27643, 2054) <class 'scipy.sparse.coo.coo_matrix'>
hour (27643, 2055) <class 'scipy.sparse.coo.coo_matrix'>
year (27643, 2056) <class 'scipy.sparse.coo.coo_matrix'>
(27643,)


In [None]:
import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV


# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10], 'probability': [True, True]}
# svc = svm.SVC()
# clf = GridSearchCV(svc, parameters, n_jobs=-1, cv=10, scoring='roc_auc')
rf = RandomForestClassifier()
parameters = {'n_estimators':[1,10,50,100]}
clf = GridSearchCV(rf, parameters, n_jobs=-1, cv=10, scoring='roc_auc')

clf.fit(doc_hash, doc_y_label)
print(clf.best_params_, "score: ", clf.best_score_)
print(clf.result_)

# scores = cross_val_score(estimator=RandomForestRegressor(),
#                          X=doc_hash, y=doc_y_label,
#                          cv=10, scoring='roc_auc')
# print('%.3f (+/-%.3f)' % (scores.mean(), scores.std()))

In [64]:
df_new_test = pd.read_csv('./data/comp1/test_feature.csv')
print(df_new_test.head(5),df_new_test.shape)

   Unnamed: 0     Id                                       Page content  \
0           0  27643  <html><head><div class="article-info"><span cl...   
1           1  27644  <html><head><div class="article-info"><span cl...   
2           2  27645  <html><head><div class="article-info"><span cl...   
3           3  27646  <html><head><div class="article-info"><span cl...   
4           4  27647  <html><head><div class="article-info"><span cl...   

           author                 time  \
0       Sam Laird  2013-09-09 19:47:02   
1  Stan Schroeder  2013-10-31 09:25:02   
2  Todd Wasserman  2013-06-25 12:54:54   
3    Neha Prakash  2013-02-13 03:30:21   
4     Josh Dickey  2014-10-03 01:34:54   

                                               topic  image num  video num  \
0  ['Entertainment', 'Music', 'One Direction', 's...          1          7   
1  ['Gadgets', 'glass', 'Google', 'Google Glass',...          3          0   
2  ['amazon', 'amazon kindle', 'Business', 'Gaming']          

In [98]:
doc_hash_test = preprocess_new_feature(df_new_test,sample_size=11847)

body (11847, 1024) <class 'scipy.sparse.csr.csr_matrix'>
topic (11847, 2048) <class 'scipy.sparse.coo.coo_matrix'>
author (11847, 2049) <class 'scipy.sparse.coo.coo_matrix'>
img num (11847, 2050) <class 'scipy.sparse.coo.coo_matrix'>
video num (11847, 2051) <class 'scipy.sparse.coo.coo_matrix'>
week of day (11847, 2052) <class 'scipy.sparse.coo.coo_matrix'>
weekend (11847, 2053) <class 'scipy.sparse.coo.coo_matrix'>
month (11847, 2054) <class 'scipy.sparse.coo.coo_matrix'>
hour (11847, 2055) <class 'scipy.sparse.coo.coo_matrix'>
year (11847, 2056) <class 'scipy.sparse.coo.coo_matrix'>


In [107]:
doc_hash_test_np = doc_hash_test.toarray()

for idx,i in enumerate(doc_hash_test_np):
    for idxx,j in enumerate(i):
        if np.isnan(j):
            print(idx,idxx,j)
            
doc_hash_test_np.data = np.nan_to_num(doc_hash_test_np.data)

for idx,i in enumerate(doc_hash_test_np):
    for idxx,j in enumerate(i):
        if np.isnan(j):
            print(idx,idxx,j)

1585 2051 nan
1585 2053 nan
1585 2054 nan
1585 2055 nan


  doc_hash_test_np.data = np.nan_to_num(doc_hash_test_np.data)


In [127]:
pred = clf.predict_proba(doc_hash_test_np) # predict the proba of -1 and 1
pred

array([[0.45, 0.55],
       [0.53, 0.47],
       [0.59, 0.41],
       ...,
       [0.5 , 0.5 ],
       [0.5 , 0.5 ],
       [0.58, 0.42]])

In [128]:
pred = [round(x[1],1) for x in pred]

In [129]:
df_all_pred = pd.concat([df_new_test['Id'],pd.DataFrame(pred)], axis=1).rename(columns={0: 'Popularity'})
print(f'df_all_pred {df_all_pred.shape}')
df_all_pred.head()

df_all_pred (11847, 2)


Unnamed: 0,Id,Popularity
0,27643,0.6
1,27644,0.5
2,27645,0.4
3,27646,0.6
4,27647,0.5


In [130]:
df_all_pred.to_csv('./data/comp1/pred.csv', index=None)

In [23]:
from scipy.sparse import hstack, vstack
import scipy.sparse
sample_size = 1000 #27643 #100

## body
doc_body = df_new['body'].iloc[:sample_size]
hashvec = HashingVectorizer(n_features=2**10,
                            preprocessor=preprocessor,
                            tokenizer=tokenizer_stem_nostop)
doc_hash = hashvec.transform(doc_body)
print("body",doc_hash.shape,type(doc_hash))

## topic
###########
# tmp = df_new['topic'].iloc[:sample_size]
# doc_topic = []
# for i in tmp:
#     doc_topic.append(i.replace('[', '').replace(']', '').replace(',', '').replace('\'', ''))
# hashvec = HashingVectorizer(n_features=2**10,
#                             preprocessor=preprocessor,
#                             tokenizer=tokenizer_stem_nostop)
# doc_hash = hstack([doc_hash,hashvec.transform(doc_topic)])
###########
###########
tmp = df_new['topic'].iloc[:sample_size]
topic_set = set()

for i in tmp:
    #print(i)
    items = i.replace('[', '#').replace(']', '#').replace(',', '#').replace('\"', '#').replace('\'', '@').replace(' @', '')
    items = items.split('#')
    item_tmp = []
    for item in items:
        item_tmp.append(item.replace('@s', '$').replace('@', '').replace('$', '\'s'))
    #print(item_tmp)
    topic_set.update(item_tmp)

topic_list = list(topic_set)[1:] # [0] is empty: ''

doc_topic = []
for i in tmp:
    items = i.replace('[', '#').replace(']', '#').replace(',', '#').replace('\"', '#').replace('\'', '@').replace(' @', '')
    items = items.split('#')
    item_tmp = []
    for item in items:
        item_tmp.append(item.replace('@s', '$').replace('@', '').replace('$', '\'s'))
    topic_ids = []
    for tag in item_tmp:
        if tag != '':
            topic_ids.append(topic_list.index(tag))
    topic_ids.sort()
    doc_topic.append(topic_ids)

# print(topic_list)
pad = len(max(doc_topic, key=len))
doc_topic = np.array([i + [0]*(pad-len(i)) for i in doc_topic])
doc_hash = hstack([doc_hash,doc_topic])
###########
print("topic",doc_hash.shape,type(doc_hash))

# ## time
# doc_time = df_new['time'].iloc[:100]
# hashvec = HashingVectorizer(n_features=2**10,
#                             preprocessor=preprocessor,
#                             tokenizer=tokenizer_stem_nostop)
# doc_hash = hstack([doc_hash,hashvec.transform(doc_time)])
# print(doc_hash.shape,type(doc_hash))

## author
tmp = df_new['author'].replace(np.nan, 'x').iloc[:sample_size]
unique = list(set(tmp))
doc_author = []
for i in tmp:
    doc_author.append(unique.index(i))
# print(doc_author)
doc_author = np.array(doc_author)
doc_hash = hstack([doc_hash,doc_author.reshape((sample_size,1))])
print("author",doc_hash.shape,type(doc_hash))

## img
doc_img = df_new['image num'].iloc[:sample_size].to_numpy()
doc_hash =hstack([doc_hash,doc_img.reshape((sample_size,1))])
print("img num",doc_hash.shape,type(doc_hash))

## video
doc_video = df_new['video num'].iloc[:sample_size].to_numpy()
doc_hash = hstack([doc_hash,doc_video.reshape((sample_size,1))])
print("video num",doc_hash.shape,type(doc_hash))

## week of day
doc_w = df_new['week of day'].iloc[:sample_size].to_numpy()
doc_hash = hstack([doc_hash,doc_w.reshape((sample_size,1))])
print("week of day",doc_hash.shape,type(doc_hash))

## weekend
doc_w = df_new['week of day'].iloc[:sample_size].to_numpy()
doc_weekend = []
for i in doc_w:
    if i > 5:
        doc_weekend.append(1)
    else:
        doc_weekend.append(0)
doc_weekend = np.array(doc_weekend)
doc_hash = hstack([doc_hash,doc_weekend.reshape((sample_size,1))])
print("weekend",doc_hash.shape,type(doc_hash))

## month
doc_m = df_new['month'].iloc[:sample_size].to_numpy()
doc_hash = hstack([doc_hash,doc_m.reshape((sample_size,1))])
print("month",doc_hash.shape,type(doc_hash))

## hour
doc_h = df_new['hour'].iloc[:sample_size].to_numpy()
doc_hash = hstack([doc_hash,doc_h.reshape((sample_size,1))])
print("hour",doc_hash.shape,type(doc_hash))

## year
doc_y = df_new['year'].iloc[:sample_size].to_numpy()
doc_hash = hstack([doc_hash,doc_y.reshape((sample_size,1))])
print("year",doc_hash.shape,type(doc_hash))

# print(doc_hash.todense())

body (1000, 1024) <class 'scipy.sparse.csr.csr_matrix'>
topic (1000, 1041) <class 'scipy.sparse.coo.coo_matrix'>
author (1000, 1042) <class 'scipy.sparse.coo.coo_matrix'>
img num (1000, 1043) <class 'scipy.sparse.coo.coo_matrix'>
video num (1000, 1044) <class 'scipy.sparse.coo.coo_matrix'>
week of day (1000, 1045) <class 'scipy.sparse.coo.coo_matrix'>
weekend (1000, 1046) <class 'scipy.sparse.coo.coo_matrix'>
month (1000, 1047) <class 'scipy.sparse.coo.coo_matrix'>
hour (1000, 1048) <class 'scipy.sparse.coo.coo_matrix'>
year (1000, 1049) <class 'scipy.sparse.coo.coo_matrix'>


In [24]:
doc_y_label = df_new['Popularity'].iloc[:sample_size].to_numpy()
print(doc_y_label.shape)

(1000,)


In [33]:
import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

scores = cross_val_score(estimator=RandomForestRegressor(),
                         X=doc_hash, y=doc_y_label,
                         cv=10, scoring='roc_auc')
print('%.3f (+/-%.3f)' % ( scores.mean(), scores.std()))

0.580 (+/-0.052)


In [31]:
# check xgboost version
import xgboost as xgb
print(xgboost.__version__)
model = xgb.XGBRegressor()

1.3.3
