In [2]:
import numpy as np
import pandas as pd
# Read the posts CSV into a DataFrame, decoding the file as latin-1.
data = pd.read_csv("reddit_worldnews_start_to_2016-11-22.csv", encoding='latin-1')

# Labeling

We label each post as popular or not popular according to its number of up votes.

Since the number of up votes increases from year to year, we assign the "popular" label per year: a post is popular if its up votes are in the top 5% (at or above the 95th percentile) of the posts created in the same year.

In [3]:
# The year is the first four characters of the 'date_created' string.
# A single vectorised string slice replaces the per-row Python loop.
data['year'] = data['date_created'].str[:4]

In [4]:
# temp is a dict mapping each year (as a string) to the 95th-percentile
# up-vote count of that year's posts.
# The years are taken from the data itself rather than from a hard-coded
# range, so every year that occurs in the 'year' column gets a cutoff.
# The default linear interpolation matches np.percentile(..., 95).
temp = data.groupby('year')['up_votes'].quantile(0.95).to_dict()

In [5]:
# label = 1 when a post's up votes reach its own year's cutoff from `temp`,
# otherwise 0. Each row's year is mapped to its cutoff and compared in one
# vectorised step; a year with no cutoff compares as False and is labeled 0.
year_cutoff = data['year'].map(temp)
data['label'] = (data['up_votes'] >= year_cutoff).astype(int)

# Text Preprocessing

In [6]:
# Load spaCy and the en_core_web_md model; `nlp` is used later to look up word vectors
import en_core_web_md
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_md.load()

# NLTK tools used to preprocess the post titles (tokenizing, lemmatization, removing stopwords)
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

# English stopwords kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance reused for every token
lemmatizer = WordNetLemmatizer()

def preprocessing(titles):
    """Normalise an iterable of title strings for vectorisation.

    Each title is lower-cased, tokenized, stripped of stopwords, lemmatized,
    and has leading/trailing punctuation removed from every token. Tokens
    that are empty after punctuation stripping (i.e. pure punctuation) are
    dropped so the result contains no doubled or trailing spaces.

    Parameters
    ----------
    titles : iterable of str
        Raw titles.

    Returns
    -------
    list of str
        One space-joined string of cleaned tokens per input title, in the
        same order as the input. A title with no surviving tokens yields "".
    """
    filtered_titles = []
    for title in titles:
        cleaned_tokens = []
        for token in word_tokenize(title.lower()):  # Tokenize
            if token in stop_words:  # Remove stopwords
                continue
            # Lemmatization, then trim surrounding punctuation
            lemma = lemmatizer.lemmatize(token).strip(string.punctuation)
            if lemma:  # skip tokens that were nothing but punctuation
                cleaned_tokens.append(lemma)
        filtered_titles.append(" ".join(cleaned_tokens))
    return filtered_titles

# TF-IDF weighted word2vec

In [7]:
# Build the TF-IDF representation of the preprocessed titles
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean every title before vectorising
filtered_corpus = preprocessing(data["title"])

# Unigrams only; a token is an alphabetic word of at least three letters.
# Terms found in more than 40% of the titles are ignored, and the vocabulary
# is capped at 2000 terms because of the computational cost later on.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.4,
    max_features=2000,
)

# Learn the vocabulary and vectorize the corpus in one pass
vector = vectorizer.fit_transform(filtered_corpus)

In [8]:
# TF-IDF matrix: one row per title, one column per vocabulary term.
# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever the installed version has.
feature_names_getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
tfidf_matrix = pd.DataFrame(vector.toarray(), columns = feature_names_getter())
# Word embeddings for each word in the column index of TF-IDF matrix
word2vec = [np.array(nlp(i).vector) for i in tfidf_matrix.columns]
# For each title, multiply each word's TF-IDF by its word-embedding vector and sum over all words.
# The result is the TF-IDF-weighted *sum* of embeddings per title; it has not yet been
# divided by the title's total TF-IDF weight, hence the name "unweighted".
unweighted_matrix = pd.DataFrame(np.dot(tfidf_matrix,np.array(word2vec)))
unweighted_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.91206,0.486792,0.476928,0.289893,0.05477,-0.870862,-0.268372,0.243633,0.6386,3.758917,...,0.14754,-0.250236,-0.509609,0.120965,0.368058,0.258011,-0.647812,-0.213497,0.221647,0.039148
1,-0.324209,-0.390879,-0.141281,0.468408,0.106328,0.084631,-0.485883,0.411318,0.092061,3.285786,...,0.56811,0.480141,0.255341,0.012618,-0.005795,-0.604799,-0.348221,0.065231,-0.195351,-0.35877
2,-0.392182,0.212705,0.292572,-0.265595,0.723275,-0.399778,-0.584884,0.070501,-0.410889,2.268124,...,0.122901,0.364871,-0.286764,-0.4559,0.556184,-0.654148,-0.201398,0.354193,-0.320886,0.65889
3,-0.49549,0.646893,-0.114675,-0.209844,-0.49477,-0.81525,-0.022099,-0.126436,0.217735,6.272498,...,-0.395701,0.316228,-0.125113,-0.427662,0.136894,-0.184926,-0.489197,-0.181203,0.02129,0.072146
4,-0.521825,0.040879,0.59142,0.060628,0.589623,0.026628,-0.254125,0.444144,-0.203941,4.084317,...,-0.188162,0.022798,0.389346,-0.072245,-0.124132,0.184864,0.015786,0.14774,-0.223388,0.561387


In [9]:
# Normalise each title's summed embedding by that title's total TF-IDF weight,
# giving the TF-IDF-weighted average embedding (the final word2vec matrix).
# Titles whose TF-IDF row sums to zero produce NaN in the division and are set to 0.
final_w2v = (
    unweighted_matrix
    .div(tfidf_matrix.sum(axis=1), axis=0)
    .fillna(0)
)
final_w2v.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.465222,0.248302,0.243271,0.147868,0.027937,-0.444208,-0.136891,0.124272,0.325736,1.917342,...,0.075257,-0.12764,-0.259941,0.061702,0.187738,0.131606,-0.330435,-0.1089,0.113058,0.019968
1,-0.1895,-0.228469,-0.082579,0.273784,0.062149,0.049467,-0.283999,0.240415,0.05381,1.920541,...,0.33206,0.280642,0.149247,0.007375,-0.003387,-0.353505,-0.203535,0.038127,-0.114182,-0.209701
2,-0.197132,0.106917,0.147062,-0.133502,0.363557,-0.20095,-0.293994,0.035438,-0.206535,1.14008,...,0.061777,0.183404,-0.144143,-0.22916,0.279568,-0.328809,-0.101234,0.178036,-0.161294,0.331193
3,-0.222323,0.290256,-0.051454,-0.094156,-0.222,-0.365797,-0.009915,-0.056731,0.097696,2.814425,...,-0.177548,0.141889,-0.056137,-0.191889,0.061423,-0.082975,-0.219499,-0.081305,0.009553,0.032371
4,-0.301983,0.023657,0.342258,0.035086,0.341218,0.01541,-0.147064,0.257029,-0.118022,2.363621,...,-0.108891,0.013193,0.225317,-0.041809,-0.071836,0.106982,0.009136,0.085498,-0.129276,0.324879


# PCA to reduce dimensionality

In [10]:
from sklearn.decomposition import PCA

# A fractional n_components makes PCA keep as many components as are needed
# to explain that share of the variance (here 80%).
pca = PCA(n_components=0.8)
pca_features = pca.fit_transform(final_w2v.values)
pca_df = pd.DataFrame(data=pca_features)

In [11]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed.
# The target is selected by name: picking "the last column" only works as long
# as no other column is ever added to `data` after 'label'.
X_train, X_test, y_train, y_test = train_test_split(
    pca_df,
    data['label'],
    test_size=0.3,
    random_state=1)

In [13]:
# Shape of the training split as (rows, columns)
X_train.shape

(356465, 88)

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

  from numpy.core.umath_tests import inner1d


In [28]:
# Sweep the learning rate of a gradient-boosting classifier and report the
# "detection rate at 5%": the share of all popular test posts that fall into
# the 5% of test posts the model scores as most likely to be popular.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # n_estimators is tied to the number of input features, as in the original setup
    gb = GradientBoostingClassifier(n_estimators=X_train.shape[1], learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train, y_train)

    # predict_proba columns follow gb.classes_; pick the column of label 1
    # explicitly instead of relying on column position.
    positive_col = list(gb.classes_).index(1)
    test_df = pd.DataFrame(
        {'prob': gb.predict_proba(X_test)[:, positive_col],  # P(label == 1)
         'y': y_test.values},
        index=y_test.index)

    print("Learning rate: ", learning_rate)

    # Number of posts in the top 5% of the test set
    num = int(0.05 * len(test_df.index))
    # Highest scores first; a stable sort keeps tie-breaking deterministic
    top_scored = test_df.sort_values(by='prob', ascending=False, kind='mergesort').head(num)
    # Popular posts captured in the top 5% / all popular posts in the test set
    rate = top_scored['y'].sum() / test_df['y'].sum()
    print("Detection Rate at 5% (testing): {0:.3f}".format(rate))

Learning rate:  0.05
Detection Rate at 5% (testing): 0.123
Learning rate:  0.1
Detection Rate at 5% (testing): 0.126
Learning rate:  0.25
Detection Rate at 5% (testing): 0.134
Learning rate:  0.5
Detection Rate at 5% (testing): 0.135
Learning rate:  0.75
Detection Rate at 5% (testing): 0.130
Learning rate:  1
Detection Rate at 5% (testing): 0.130


The best learning rate is 0.5, judged by the detection rate at 5% on the test set.

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep the number of neighbours and report the "detection rate at 5%":
# the share of all popular test posts that fall into the 5% of test posts
# the model scores as most likely to be popular.
neighbors = np.arange(1,9)

for k in neighbors:
    #Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)

    #Fit the model
    knn.fit(X_train, y_train)

    # Only the test set is scored: the training-set probabilities were never
    # used, and computing them is by far the most expensive step for kNN.
    # predict_proba columns follow knn.classes_; pick the column of label 1.
    positive_col = list(knn.classes_).index(1)
    test_df = pd.DataFrame(
        {'prob': knn.predict_proba(X_test)[:, positive_col],  # P(label == 1)
         'y': y_test.values},
        index=y_test.index)

    # Number of posts in the top 5% of the test set
    num = int(0.05 * len(test_df.index))
    # kNN probabilities are multiples of 1/k, so ties are common; a stable
    # sort makes the choice among tied posts deterministic.
    top_scored = test_df.sort_values(by='prob', ascending=False, kind='mergesort').head(num)
    # Popular posts captured in the top 5% / all popular posts in the test set
    rate = top_scored['y'].sum() / test_df['y'].sum()
    print("Number of Neighbors: ", k)
    print("Detection Rate at 5% (testing): {0:.3f}".format(rate))

# DNN

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import keras
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
# NOTE: this overwrites X_train / X_test with scaled NumPy arrays, so cells
# above that use these names will see scaled data if re-run after this cell.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
#unit = number of neurons in the layer
# input_dim is the number of feature columns in X_train. The previous
# placeholder name was never defined and raised a NameError.
# NOTE(review): softmax on a hidden layer is unusual (relu is the common
# choice); left unchanged here - confirm whether it is intentional.
clf1 = Sequential([
    Dense(units=9, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'),
    Dense(units=14, kernel_initializer='normal', activation='softmax'),
    Dropout(0.25),
    Dense(1, kernel_initializer='normal', activation='sigmoid')
])
clf1.summary()