In [5]:
import numpy as np
import pandas as pd
data = pd.read_csv("reddit_worldnews_start_to_2016-11-22.csv", encoding='latin-1')
data = data.sample(5000)

In [6]:
data.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
285199,1403789552,2014-06-26,0,0,Pakistanis fleeing North Waziristan demand ans...,False,r4816,worldnews
86332,1304917313,2011-05-09,850,0,Ecuador bans 500-yr-old tradition of bullfight...,False,maxwellhill,worldnews
131197,1344633757,2012-08-10,2,0,Officials need help finding missing Olympians,False,CarlSagginPants,worldnews
389415,1439551371,2015-08-14,35,0,Some Isolated Tribes in the Amazon Are Initiat...,False,anutensil,worldnews
113067,1328187791,2012-02-02,17,0,London Metropolitan Police inadvertently sha...,False,anutensil,worldnews


## Create Label

In [7]:
data['year']='2008'
for i in data.index:
    data.at[i,'year'] = data.at[i,'date_created'][:4]
    
# temp is a dict that contains the 95% cutoff of each year
temp={}
for i in range(2008,2017):
    temp[str(i)] = np.percentile(data[data['year']==str(i)]['up_votes'],95)

data['label']=0
for i in data.index:
    if data.at[i,'up_votes'] >= temp[data.at[i,'year']]:
        data.at[i,'label']=1

## Title Preprocessing

In [8]:
# load in spacy
import en_core_web_md
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_md.load()

# Preprocess the reviews (tokenizing, lemmatization, removing stopwords)
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def preprocessing(titles):
    filtered_titles = []
    for title in titles:
        title = title.lower()
        token_list = word_tokenize(title) # Tokenize
        filtered_token = [t for t in token_list if not t in stop_words] # Remove stopwords
        for i in range(len(filtered_token)):
            filtered_token[i] = lemmatizer.lemmatize(filtered_token[i]).strip(string.punctuation) # Lemmatization
        filtered_titles.append(" ".join(filtered_token))
    return filtered_titles

## TF-IDF Weighted Word2Vec

In [9]:
# TF-IDF vectorizer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
# data
filtered_corpus = preprocessing(data["title"])
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_df = 0.4, 
                             max_features = 2000) # only use first 2000 features because of 
                                                  # computatioal complexity later on

# vectorize the corpus
vector = vectorizer.fit_transform(filtered_corpus)

In [10]:
# TF-IDF matrix
tfidf_matrix = pd.DataFrame(vector.toarray(), columns = vectorizer.get_feature_names(), index = data.index)
# Word embeddings for each word in the column index of TF-IDF matrix
word2vec = [np.array(nlp(i).vector) for i in tfidf_matrix.columns]
# For each title, use each word's TF-IDF mutliply by its word embeddings vector and sum all the word vectors
# The result is an unweighted matrix for each title
unweighted_matrix = pd.DataFrame(np.dot(tfidf_matrix,np.array(word2vec)), index = tfidf_matrix.index)
#unweighted_matrix.head()

In [11]:
# For each title, use unweighted matrix divided by the sum of that title's TF-IDF to get weighted word2vec matrix
# The result is our final word2vec matrix
final_w2v = unweighted_matrix.div(tfidf_matrix.sum(axis=1), axis=0)
final_w2v = final_w2v.fillna(0)
#final_w2v.head()

In [12]:
final_w2v["label"] = np.array(data["label"])

In [13]:
from sklearn.model_selection import train_test_split

reddit_train, reddit_test, y_train, y_test = train_test_split(
    final_w2v,
    final_w2v["label"],
    test_size=0.3,
    random_state=1)

## Create New Variables -- Author_Value

In [14]:
reddit_train = reddit_train.drop(columns = ["label"])
reddit_test = reddit_test.drop(columns = ["label"])

In [15]:
data_train = data.loc[list(reddit_train.index),:]
data_test = data.loc[list(reddit_test.index),:]

In [16]:
data_author_y = data_train[["author", "label"]]
author_mean = data_author_y.groupby(by = "author").mean()

In [17]:
data_train['author_value'] = data_train['author'].map(author_mean['label'])
author_value_mean = data_train['author_value'].mean()
data_test['author_value'] = data_test['author'].map(author_mean['label'])
data_test = data_test.fillna(author_value_mean)

In [18]:
reddit_train["author_value"] = np.array(data_train["author_value"])
reddit_test["author_value"] = np.array(data_test["author_value"])

In [19]:
final_variables = pd.DataFrame(np.concatenate((reddit_train,reddit_test)), columns = reddit_train.columns)

In [20]:
train_row = len(reddit_train)
test_row = len(reddit_test)

## PCA

In [21]:
from sklearn.decomposition import PCA

# keep 90% of original information
pca = PCA(n_components = 0.8)
pca_features = pca.fit_transform(np.array(final_variables))
pca_df = pd.DataFrame(pca_features)

In [22]:
pca_df.shape

(5000, 85)

In [23]:
X_train = pca_df[:int(train_row)]
X_test = pca_df[-int(test_row):]

## Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

re = LogisticRegression()
re.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
y_pred = re.predict(X_test)

In [35]:
re.score(X_train, y_train)

0.9808571428571429

In [36]:
re.score(X_test, y_test)

0.9433333333333334

In [37]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.5156154456927472

## FDR

In [38]:
train_prob=re.predict_proba(X_train)
test_prob=re.predict_proba(X_test)

In [39]:
train_df = pd.DataFrame(train_prob,index=y_train.index,columns = ['prob', 'y'])
train_df['y']=y_train
test_df = pd.DataFrame(test_prob,index=y_test.index,columns = ['prob', 'y'])
test_df['y']=y_test

In [40]:
k=0
cur=0
num=int(0.05*len(train_df.index))
for i in train_df.sort_values(by='prob').index:
    if cur<num:
        cur+=1
        k+=train_df.loc[i,'y']
# train
k/sum(train_df['y'])

0.797752808988764

In [41]:
k=0
cur=0
num=int(0.05*len(test_df.index))
for i in test_df.sort_values(by='prob').index:
    if cur<num:
        cur+=1
        k+=test_df.loc[i,'y']
# test
k/sum(test_df['y'])

0.06493506493506493