In [1]:
import numpy as np
import pandas as pd
# Location of the raw r/worldnews dump, kept in one place so it is easy to change.
DATA_PATH = "reddit_worldnews_start_to_2016-11-22.csv"
# NOTE(review): the file is decoded as latin-1; confirm this matches the CSV's real encoding.
data = pd.read_csv(DATA_PATH, encoding='latin-1')

In [None]:
# Derive the posting year as a 4-character string (e.g. '2008') from the first
# four characters of `date_created`.
# Vectorised with the `.str` accessor instead of a per-row Python loop; rows
# with a missing `date_created` become NaN rather than raising.
data['year'] = data['date_created'].str[:4]

In [None]:
# temp maps each year (as a string, e.g. '2008') to the 95th-percentile
# `up_votes` value of the posts created in that year.
# The years are taken from the data itself rather than a hard-coded range, so
# a year with no posts is never fed to np.percentile and every year that does
# occur gets a cutoff.
temp = {
    year: np.percentile(votes, 95)
    for year, votes in data.groupby('year')['up_votes']
}

In [None]:
# Binary target: 1 when a post's up-votes reach its own year's 95th-percentile
# cutoff (from `temp`), otherwise 0.
# Look up each row's cutoff in one vectorised pass instead of looping over rows.
# A year without a cutoff maps to NaN, which compares False and is labelled 0.
year_cutoff = data['year'].map(temp)
data['label'] = (data['up_votes'] >= year_cutoff).astype('int64')

In [None]:
# Load the spaCy `en_core_web_md` pipeline; `nlp` is used further down to look
# up a vector for each vocabulary term.
# NOTE(review): `spacy` and `cosine` are imported but not used in the cells
# visible here.
import en_core_web_md
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_md.load()

# NLTK tooling for preprocessing the post titles (tokenizing, lemmatization,
# removing stopwords).
# NOTE(review): this notebook does not show the NLTK data (stopwords, tokenizer
# and WordNet corpora) being downloaded; it presumably must exist beforehand.
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

# English stopwords held in a set so membership tests in `preprocessing` are fast.
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocessing(titles):
    """Clean a collection of titles for vectorisation.

    Each title is lower-cased, tokenized with ``word_tokenize``, stripped of
    English stopwords, lemmatized, and has leading/trailing punctuation
    removed from every token.

    Parameters
    ----------
    titles : iterable
        The raw titles. Entries that are not strings (for example NaN from a
        missing CSV cell) are treated as empty titles instead of raising.

    Returns
    -------
    list of str
        One space-joined string of cleaned tokens per input title, in the
        same order and with the same length as ``titles``.
    """
    filtered_titles = []
    for title in titles:
        if not isinstance(title, str):
            # Keep a placeholder so the output stays aligned with the input rows.
            filtered_titles.append("")
            continue

        cleaned_tokens = []
        for token in word_tokenize(title.lower()):
            if token in stop_words:
                continue
            token = lemmatizer.lemmatize(token).strip(string.punctuation)
            # Tokens that were pure punctuation are now empty; drop them so
            # they do not leave stray spaces in the joined title.
            if token:
                cleaned_tokens.append(token)
        filtered_titles.append(" ".join(cleaned_tokens))
    return filtered_titles

In [None]:
# TF-IDF vectorizer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
# Cleaned, space-joined token string for every post title.
filtered_corpus = preprocessing(data["title"])

# Unigram TF-IDF over alphabetic tokens of at least three letters.
# Terms appearing in more than 40% of titles are ignored, and the vocabulary
# is capped at 2000 terms to limit computational cost later on.
tfidf_settings = {
    "ngram_range": (1, 1),
    "token_pattern": r'\b[a-zA-Z]{3,}\b',
    "max_df": 0.4,
    "max_features": 2000,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit the vocabulary and transform the corpus into a sparse TF-IDF matrix.
vector = vectorizer.fit_transform(filtered_corpus)

In [None]:
# TF-IDF matrix: one row per title, one column per vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE: `.toarray()` materialises the full dense matrix in memory.
tfidf_matrix = pd.DataFrame(vector.toarray(), columns=feature_names)
# Word embedding for each term in the column index of the TF-IDF matrix
word2vec = [np.array(nlp(i).vector) for i in tfidf_matrix.columns]
# For each title, multiply each term's TF-IDF by that term's embedding vector
# and sum over all terms. The result is the (not yet normalised) TF-IDF-weighted
# sum of word vectors for each title.
unweighted_matrix = pd.DataFrame(np.dot(tfidf_matrix, np.array(word2vec)))
unweighted_matrix.head()

In [None]:
# Normalise each title's summed word vectors by that title's total TF-IDF
# weight, giving a TF-IDF-weighted average embedding per title.
title_tfidf_total = tfidf_matrix.sum(axis=1)
# Titles whose total is 0 produce NaN from the division; those are set to 0.
final_w2v = unweighted_matrix.div(title_tfidf_total, axis=0).fillna(0)
final_w2v.head()

In [None]:
from sklearn.decomposition import PCA

# Keep as many principal components as needed to explain 80% of the variance.
# NOTE(review): the previous comment said 90% while the code uses 0.8 —
# confirm which threshold is intended.
EXPLAINED_VARIANCE = 0.8
pca = PCA(n_components=EXPLAINED_VARIANCE)
pca_features = pca.fit_transform(final_w2v.to_numpy())
pca_df = pd.DataFrame(data=pca_features)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the titles for testing; the fixed seed keeps the split
# reproducible.
# The target is selected by name rather than by position (`iloc[:, -1]`), so
# adding another column to `data` later cannot silently change the target.
X_train, X_test, y_train, y_test = train_test_split(
    pca_df,
    data['label'],
    test_size=0.3,
    random_state=1)