In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import time
import datetime
from collections import Counter

df = pd.read_csv('../NYTimesBlogTrain.csv')

y = df['Popular'] 

# convert date to timestamp (milliseconds, divide by a million to get a more readable decimal)
df['PubDate'] = df['PubDate'].apply(lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S'))/1000000)

#replace empty values with the middle most common word

def nan_to_meaningful(x, col):
    if pd.isnull(x):
        mc = Counter(col).most_common(8)
        mc = [str(i[0]) for i in mc]
        if 'nan' in mc:
            mc.remove('nan')
        rep = mc.pop((len(mc))//2)
        return rep
        
    return x
        
my_list = ["NewsDesk","SectionName","SubsectionName","Headline","Snippet","Abstract"]

for f in my_list: 
    df[f] = df[f].apply(nan_to_meaningful, args=(df[f],))

# split sentences into series of words so each word can be a column
def string_to_series_of_words(x):    
    return pd.Series(x.split(" "))


for f in my_list[3:]:
    df[f] = df[f].apply(string_to_series_of_words)
    
    
df=pd.get_dummies(df,columns=["NewsDesk","SectionName","SubsectionName", "Headline","Snippet","Abstract"],
                  drop_first=False)

new_features = list(df.dtypes.index)
new_features.remove('Popular')
new_features.remove('UniqueID')

X = df[new_features]

#print(X)





      WordCount      PubDate  NewsDesk_Business  NewsDesk_Culture  \
0           508  1409.634009                  1                 0   
1           285  1409.631247                  0                 1   
2          1211  1409.630736                  1                 0   
3          1405  1409.629414                  1                 0   
4           181  1409.623131                  0                 0   
5           245  1409.622742                  0                 0   
6           258  1409.621570                  0                 0   
7           893  1409.620652                  0                 0   
8          1077  1409.618059                  0                 0   
9           188  1409.615926                  0                 1   
10         3350  1409.615311                  0                 0   
11           97  1409.607583                  0                 0   
12          159  1409.607449                  0                 1   
13          409  1409.606590      

In [None]:
# Guassian Naive Bayes Classifier

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)


from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_predict_gaus = gnb.predict(X_test)
accuracy_gaus = accuracy_score(y_test, y_predict_gaus)

acc_train_g = gnb.score(X_train, y_train)


print("Accuracy on training data for Gaussian Naive Bayes:", accuracy_gaus)

Accuracy on training data for Gaussian Naive Bayes: 0.567278287462


In [None]:
# MLP Classifier

In [16]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(alpha = 1)
mlp.fit(X_train, y_train)
y_predict_mlp = mlp.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_predict_mlp)
print("Accuracy MLP Classifier:", accuracy_mlp)
print()

Accuracy MLP Classifier: 0.848623853211



In [None]:
# Gradient Boosting Classifier

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_predict_grad = gbc.predict(X_test)
accuracy_etc = accuracy_score(y_test, y_predict_grad)

print("Accuracy Gradient Boosting Classifier:", accuracy_etc)
print()


Accuracy Gradient Boosting Classifier: 0.903669724771



In [None]:
# All Classifier Accuracies Combined

In [15]:

combined_predictions=pd.DataFrame([y_predict_gaus, y_predict_mlp, y_predict_grad])

consolidated_predictions = list(combined_predictions.mode().values[0])

acc= accuracy_score(y_test, consolidated_predictions)

print("\n Combined Accuracy: ", acc) 







 Combined Accuracy:  0.892966360856
