In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [None]:
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelBinarizer, LabelBinarizer
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

import nltk
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum, strip_short, strip_numeric
from gensim.models import Word2Vec, KeyedVectors
from imblearn.over_sampling import ADASYN
from collections import Counter
from sklearn.decomposition import SparsePCA

In [None]:
import warnings
# Suppress FutureWarning messages raised by the cells that follow.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def get_auc(y_true,y_pred):
    """Return the weighted ROC AUC of (possibly continuous) predictions.

    Predictions are turned into class labels by rounding each value to the
    nearest integer and taking its absolute value. Both the true labels and
    the rounded predictions are then binarized with a LabelBinarizer fitted
    on ``y_true`` and scored with ``roc_auc_score(average='weighted')``.

    Parameters:
        y_true: iterable of true class labels.
        y_pred: iterable of numeric predictions (regression outputs or labels).

    Returns:
        The value returned by ``roc_auc_score`` for the binarized inputs.
    """
    predicted_labels = [abs(round(score)) for score in y_pred]

    binarizer = LabelBinarizer()
    binarizer.fit(y_true)

    truth_indicator = binarizer.transform(y_true)
    pred_indicator = binarizer.transform(predicted_labels)
    return roc_auc_score(truth_indicator, pred_indicator, average='weighted')

In [None]:
# Read the article dataset, then blank out every URL-like token in the text
# (values are coerced to str first, so missing entries become the text 'nan').
df = pd.read_csv('reduced_df.csv')
article_text = df['article_text'].map(
    lambda raw: re.sub(r'http\S+', '', str(raw))
)

In [None]:
# Clean each article in stages: lowercase, strip non-alphanumerics, strip
# digits, remove stopwords, then drop tokens shorter than three characters.
text = (
    article_text
    .map(str.lower)
    .map(strip_non_alphanum)
    .map(strip_numeric)
    .map(remove_stopwords)
    .map(lambda doc: strip_short(doc, minsize=3))
)
# Character slice spanning the 40%-60% region of each cleaned article.
mid_5th = text.map(lambda doc: doc[round(0.4*len(doc)):round(0.6*len(doc))])

In [None]:
# Vectorize the cleaned text with TF-IDF, capped at 2500 vocabulary terms.
tfidf = TfidfVectorizer(max_features=2500)
vectors = tfidf.fit_transform(text)

# Encode the norm_score column as integer class labels.
# Requires `from sklearn import preprocessing` (added to the imports cell);
# without it this line raised NameError on a fresh kernel.
encoder = preprocessing.LabelEncoder()
labels = encoder.fit_transform(df['norm_score'])

# tfidf.vocabulary_ maps term -> column index; invert it so a column index
# can be translated back to its term when listing important features.
vocab = {column: term for term, column in tfidf.vocabulary_.items()}

In [None]:
# Hold out 10% of the samples for testing, then oversample only the training
# portion with ADASYN so the test set keeps its original label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, labels, test_size=0.1, random_state=42
)
oversampler = ADASYN(n_jobs=4, random_state=42)
X_train_rs, y_train_rs = oversampler.fit_resample(X_train, y_train)

In [None]:
# Display the dimensions of the ADASYN-resampled training matrix.
X_train_rs.shape

In [None]:
# Plot the class balance of the resampled training labels.
# The labels are passed as `x=` explicitly: a positional vector is interpreted
# as `data` (not `x`) by newer seaborn releases, while the keyword form works
# on old and new versions alike.
ax = sns.countplot(x=y_train_rs, palette="husl")
ax.set_title("Label Distribution")

# Exact per-class counts, ordered by label.
print(sorted(Counter(y_train_rs).items()))

In [None]:
# Class balance of the untouched hold-out labels (keyword `x=` for the same
# reason as the training plot; a title is added so the figure stands alone).
sns.countplot(x=y_test, palette="husl").set_title("Test Label Distribution");

## Linear Regression

In [None]:
# Fit ordinary least squares on the resampled training data and treat the
# rounded regression output as a class prediction.
reg = LinearRegression()
reg.fit(X_train_rs, y_train_rs)
reg_pred = reg.predict(X_test)

# Convert continuous predictions to labels once, so accuracy and the
# confusion matrix use the same values. Previously the confusion matrix used
# round() without abs(), so negative predictions appeared as extra classes
# there but not in the accuracy/AUC figures.
reg_labels = [abs(round(value)) for value in reg_pred]

print("MSE: {}".format(mean_squared_error(y_test, reg_pred)))
sns.distplot(reg_pred).set_title("Prediction Distribution for Regression")

print("Overall Accuracy: {0:.2f}".format(accuracy_score(y_test, reg_labels)))
print("AUC: {0:.2f}".format(get_auc(y_test, reg_pred)))
print(confusion_matrix(y_test, reg_labels))

## Lasso Regression

In [None]:
# Grid-search the Lasso regularisation strength over 10 log-spaced values
# in [1e-3, 1] with 3-fold cross-validation.
lasso = Lasso(random_state=42)
alpha = np.logspace(-3, 0.0, 10)
param_grid = {'alpha': alpha, 'max_iter': [5000]}

# Search over the seeded estimator defined above; previously it was created
# but a separate unseeded Lasso() was passed to the search.
lasso_search = GridSearchCV(lasso, param_grid, n_jobs=-1, cv=3)
lasso_search.fit(X_train_rs, y_train_rs)

# Mean CV score per alpha on a log x-axis. x/y are passed by keyword (newer
# seaborn rejects positional x/y) and reuse the searched grid directly.
f, ax = plt.subplots()
ax.set(xscale='log')
sns.lineplot(x=alpha, y=lasso_search.cv_results_['mean_test_score'], ax=ax)
ax.set(xlabel='alpha', ylabel='mean CV score')
print(lasso_search.best_params_)

In [None]:
# Refit Lasso with the best alpha found by the grid search and evaluate it
# on the hold-out set.
lasso = Lasso(alpha=lasso_search.best_params_['alpha'], max_iter=5000)
lasso.fit(X_train_rs, y_train_rs)
lasso_pred = lasso.predict(X_test)

# Single label conversion shared by accuracy and the confusion matrix
# (the confusion matrix previously omitted abs()).
lasso_labels = [abs(round(value)) for value in lasso_pred]

print("MSE: {}".format(mean_squared_error(y_test, lasso_pred)))
print("Non-zero coefficients: {}".format(np.count_nonzero(lasso.coef_)))
sns.distplot(lasso_pred).set_title("Prediction Distribution for Lasso")

print("Overall Accuracy: {0:.2f}".format(accuracy_score(y_test, lasso_labels)))
# Was get_auc(y_test, pred): `pred` is not defined anywhere in the notebook.
print("AUC: {0:.2f}".format(get_auc(y_test, lasso_pred)))
print(confusion_matrix(y_test, lasso_labels))

In [None]:
# Rank vocabulary columns by absolute Lasso coefficient (ascending) and print
# the 15 terms with the largest magnitude, strongest first.
coef = np.abs(lasso.coef_)
sorted_coef = np.argsort(coef)

strongest_first = sorted_coef[-15:][::-1]
for idx in strongest_first:
    print(vocab[idx])

## Ridge Regression

In [None]:
# Grid-search the Ridge regularisation strength over 10 log-spaced values
# in [1e-3, 10] with 5-fold cross-validation.
alpha = np.logspace(-3.0, 1.0, 10)
param_grid = {'alpha': alpha}

ridge_search = GridSearchCV(Ridge(), param_grid, n_jobs=-1, cv=5)
ridge_search.fit(X_train_rs, y_train_rs)

# Plot against the grid that was actually searched. The x-axis was previously
# np.logspace(-5, 0, 10), which mislabelled every point on the curve.
f, ax = plt.subplots()
ax.set(xscale='log')
sns.lineplot(x=alpha, y=ridge_search.cv_results_['mean_test_score'], ax=ax)
ax.set(xlabel='alpha', ylabel='mean CV score')
print(ridge_search.best_params_)

In [None]:
# Refit Ridge with the best alpha found by the grid search and evaluate it
# on the hold-out set.
ridge = Ridge(alpha=ridge_search.best_params_['alpha'], max_iter=10000)
ridge.fit(X_train_rs, y_train_rs)
ridge_pred = ridge.predict(X_test)

# Single label conversion shared by accuracy and the confusion matrix
# (the confusion matrix previously omitted abs()).
ridge_labels = [abs(round(value)) for value in ridge_pred]

print("MSE: {}".format(mean_squared_error(y_test, ridge_pred)))
sns.distplot(ridge_pred).set_title("Prediction Distribution for Ridge")

print("Overall Accuracy: {0:.2f}".format(accuracy_score(y_test, ridge_labels)))
print("AUC: {0:.2f}".format(get_auc(y_test, ridge_pred)))
print(confusion_matrix(y_test, ridge_labels))

## Linear Regression on Lasso-Selected (Non-zero) Features

In [None]:
# Keep only the feature columns whose Lasso coefficient is non-zero.
idx = np.nonzero(lasso.coef_)[0]

# Select the columns directly on the feature matrices. The previous
# pd.SparseDataFrame round-trip no longer exists in pandas >= 1.0, and plain
# column indexing gives the same columns without leaving sparse storage.
reduced = X_train_rs[:, idx]
reduced_test = X_test[:, idx]

In [None]:
# Ordinary least squares restricted to the Lasso-selected feature columns.
# NOTE(review): `normalize` was removed from LinearRegression in
# scikit-learn 1.2; on newer versions this call raises TypeError. Confirm the
# target scikit-learn version before running.
reg = LinearRegression(normalize=True)
reg.fit(reduced, y_train_rs)
reg_pred = reg.predict(reduced_test)

# Single label conversion shared by accuracy and the confusion matrix
# (the confusion matrix previously omitted abs()).
reg_labels = [abs(round(value)) for value in reg_pred]

print("MSE: {}".format(mean_squared_error(y_test, reg_pred)))
sns.distplot(reg_pred).set_title("Prediction Distribution for Regression")

print("Overall Accuracy: {0:.2f}".format(accuracy_score(y_test, reg_labels)))
print("AUC: {0:.2f}".format(get_auc(y_test, reg_pred)))
print(confusion_matrix(y_test, reg_labels))

## XGBoost

In [None]:
# Gradient-boosted trees on the full TF-IDF feature set.
boosted = XGBClassifier(
    nthread=12,
    reg_lambda=0.75,
    max_depth=5,
)
boosted.fit(X_train_rs, y_train_rs)
boosted_pred = boosted.predict(X_test)

# 3-fold CV on the resampled training data, then hold-out metrics.
cv_mean = cross_val_score(boosted, X_train_rs, y_train_rs, cv=3).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = boosted.score(X_test, y_test)
print("Test score: {0:.2f}".format(holdout_score))
print(confusion_matrix(y_test, boosted_pred))

holdout_auc = get_auc(y_test, boosted_pred)
print("AUC: {0:.2f}".format(holdout_auc))
print(classification_report(y_test, boosted_pred))

In [None]:
# Column indices ordered by XGBoost feature importance (ascending); print the
# 15 most important terms, highest first.
features = np.argsort(boosted.feature_importances_)

most_important = features[-15:][::-1]
for idx in most_important:
    print(vocab[idx])

### Using Lasso-Selected Features

In [None]:
# Same XGBoost configuration, trained only on the Lasso-selected columns.
boosted = XGBClassifier(
    nthread=12,
    reg_lambda=0.75,
    max_depth=5,
)
boosted.fit(reduced, y_train_rs)
boosted_pred = boosted.predict(reduced_test)

# 3-fold CV on the reduced training data, then hold-out metrics.
cv_mean = cross_val_score(boosted, reduced, y_train_rs, cv=3).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = boosted.score(reduced_test, y_test)
print("Test score: {0:.2f}".format(holdout_score))
print(confusion_matrix(y_test, boosted_pred))

holdout_auc = get_auc(y_test, boosted_pred)
print("AUC: {0:.2f}".format(holdout_auc))
print(classification_report(y_test, boosted_pred))

### Using SelectKBest

In [None]:
# Univariate selection: keep the 200 features with the highest ANOVA F-score,
# fitted on the resampled training data only.
k_best = SelectKBest(score_func=f_classif, k=200)
k_best.fit(X_train_rs, y_train_rs)

# Apply the fitted selector to both the training and the hold-out matrices.
X_best_train, X_best_test = (
    k_best.transform(matrix) for matrix in (X_train_rs, X_test)
)

In [None]:
# Column indices of the features kept by SelectKBest (ascending index order).
k_best_idx = k_best.get_support(indices=True)

# Print the 15 highest-scoring selected terms, best first, to match the Lasso
# and XGBoost listings. Slicing k_best_idx[:15] directly only showed the 15
# lowest-index selected columns, regardless of score.
# argsort on the negated scores sorts descending and keeps any NaN last.
selected_scores = k_best.scores_[k_best_idx]
ranked_idx = k_best_idx[np.argsort(-selected_scores)]

for idx in ranked_idx[:15]:
    print(vocab[idx])

In [None]:
# XGBoost trained on the 200 SelectKBest features.
boosted_k = XGBClassifier(
    nthread=12,
    reg_lambda=0.75,
    max_depth=5,
)
boosted_k.fit(X_best_train, y_train_rs)
boosted_pred_k = boosted_k.predict(X_best_test)

# 3-fold CV on the selected training features, then hold-out metrics.
cv_mean = cross_val_score(boosted_k, X_best_train, y_train_rs, cv=3).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = boosted_k.score(X_best_test, y_test)
print("Test score: {0:.2f}".format(holdout_score))
print(confusion_matrix(y_test, boosted_pred_k))

holdout_auc = get_auc(y_test, boosted_pred_k)
print("AUC: {0:.2f}".format(holdout_auc))
print(classification_report(y_test, boosted_pred_k))


## LinearSVC

In [None]:
# Linear SVM on the full TF-IDF feature set.
svc = LinearSVC(
    max_iter=2000,
    dual=False,
    random_state=1,
    C=0.25,
)
svc.fit(X_train_rs, y_train_rs)
svc_pred = svc.predict(X_test)

# 3-fold CV on the resampled training data, then hold-out metrics.
cv_mean = cross_val_score(svc, X_train_rs, y_train_rs, cv=3).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = svc.score(X_test, y_test)
print("Test score: {0:.2f}".format(holdout_score))
print(confusion_matrix(y_test, svc_pred))

holdout_auc = get_auc(y_test, svc_pred)
print("AUC: {0:.2f}".format(holdout_auc))
print(classification_report(y_test, svc_pred))

### Using Lasso-Selected Features

In [None]:
# Same LinearSVC configuration, trained only on the Lasso-selected columns.
svc = LinearSVC(
    max_iter=2000,
    dual=False,
    random_state=1,
    C=0.25,
)
svc.fit(reduced, y_train_rs)
svc_pred = svc.predict(reduced_test)

# 3-fold CV on the reduced training data, then hold-out metrics.
cv_mean = cross_val_score(svc, reduced, y_train_rs, cv=3).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = svc.score(reduced_test, y_test)
print("Test score: {0:.2f}".format(holdout_score))
print(confusion_matrix(y_test, svc_pred))

holdout_auc = get_auc(y_test, svc_pred)
print("AUC: {0:.2f}".format(holdout_auc))
print(classification_report(y_test, svc_pred))

### Using SelectKBest

In [None]:
# LinearSVC trained on the 200 SelectKBest features.
svc_k = LinearSVC(max_iter=2000, dual=False, random_state=1, C=0.25)
svc_k.fit(X_best_train, y_train_rs)
svc_pred_k = svc_k.predict(X_best_test)

print("Cross val score: {}".format(cross_val_score(svc_k, X_best_train, y_train_rs, cv=3).mean()))
print("Test score: {0:.2f}".format(svc_k.score(X_best_test, y_test)))
# Report the SVC's own predictions; this previously printed the confusion
# matrix of the XGBoost model (boosted_pred_k) by copy-paste mistake.
print(confusion_matrix(y_test, svc_pred_k))

print("AUC: {0:.2f}".format(get_auc(y_test, svc_pred_k)))
print(classification_report(y_test, svc_pred_k))

## AdaBoost

In [None]:
# ada = AdaBoostClassifier(n_estimators=30)
# ada.fit(X_train_rs, y_train_rs)
# ada_pred = ada.predict(X_test)

# print("Cross val score: {}".format(cross_val_score(ada, X_train_rs, y_train_rs, cv=3).mean()))
# print("Test score: {}".format(ada.score(X_test, y_test)))
# print(confusion_matrix(y_test, ada_pred))

# print("AUC: {}".format(get_auc(y_test, ada_pred)))
# print(classification_report(y_test, ada_pred))

## Text + Images

In [None]:
# Load the dataset that pairs article text with image-derived columns and
# report how many rows it has.
df_images = pd.read_csv('text_and_images.csv')
print(len(df_images))

# Same cleaning as for the text-only dataset: strip URLs, lowercase, remove
# non-alphanumerics, digits and stopwords, then drop tokens under 3 characters.
article_text = df_images['article_text'].map(
    lambda raw: re.sub(r'http\S+', '', str(raw))
)
text = (
    article_text
    .map(str.lower)
    .map(strip_non_alphanum)
    .map(strip_numeric)
    .map(remove_stopwords)
    .map(lambda doc: strip_short(doc, minsize=3))
)

In [None]:
# Vectorize the cleaned text of the text+images dataset with TF-IDF
# (2500-term vocabulary).
tfidf = TfidfVectorizer(max_features=2500)
vectors = tfidf.fit_transform(text)

# Encode the norm_score column as integer class labels.
# Requires `from sklearn import preprocessing` (added to the imports cell);
# without it this line raised NameError on a fresh kernel.
encoder = preprocessing.LabelEncoder()
labels = encoder.fit_transform(df_images['norm_score'])

# Column index -> term lookup (inverse of tfidf.vocabulary_).
vocab = {column: term for term, column in tfidf.vocabulary_.items()}

# Everything except the text, identifier and target columns is kept as the
# image feature set.
images_data = df_images.drop(['article_title', 'article_text', 'url', 'norm_score'], axis=1)

In [None]:
# Build a frame of TF-IDF columns named by term and append it to the image
# features. pd.SparseDataFrame no longer exists in pandas >= 1.0, so the
# matrix is densified into a regular DataFrame. get_feature_names() was
# replaced by get_feature_names_out() in newer scikit-learn; use whichever
# this version provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = list(tfidf.get_feature_names())
sparse = pd.DataFrame(vectors.toarray(), columns=feature_names)
combined = images_data.join(sparse)
text_idx = sparse.shape[1]  # number of text columns, placed last in `combined`

X_train, X_test, y_train, y_test = train_test_split(combined, labels, test_size=0.1, random_state=42)
X_train_rs, y_train_rs = ADASYN(n_jobs=4, random_state=42).fit_resample(X_train, y_train)
# Depending on the imbalanced-learn version the resampler returns either an
# ndarray or a DataFrame; normalise to an ndarray so the positional column
# slicing below works in both cases.
X_train_rs = np.asarray(X_train_rs)

# The text columns are the trailing `text_idx` columns; the rest are image features.
text_features_train = X_train_rs[:,-text_idx:]
image_features_train = X_train_rs[:,:-text_idx]

# X_test is still a DataFrame, so split it by column name instead.
text_features_test = X_test[list(sparse.columns)]
image_features_test = X_test.drop(list(sparse.columns), axis=1)

print(y_train_rs.sum(axis=0))
print(y_test.sum(axis=0))

In [None]:
# Untrained classifiers for the text-only features of the text+images data.
boosted_text = XGBClassifier(
    max_depth=5,
    reg_lambda=0.75,
    nthread=12,
)
svc_text = LinearSVC(
    C=0.25,
    dual=False,
    max_iter=2000,
    random_state=1,
)

In [None]:
# Train XGBoost on the text columns only and score it on the hold-out rows.
boosted_text.fit(text_features_train, y_train_rs)
pred_text = boosted_text.predict(text_features_test.values)

# 5-fold CV on the resampled training data, then hold-out metrics.
cv_mean = cross_val_score(boosted_text, text_features_train, y_train_rs, cv=5).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = boosted_text.score(text_features_test.values, y_test)
print("Test score: {0:.2f}".format(holdout_score))
holdout_auc = get_auc(y_test, pred_text)
print("AUC: {0:.2f}".format(holdout_auc))
print(confusion_matrix(y_test, pred_text))

In [None]:
# Train LinearSVC on the text columns only and score it on the hold-out rows.
svc_text.fit(text_features_train, y_train_rs)
pred_text = svc_text.predict(text_features_test.values)

# 5-fold CV on the resampled training data, then hold-out metrics.
cv_mean = cross_val_score(svc_text, text_features_train, y_train_rs, cv=5).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = svc_text.score(text_features_test.values, y_test)
print("Test score: {0:.2f}".format(holdout_score))
holdout_auc = get_auc(y_test, pred_text)
print("AUC: {0:.2f}".format(holdout_auc))
print(confusion_matrix(y_test, pred_text))

## Combined

In [None]:
# Untrained classifiers for the combined image + text feature set.
boosted_combined = XGBClassifier(
    max_depth=5,
    reg_lambda=0.75,
    nthread=12,
)
svc_combined = LinearSVC(
    C=0.25,
    dual=False,
    max_iter=2000,
    random_state=1,
)

In [None]:
# Train XGBoost on image + text features and score it on the hold-out rows.
boosted_combined.fit(X_train_rs, y_train_rs)
pred_combined = boosted_combined.predict(X_test.values)

# 5-fold CV on the resampled training data, then hold-out metrics.
cv_mean = cross_val_score(boosted_combined, X_train_rs, y_train_rs, cv=5).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = boosted_combined.score(X_test.values, y_test)
print("Test score: {0:.2f}".format(holdout_score))
holdout_auc = get_auc(y_test, pred_combined)
print("AUC: {0:.2f}".format(holdout_auc))
print(confusion_matrix(y_test, pred_combined))

In [None]:
# Train LinearSVC on image + text features and score it on the hold-out rows.
svc_combined.fit(X_train_rs, y_train_rs)
pred_combined = svc_combined.predict(X_test.values)

# 5-fold CV on the resampled training data, then hold-out metrics.
cv_mean = cross_val_score(svc_combined, X_train_rs, y_train_rs, cv=5).mean()
print("Cross val score: {}".format(cv_mean))
holdout_score = svc_combined.score(X_test.values, y_test)
print("Test score: {0:.2f}".format(holdout_score))
holdout_auc = get_auc(y_test, pred_combined)
print("AUC: {0:.2f}".format(holdout_auc))
print(confusion_matrix(y_test, pred_combined))