# Dataset: 20 Newsgroups

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.model_selection import train_test_split

In [2]:
# Fetch the pre-vectorized 20 Newsgroups corpus, using every subset ('all').
# `subset` is passed by keyword: it is a keyword-only parameter in current
# scikit-learn, so the positional form raises a TypeError there.
dataset = fetch_20newsgroups_vectorized(subset='all')
X = dataset.data    # feature matrix
y = dataset.target  # class label per sample

# Shadow Model
## Softmax

In [3]:
# First split: halve the corpus into a shadow pool and a target pool.
shadow_X, target_X, shadow_y, target_y = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Second split: halve the shadow pool into the shadow model's train/test sets.
X_train, X_test, y_train, y_test = train_test_split(
    shadow_X,
    shadow_y,
    test_size=0.5,
    random_state=42,
)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Multinomial (softmax) logistic regression with an L2 penalty, fitted on the
# shadow training split. `fit` returns the estimator itself, so the
# construction and the fit can be chained.
softRegr = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=3000,
).fit(X_train, y_train)

# Hard label predictions on the training split and on the held-out split.
pred_train, pred_test = softRegr.predict(X_train), softRegr.predict(X_test)

# Accuracy on data the model was fitted on vs. data it has not seen.
train_acc = accuracy_score(y_train, pred_train)
test_acc = accuracy_score(y_test, pred_test)

print("Training Accuracy : {}, Testing Accuracy : {}".format(train_acc, test_acc))

Training Accuracy : 0.8520483973678624, Testing Accuracy : 0.7014006791171478


In [5]:
# Per-class probability vectors from the shadow model:
#   y_in  -> samples the model was fitted on,
#   y_out -> held-out samples.
y_in = softRegr.predict_proba(X_train)
y_out = softRegr.predict_proba(X_test)

In [6]:
# Keep the three largest class probabilities of every sample.
# np.sort orders each row ascending along the last axis, so the top three are
# the final three columns. Slicing with -3 instead of the fixed column index
# 17 stays correct for any number of classes, not only when there are
# exactly 20 probability columns.
y_in_sorted = np.sort(y_in)
y_in_top3 = y_in_sorted[:, -3:]
y_out_sorted = np.sort(y_out)
y_out_top3 = y_out_sorted[:, -3:]

In [7]:
# Append a label column to each top-3 probability block:
# 1.0 for rows from the training split ("in"), 0.0 for held-out rows ("out").
in_labels = np.ones((y_in_top3.shape[0], 1))
out_labels = np.zeros((y_out_top3.shape[0], 1))
din = np.concatenate((y_in_top3, in_labels), axis=1)
dout = np.concatenate((y_out_top3, out_labels), axis=1)

# Stack "in" rows above "out" rows and write them out as one CSV table.
dt = np.concatenate((din, dout), axis=0)
df = pd.DataFrame(dt)
df.to_csv("sh_softmax.csv", index=False, header=True)

# Target Model
## Random Forest (Extra-Trees variant)

In [8]:
# Halve the target pool into the target model's train/test sets.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which held the
# shadow model's splits until this point.
X_train, X_test, y_train, y_test = train_test_split(
    target_X,
    target_y,
    random_state=42,
    test_size=0.5,
)

In [9]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble: 100 trees, 50 candidate features considered per split.
# random_state is pinned (same seed as the data splits) because tree
# construction is randomized; without it every run produces a different model
# and different accuracies, so the notebook would not be reproducible.
model = ExtraTreesClassifier(n_estimators=100, max_features=50, random_state=42)

model.fit(X_train, y_train)

# Hard label predictions on the training split and on the held-out split.
predict_train = model.predict(X_train)
predict_test = model.predict(X_test)

# Accuracy on data the model was fitted on vs. data it has not seen.
train_acc = accuracy_score(y_train, predict_train)
test_acc = accuracy_score(y_test, predict_test)
print("Training Accuracy : {}, Testing Accuracy : {}".format(train_acc, test_acc))

Training Accuracy : 1.0, Testing Accuracy : 0.7973259762308998


In [None]:
# Per-class probability vectors from the target model:
#   y_in  -> samples the model was fitted on,
#   y_out -> held-out samples.
y_in, y_out = model.predict_proba(X_train), model.predict_proba(X_test)

In [None]:
# Keep the three largest class probabilities of every sample.
# np.sort orders each row ascending along the last axis, so the top three are
# the final three columns. Slicing with -3 instead of the fixed column index
# 17 stays correct for any number of classes, not only when there are
# exactly 20 probability columns.
y_in_sorted = np.sort(y_in)
y_in_top3 = y_in_sorted[:, -3:]
y_out_sorted = np.sort(y_out)
y_out_top3 = y_out_sorted[:, -3:]

In [None]:
# Append a label column to each top-3 probability block:
# 1.0 for rows from the training split ("in"), 0.0 for held-out rows ("out").
in_labels = np.ones((y_in_top3.shape[0], 1))
out_labels = np.zeros((y_out_top3.shape[0], 1))
din = np.concatenate((y_in_top3, in_labels), axis=1)
dout = np.concatenate((y_out_top3, out_labels), axis=1)

# Stack "in" rows above "out" rows and write them out as one CSV table.
dt = np.concatenate((din, dout), axis=0)
df = pd.DataFrame(dt)
df.to_csv("sh_forest.csv", index=False, header=True)