## Data loading and feature extraction

In [2]:
import pandas as pd
# Parse each split once and take both columns from the same frame, so the
# text and label series are guaranteed to come from the same read of the file.
train_df = pd.read_csv("/code/data/train.csv")
val_df = pd.read_csv("/code/data/val.csv")

# 'headline' is the model input; 'is_sarcastic' is the target column.
X_train = train_df['headline']
y_train = train_df['is_sarcastic']

X_val = val_df['headline']
y_val = val_df['is_sarcastic']

In [3]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with library defaults. The vectorizer is fitted on the training
# headlines only; the validation headlines are projected with that same
# fitted vectorizer so no validation text influences the learned features.
tfidf_vectorizer = TfidfVectorizer()

tfidf_train = tfidf_vectorizer.fit_transform(raw_documents=X_train)
tfidf_val = tfidf_vectorizer.transform(raw_documents=X_val)


## Classifiers 

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

# Base estimators that are later combined in a soft-voting ensemble.
# Every estimator that accepts a seed gets random_state=1 so reruns are comparable.

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# left as-is because dropping it may change the fitted model — confirm
# against the pinned scikit-learn version before removing.
clf1 = LogisticRegression(multi_class='multinomial', random_state=1,  solver='newton-cg')
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
# probability=True makes SVC expose predict_proba (soft voting averages class
# probabilities). The probability estimates rely on internally shuffled data,
# so the seed is pinned here as well.
clf3 = svm.SVC(gamma='scale', probability=True, random_state=1)
clf4 = MultinomialNB()


## Generate combinations

In [5]:
import itertools
def perm(n, seq):
    """Return every length-``n`` tuple of items drawn from ``seq`` with repetition.

    Despite the name, this is the Cartesian product ``seq x seq x ... x seq``
    (``n`` factors), not a set of permutations: items may repeat within a tuple.

    Parameters
    ----------
    n : int
        Length of each generated tuple.
    seq : iterable
        Candidate values for every position.

    Returns
    -------
    list of tuple
        All ``len(seq) ** n`` tuples, in ``itertools.product`` order (the
        rightmost position varies fastest), so the first tuple repeats the
        first item of ``seq`` ``n`` times.
    """
    return list(itertools.product(seq, repeat=n))


## Grid Search

In [6]:
from sklearn.ensemble import VotingClassifier
from itertools import combinations_with_replacement
from sklearn import metrics
import numpy as np

In [None]:
# Grid search over integer voting weights (0..4) for the four base classifiers.
#
# Fitting the base classifiers does not depend on the voting weights, so the
# ensemble is fitted once and each member's validation-set class probabilities
# are cached. Every weight tuple is then scored by taking the weighted average
# of those cached probabilities, which is what soft voting computes. This
# avoids refitting all four models for each of the 624 weight tuples and makes
# every tuple see the same fitted models.
comb = perm(4, [0,1,2,3,4])

eclf = VotingClassifier(estimators=[('clf1', clf1),('clf2', clf2),('clf3', clf3), ('clf4', clf4)],
                        n_jobs=1, voting='soft')
eclf = eclf.fit(tfidf_train, y_train)

# Stacked per-classifier probabilities: (n_classifiers, n_samples, n_classes).
val_probas = np.asarray([est.predict_proba(tfidf_val) for est in eclf.estimators_])

# The all-zero tuple is excluded: a weighted average with zero total weight is undefined.
weight_grid = [weights for weights in comb if any(weights)]

# One record per weight tuple so the saved table has aligned columns.
results = []
for i, combination in enumerate(weight_grid):
    print(i,combination)

    avg_proba = np.average(val_probas, axis=0, weights=combination)
    # Map the arg-max column back to the original class labels.
    pred3 = eclf.classes_[np.argmax(avg_proba, axis=1)]
    result = metrics.accuracy_score(y_val, pred3)
    print(result)
    results.append({
        'clf1': combination[0],
        'clf2': combination[1],
        'clf3': combination[2],
        'clf4': combination[3],
        'accuracy': result,
    })

pd.DataFrame(results, columns=['clf1', 'clf2', 'clf3', 'clf4', 'accuracy']).to_csv(
    "/code/data/grid_search_results.csv", index=False)

0 (0, 0, 0, 1)
0.8322851153039832
1 (0, 0, 0, 2)


## Benchmark best model 

In [7]:
from sklearn.metrics import classification_report
# Build the soft-voting ensemble with a fixed weight per base classifier and
# train it on the TF-IDF training matrix.
# NOTE(review): the weights [1,1,2,4] are presumably the best tuple found by
# the grid search — confirm against the saved results.
eclf = VotingClassifier(
    estimators=[('clf1', clf1), ('clf2', clf2), ('clf3', clf3), ('clf4', clf4)],
    voting='soft',
    weights=[1, 1, 2, 4],
    n_jobs=1,
).fit(tfidf_train, y_train)

In [11]:
# Predict labels for the validation split; the accuracy is left as the cell's
# last expression so the notebook displays it.
prediction = eclf.predict(X=tfidf_val)
metrics.accuracy_score(y_true=y_val, y_pred=prediction)

0.8625669694852085

In [9]:
# Per-class precision / recall / F1 on the validation split.
# NOTE(review): target_names are matched to the labels in sorted order, so this
# assumes 0 -> 'real' and 1 -> 'fake' for `is_sarcastic` — confirm the mapping.
print(classification_report(y_true=y_val, y_pred=prediction, target_names=['real', 'fake']))

              precision    recall  f1-score   support

        real       0.87      0.88      0.87      2290
        fake       0.86      0.85      0.85      2003

    accuracy                           0.86      4293
   macro avg       0.86      0.86      0.86      4293
weighted avg       0.86      0.86      0.86      4293



## Save model

In [12]:
import pickle

# Serialize the fitted ensemble to disk; the context manager closes the file
# handle even if pickling raises.
with open('/code/models/model.1.0.0.pickle', mode='wb') as model_file:
    pickle.dump(eclf, model_file)