In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score,f1_score,precision_score,recall_score,precision_recall_fscore_support,classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import seaborn as sns
import pickle
import json
%matplotlib inline
# Rebind the name `stopwords` from the NLTK corpus reader imported above to the
# English stop-word collection it returns; `preprocess` tests raw tokens for
# membership in this object.
stopwords = stopwords.words('english')

# LOAD MODELS

In [2]:
# File names of the three pickled classifiers stored under ../models/.
MODEL1 = 'C_logistic_model.sav'
MODEL2 = 'C_svc_model.sav'
MODEL3 = 'C_knn_model.sav'


def _load_pickled_model(filename):
    """Unpickle one model stored under ``../models/``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    unpickling finishes (or fails), instead of being left open for the life of
    the kernel.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file, so only load model files that come from a trusted source.

    Parameters
    ----------
    filename : str
        File name relative to ``../models/``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    with open("../models/" + filename, 'rb') as handle:
        return pickle.load(handle)


logistic_model = _load_pickled_model(MODEL1)
svc_model = _load_pickled_model(MODEL2)
knn_model = _load_pickled_model(MODEL3)

# LOAD TEST DATA

In [3]:
json_file = '../data/Sabinet_R_no_labels.json'

# The file is read as JSON Lines: every non-empty line is parsed as its own
# JSON document. Blank lines (e.g. a trailing newline at the end of the file)
# are skipped so they do not make json.loads raise, and the encoding is pinned
# to UTF-8 so the result does not depend on the platform's default encoding.
with open(json_file, 'r', encoding='utf-8') as handle:
    json_data = [json.loads(line) for line in handle if line.strip()]

# Only the 'text' field of each record is fed to the models.
json_text = [record['text'] for record in json_data]

In [4]:
# Tabulate the parsed records as loaded, keeping every field (not just `text`)
# so the predicted labels can later be placed next to the original data.
all_df = pd.DataFrame(json_data)

In [6]:
# Save the unlabelled records as CSV. `to_csv` is called with its defaults, so
# the DataFrame index is written as an extra, unnamed leading column.
all_df.to_csv('../data/test.csv')

In [326]:
# Single-column frame holding only the record text; this is the frame that is
# preprocessed and passed to the models.
df = pd.DataFrame(json_text,columns=['text'])

In [327]:
def preprocess(news):
    """Turn one raw text into a space-separated string of lemmatised tokens.

    The text is cut on "." and then on whitespace. A token is kept only if it
    passes all of the following checks:

    * the token, exactly as written, is not in the module-level ``stopwords``;
    * it consists of alphabetic characters only;
    * it is longer than two characters;
    * its lowercase form is not one of the extra words dropped below.

    Surviving tokens are lowercased, lemmatised and joined with single spaces.

    NOTE(review): the ``stopwords`` test is applied to the token as written,
    not to its lowercase form, so a capitalised stop word can pass it. This is
    kept as-is because the loaded models were presumably fitted on text
    prepared by the same function - confirm before changing.
    """
    lemmatizer = WordNetLemmatizer()
    extra_dropped = ["said","the","first","also","would","one","two","they"]

    kept = []
    for chunk in news.split("."):
        for token in chunk.split():
            if token in stopwords:
                continue
            if not token.isalpha():
                continue
            if len(token) <= 2:
                continue
            lowered = token.lower()
            if lowered in extra_dropped:
                continue
            kept.append(lemmatizer.lemmatize(lowered))
    return " ".join(kept)

In [328]:
# Replace the raw text with its preprocessed form.
# NOTE(review): this overwrites df['text'], so re-running this cell feeds
# already-processed text through `preprocess` again; rebuild `df` first when
# re-running.
df['text'] = df['text'].map(preprocess)

# GET RESULTS

In [345]:
# One prediction vector per loaded model, computed on the preprocessed text
# in logistic / SVC / k-NN order.
l, s, k = (
    model.predict(df['text'])
    for model in (logistic_model, svc_model, knn_model)
)

In [346]:
# Majority vote over the three models. Each model predicts 0 or 1, so the
# per-row total of the three predictions is above 1.5 exactly when at least
# two of them predicted 1. np.asarray makes `+` an element-wise sum even if a
# prediction is ever handed over as a plain list (where `+` would concatenate).
vote_totals = np.asarray(s) + np.asarray(l) + np.asarray(k)

# Kept as a plain list of Python ints (1 = majority voted 1, 0 = otherwise).
e = (vote_totals > 1.5).astype(int).tolist()

In [351]:
# Attach every prediction vector to `df` under its own column, in the order
# logistic, SVC, k-NN, then the ensemble vote.
for _column, _labels in (
    ('log_labels', l),
    ('svc_labels', s),
    ('knn_labels', k),
    ('ens_labels', e),
):
    df[_column] = _labels

In [352]:
# Label distribution of the logistic-regression predictions.
df['log_labels'].value_counts()

0    1004
1      46
Name: log_labels, dtype: int64

In [353]:
# Label distribution of the SVC predictions.
df['svc_labels'].value_counts()

0    963
1     87
Name: svc_labels, dtype: int64

In [354]:
# Label distribution of the k-NN predictions.
df['knn_labels'].value_counts()

0    934
1    116
Name: knn_labels, dtype: int64

In [356]:
# Label distribution of the ensemble (majority-vote) predictions.
df['ens_labels'].value_counts()

0    988
1     62
Name: ens_labels, dtype: int64

In [364]:
# Put the SVC and ensemble labels next to the original records (the logistic
# and k-NN columns are not exported). Concatenation along axis=1 aligns rows on
# the index; both frames were built from `json_data` in the same order, so the
# rows pair up one-to-one.
final = pd.concat([all_df,df[['svc_labels','ens_labels']]],axis=1)

In [365]:
# Write the labelled records to CSV. `to_csv` is called with its defaults, so
# the DataFrame index is written as an extra, unnamed leading column.
final.to_csv('../data/fatih_results.csv')