# Baseline model - Logistic Regression

## Imports and Load Data

In [62]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform

from sklearn.metrics import f1_score, auc, precision_recall_curve, average_precision_score
from sklearn.metrics import auc
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split,TimeSeriesSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier 


from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn import preprocessing


In [41]:
data = pd.read_csv('../data/utlysningar_preprocessed_stemming')

In [42]:
data

Unnamed: 0,Title En,Description En,Research fields
0,"Judith Jack Halberstam, Columbia University, N...",jack halberstam legal name judith legal gender...,social sciences
1,"Nomination of Professor Martin Shepperd, Brune...",professor martin john shepperd phd dept comput...,natural sciences
2,Characterisation and effects of micro and nano...,start recent develop new experiment techniqu c...,engineering and technology
3,Moral Vagueness in a Mind-Independent World,the main object project show central hypothesi...,humanities and the arts
4,Bearbetning av organosolvfraktioner för funkti...,the global demand renew degrad sustain materi ...,engineering and technology
...,...,...,...
7138,Spatial Omics Enable Improved Pathophysiology-...,we propos innov interdisciplinari research env...,medical and health sciences
7139,A New Interdisciplinary Framework for Studying...,the program aim establish bold interdisciplina...,social sciences
7140,SweDigArch – The Swedish National Infrastructu...,swedigarch construct run swedish nation infras...,social sciences
7141,Design after progress: reimagining design hist...,design tie industrialis progress made possibl ...,humanities and the arts


In [43]:
# Encode the labels

le = preprocessing.LabelEncoder()
le.fit(data['Research fields'])
data['Research fields'] = le.transform(data['Research fields'])

In [44]:
X_train,X_test,Y_train,Y_test = train_test_split(data['Description En'],data['Research fields'],test_size = 0.3,random_state=2)

## Feature Extraction & Embedding - Bag of Words

In [45]:
count_vect = CountVectorizer(ngram_range=(1,2))

X_train_bow = count_vect.fit_transform(X_train) 
X_test_bow = count_vect.transform(X_test)

In [46]:
X_train_bow = preprocessing.normalize(X_train_bow)
X_test_bow = preprocessing.normalize(X_test_bow)

# Load preprocessed data

In [47]:
import pickle

with open('/Users/evaengel/comparison_NLP_classification_models/data/preprocessed_for_logistic_regression/preprocessed_data.pkl', 'rb') as file:
    dataset_dict = pickle.load(file)

In [49]:
X_train, X_test, Y_train, Y_test = dataset_dict["X_train"], dataset_dict["X_test"], dataset_dict["Y_train"], dataset_dict["Y_test"]

In [58]:
ml_binarizer = MultiLabelBinarizer()

Y_train_encoded = ml_binarizer.fit_transform(Y_train)
Y_test_encoded = ml_binarizer.transform(Y_test)

In [78]:
Y_test_encoded.size

11550

In [80]:
Y_train_encoded.sum(axis = 1)


array([3, 1, 2, 3, 3, 2, 3, 2, 3, 1, 2, 2, 3, 1, 2, 1, 3, 2, 2, 2, 3, 1,
       2, 3, 2, 1, 1, 3, 2, 1, 2, 3, 1, 3, 3, 3, 1, 3, 2, 2, 3, 3, 1, 3,
       2, 2, 2, 2, 1, 1, 1, 2, 3, 2, 1, 1, 3, 1, 3, 3, 1, 2, 2, 2, 1, 3,
       3, 2, 3, 1, 2, 3, 2, 2, 3, 3, 2, 3, 1, 3, 2, 1, 2, 1, 3, 1, 3, 3,
       2, 3, 1, 2, 2, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 3, 3, 2, 2, 2, 3, 3, 1, 3,
       2, 2, 3, 3, 1, 3, 3, 1, 2, 2, 1, 2, 1, 2, 3, 2, 3, 2, 3, 1, 1, 1,
       3, 2, 2, 3, 3, 1, 3, 3, 3, 1, 3, 2, 1, 3, 1, 2, 2, 3, 3, 3, 1, 1,
       2, 1, 3, 2, 1, 1, 2, 1, 3, 1, 2, 3, 3, 3, 3, 2, 3, 2, 1, 2, 2, 1,
       2, 2, 2, 2, 1, 1, 2, 3, 3, 2, 2, 2, 1, 1, 1, 2, 2, 3, 1, 1, 2, 1,
       1, 2, 3, 2, 1, 3, 1, 1, 1, 2, 2, 3, 3, 2, 1, 2, 1, 1, 1, 3, 3, 3,
       1, 1, 2, 1, 2, 3, 1, 3, 2, 2, 3, 2, 3, 1, 1, 2, 2, 1, 1, 2, 3, 1,
       3, 3, 2, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2,
       2, 1, 3, 2, 3, 1, 3, 1, 3, 2, 3, 2, 3, 2, 3,

In [74]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
print(Y_test_encoded)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 1 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  1 0 0 0 1 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0

## Fine tuning 

In [None]:
base_lr = LogisticRegression(solver='lbfgs', random_state=42)

In [60]:
# Logistic Regression based on MultiOutputClassifier
 
mo_clf = MultiOutputClassifier(base_lr)
# params = { 'estimator__C': [10**-4, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**4],
#               'estimator__penalty':['l1','l2']}
# randm_src = RandomizedSearchCV(clf, params,
#                       scoring = "f1"
#                      )

# randm_src.fit(X_train, Y_train_encoded)
mo_clf.fit(X_train, Y_train_encoded)

In [68]:
mo_clf.predict(X_test).sum(axis = 1)

array([0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 3, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       2, 0, 0, 2, 1, 0, 0, 1, 0, 1, 1, 2, 0, 0, 1, 0, 1, 1, 1, 0, 0, 2,
       1, 0, 0, 1, 1, 2, 0, 2, 0, 2, 2, 0, 1, 0, 0, 1, 0, 1, 0, 2, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 3, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 0, 2, 0, 1, 1, 2, 1,
       1, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 2, 0, 1, 1, 0, 1, 2, 2, 2, 1, 0, 1,
       1, 0, 1, 1, 1, 2, 1, 0, 1, 0, 1, 1, 1, 0, 2, 2, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 2, 1, 0, 1, 0, 1, 2, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 2, 1, 1, 3, 1, 1, 1, 1,
       0, 1, 2, 1, 3, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 2, 0, 1, 0, 0, 2, 0, 1, 0, 2, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 2, 0, 0, 0, 1, 0])

In [64]:
print(mo_clf.score(X_test, Y_test_encoded))
metrics.f1_score(Y_test_encoded, mo_clf.predict(X_test), average= 'weighted')

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.3317330551472792

In [56]:
# Logistic Regression based on classifier chain
chain = ClassifierChain(base_lr, random_state=42)

chain.fit(X_train, Y_train_encoded)

In [69]:
print(chain.score(X_test, Y_test_encoded))
metrics.f1_score(Y_test_encoded, chain.predict(X_test), average= 'weighted')

0.17454545454545456


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.3329569416845551

In [70]:
# Random forest

rfc = RandomForestClassifier(random_state=42)

# param_grid = [
#     {'randomforestclassifier__n_estimators': [3, 10, 30], 
#      'randomforestclassifier__max_features': [2, 4, 5, 8],
#     'randomforestclassifier__bootstrap': [False], 
#     }
# ]
# grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=5, scoring = 'f1_micro')
# grid_search.fit(X_train, Y_train_encoded)

rfc.fit(X_train, Y_train_encoded)
rfc.score(X_test, Y_test_encoded)

metrics.f1_score(Y_test_encoded, rfc.predict(X_test), average= 'weighted')

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.09712001955425088